diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23574 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.006940627099539697, + "eval_steps": 1000, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 240.56250762939453, + "epoch": 2.776250839815879e-06, + "grad_norm": 0.06479538977146149, + "kl": 0.0, + "learning_rate": 8.310249307479225e-09, + "loss": -0.0006, + "reward": 0.16875001788139343, + "reward_std": 0.19138706848025322, + "rewards/countdown_reward_func": 0.16875001788139343, + "step": 1, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 5.552501679631758e-06, + "grad_norm": 0.055109903216362, + "kl": 0.0, + "learning_rate": 1.662049861495845e-08, + "loss": -0.0006, + "step": 2 + }, + { + "clip_ratio": 0.0, + "epoch": 8.328752519447637e-06, + "grad_norm": 0.0616462379693985, + "kl": 0.0006676262128166854, + "learning_rate": 2.4930747922437675e-08, + "loss": -0.0006, + "step": 3 + }, + { + "clip_ratio": 0.0, + "epoch": 1.1105003359263517e-05, + "grad_norm": 0.061320219188928604, + "kl": 0.0007026523817330599, + "learning_rate": 3.32409972299169e-08, + "loss": -0.0005, + "step": 4 + }, + { + "clip_ratio": 0.0002470490289852023, + "epoch": 1.3881254199079395e-05, + "grad_norm": 0.059856366366147995, + "kl": 0.0007550643058493733, + "learning_rate": 4.155124653739612e-08, + "loss": -0.0006, + "step": 5 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 1.6657505038895273e-05, + "grad_norm": 0.09312308579683304, + "kl": 0.0007920752104837447, + "learning_rate": 4.986149584487535e-08, + "loss": -0.0009, + "step": 6 + }, + { + "clip_ratio": 0.0, + "epoch": 1.9433755878711153e-05, + "grad_norm": 0.0637163445353508, + "kl": 0.0006995745643507689, + "learning_rate": 5.8171745152354567e-08, + "loss": -0.0007, + "step": 7 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 2.2210006718527033e-05, + "grad_norm": 0.055853065103292465, + "kl": 0.0008612782985437661, + "learning_rate": 6.64819944598338e-08, + "loss": -0.0002, + "step": 8 + }, + { + "clip_ratio": 0.0, + "epoch": 2.498625755834291e-05, + "grad_norm": 0.060049429535865784, + "kl": 0.00074524967931211, + "learning_rate": 7.479224376731302e-08, + "loss": -0.0007, + "step": 9 + }, + { + "clip_ratio": 0.0, + "epoch": 2.776250839815879e-05, + "grad_norm": 0.059901271015405655, + "kl": 0.0008222198521252722, + "learning_rate": 8.310249307479224e-08, + "loss": -0.0004, + "step": 10 + }, + { + "clip_ratio": 0.00016566881095059216, + "epoch": 3.0538759237974666e-05, + "grad_norm": 0.05985812842845917, + "kl": 0.0007750319491606206, + "learning_rate": 9.141274238227148e-08, + "loss": -0.0006, + "step": 11 + }, + { + "clip_ratio": 0.0, + "epoch": 3.3315010077790546e-05, + "grad_norm": 0.09739804267883301, + "kl": 0.0008287512173410505, + "learning_rate": 9.97229916897507e-08, + "loss": -0.0006, + "step": 12 + }, + { + "clip_ratio": 8.928571332944557e-05, + "completion_length": 224.6666717529297, + "epoch": 3.6091260917606426e-05, + "grad_norm": 0.19317923486232758, + "kl": 0.0007415811996906996, + "learning_rate": 1.0803324099722992e-07, + "loss": 0.0222, + "reward": 0.2645833343267441, + "reward_std": 0.2765769511461258, + "rewards/countdown_reward_func": 0.2645833268761635, + "step": 13, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.827683632262051e-05, + "epoch": 3.8867511757422306e-05, + "grad_norm": 0.07646733522415161, + "kl": 0.0008568160701543093, + "learning_rate": 1.1634349030470913e-07, + "loss": 0.0217, + "step": 14 + }, + { + "clip_ratio": 0.00019336564582772553, + "epoch": 4.1643762597238186e-05, + "grad_norm": 0.10118044912815094, + "kl": 0.0007392842962872237, + "learning_rate": 1.2465373961218836e-07, + "loss": 0.0214, + "step": 15 + }, + { + "clip_ratio": 0.0004612546181306243, + "epoch": 4.4420013437054066e-05, + "grad_norm": 0.09648283571004868, + "kl": 0.0007288659107871354, + "learning_rate": 1.329639889196676e-07, + "loss": 0.0215, + "step": 16 + }, + { + "clip_ratio": 0.0, + "epoch": 4.7196264276869946e-05, + "grad_norm": 0.10432377457618713, + "kl": 0.0007946545956656337, + "learning_rate": 1.4127423822714683e-07, + "loss": 0.021, + "step": 17 + }, + { + "clip_ratio": 0.0, + "epoch": 4.997251511668582e-05, + "grad_norm": 0.0799316018819809, + "kl": 0.0007485100359190255, + "learning_rate": 1.4958448753462604e-07, + "loss": 0.0217, + "step": 18 + }, + { + "clip_ratio": 8.480325777782127e-05, + "epoch": 5.27487659565017e-05, + "grad_norm": 0.20232859253883362, + "kl": 0.0009814091317821294, + "learning_rate": 1.5789473684210525e-07, + "loss": 0.022, + "step": 19 + }, + { + "clip_ratio": 0.00027768261497840285, + "epoch": 5.552501679631758e-05, + "grad_norm": 0.07837554812431335, + "kl": 0.000779987225541845, + "learning_rate": 1.6620498614958448e-07, + "loss": 0.0217, + "step": 20 + }, + { + "clip_ratio": 0.00010407993249827996, + "epoch": 5.830126763613346e-05, + "grad_norm": 0.09943106770515442, + "kl": 0.0007467044633813202, + "learning_rate": 1.7451523545706372e-07, + "loss": 0.0214, + "step": 21 + }, + { + "clip_ratio": 0.0005535055533982813, + "epoch": 6.107751847594933e-05, + "grad_norm": 0.09467501193284988, + "kl": 0.0007356623827945441, + "learning_rate": 1.8282548476454296e-07, + "loss": 0.0217, + "step": 22 + }, + { + "clip_ratio": 8.979885024018586e-05, + "epoch": 6.385376931576521e-05, + "grad_norm": 0.1044834554195404, + "kl": 0.0007783641340211034, + "learning_rate": 1.9113573407202217e-07, + "loss": 0.0216, + "step": 23 + }, + { + "clip_ratio": 0.0, + "epoch": 6.663002015558109e-05, + "grad_norm": 0.08091197907924652, + "kl": 0.0007477364561054856, + "learning_rate": 1.994459833795014e-07, + "loss": 0.0214, + "step": 24 + }, + { + "clip_ratio": 0.000327225134242326, + "completion_length": 223.89583587646484, + "epoch": 6.940627099539697e-05, + "grad_norm": 0.07894178479909897, + "kl": 0.0009490847878623754, + "learning_rate": 2.0775623268698064e-07, + "loss": 0.0032, + "reward": 0.24583334475755692, + "reward_std": 0.2639523148536682, + "rewards/countdown_reward_func": 0.24583334475755692, + "step": 25, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00018698579515330493, + "epoch": 7.218252183521285e-05, + "grad_norm": 0.07660482078790665, + "kl": 0.0009208305855281651, + "learning_rate": 2.1606648199445985e-07, + "loss": 0.0032, + "step": 26 + }, + { + "clip_ratio": 0.0, + "epoch": 7.495877267502873e-05, + "grad_norm": 0.10856402665376663, + "kl": 0.0008324629161506891, + "learning_rate": 2.2437673130193906e-07, + "loss": 0.0029, + "step": 27 + }, + { + "clip_ratio": 0.0, + "epoch": 7.773502351484461e-05, + "grad_norm": 0.10259876400232315, + "kl": 0.0007138713845051825, + "learning_rate": 2.3268698060941827e-07, + "loss": 0.0028, + "step": 28 + }, + { + "clip_ratio": 0.0005700875190086663, + "epoch": 8.051127435466049e-05, + "grad_norm": 0.07817202061414719, + "kl": 0.0009188149124383926, + "learning_rate": 2.409972299168975e-07, + "loss": 0.003, + "step": 29 + }, + { + "clip_ratio": 8.18062835605815e-05, + "epoch": 8.328752519447637e-05, + "grad_norm": 0.09154564142227173, + "kl": 0.000773191568441689, + "learning_rate": 2.493074792243767e-07, + "loss": 0.0026, + "step": 30 + }, + { + "clip_ratio": 0.00010195758659392595, + "epoch": 8.606377603429225e-05, + "grad_norm": 0.09764699637889862, + "kl": 0.0008510929765179753, + "learning_rate": 2.57617728531856e-07, + "loss": 0.0032, + "step": 31 + }, + { + "clip_ratio": 0.0, + "epoch": 8.884002687410813e-05, + "grad_norm": 0.07986056804656982, + "kl": 0.0009318325319327414, + "learning_rate": 2.659279778393352e-07, + "loss": 0.0025, + "step": 32 + }, + { + "clip_ratio": 0.00019616441568359733, + "epoch": 9.161627771392401e-05, + "grad_norm": 0.10944823920726776, + "kl": 0.0008964207954704762, + "learning_rate": 2.742382271468144e-07, + "loss": 0.0024, + "step": 33 + }, + { + "clip_ratio": 0.0, + "epoch": 9.439252855373989e-05, + "grad_norm": 0.08562328666448593, + "kl": 0.000787771336035803, + "learning_rate": 2.8254847645429366e-07, + "loss": 0.0027, + "step": 34 + }, + { + "clip_ratio": 0.00016318648704327643, + "epoch": 9.716877939355577e-05, + "grad_norm": 0.06373903900384903, + "kl": 0.0009429152996744961, + "learning_rate": 2.9085872576177287e-07, + "loss": 0.0028, + "step": 35 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 9.994503023337164e-05, + "grad_norm": 0.0951249971985817, + "kl": 0.0008729175024200231, + "learning_rate": 2.991689750692521e-07, + "loss": 0.0029, + "step": 36 + }, + { + "clip_ratio": 0.00010088780982187018, + "completion_length": 234.20833587646484, + "epoch": 0.00010272128107318752, + "grad_norm": 0.0882072001695633, + "kl": 0.001087184005882591, + "learning_rate": 3.0747922437673134e-07, + "loss": -0.0108, + "reward": 0.24791668355464935, + "reward_std": 0.2163795679807663, + "rewards/countdown_reward_func": 0.24791668355464935, + "step": 37, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001054975319130034, + "grad_norm": 0.08608131855726242, + "kl": 0.0007613546913489699, + "learning_rate": 3.157894736842105e-07, + "loss": -0.0112, + "step": 38 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00010827378275281928, + "grad_norm": 0.081035315990448, + "kl": 0.0008077043457888067, + "learning_rate": 3.2409972299168976e-07, + "loss": -0.0113, + "step": 39 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00011105003359263516, + "grad_norm": 0.07537035644054413, + "kl": 0.0007854337454773486, + "learning_rate": 3.3240997229916897e-07, + "loss": -0.0118, + "step": 40 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00011382628443245104, + "grad_norm": 0.08525540679693222, + "kl": 0.0007534388569183648, + "learning_rate": 3.407202216066482e-07, + "loss": -0.0112, + "step": 41 + }, + { + "clip_ratio": 0.0003339054746902548, + "epoch": 0.00011660253527226692, + "grad_norm": 0.1856469362974167, + "kl": 0.000957256241235882, + "learning_rate": 3.4903047091412744e-07, + "loss": -0.0105, + "step": 42 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0001193787861120828, + "grad_norm": 0.06687930226325989, + "kl": 0.0008277066808659583, + "learning_rate": 3.5734072022160665e-07, + "loss": -0.011, + "step": 43 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00012215503695189867, + "grad_norm": 0.07720962911844254, + "kl": 0.0007938558992464095, + "learning_rate": 3.656509695290859e-07, + "loss": -0.0115, + "step": 44 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00012493128779171456, + "grad_norm": 0.08875006437301636, + "kl": 0.0007698170084040612, + "learning_rate": 3.739612188365651e-07, + "loss": -0.011, + "step": 45 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00012770753863153043, + "grad_norm": 0.07545354217290878, + "kl": 0.0008280337788164616, + "learning_rate": 3.8227146814404433e-07, + "loss": -0.0114, + "step": 46 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00013048378947134632, + "grad_norm": 0.07932056486606598, + "kl": 0.000836798019008711, + "learning_rate": 3.905817174515236e-07, + "loss": -0.0109, + "step": 47 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00013326004031116219, + "grad_norm": 0.11963900178670883, + "kl": 0.0009241281659342349, + "learning_rate": 3.988919667590028e-07, + "loss": -0.011, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.25000762939453, + "epoch": 0.00013603629115097808, + "grad_norm": 0.06625409424304962, + "kl": 0.0008119716076180339, + "learning_rate": 4.07202216066482e-07, + "loss": 0.0113, + "reward": 0.24166668951511383, + "reward_std": 0.18230264633893967, + "rewards/countdown_reward_func": 0.24166668206453323, + "step": 49, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00013881254199079395, + "grad_norm": 0.07639951258897781, + "kl": 0.0009117976296693087, + "learning_rate": 4.155124653739613e-07, + "loss": 0.0117, + "step": 50 + }, + { + "clip_ratio": 0.00010530749568715692, + "epoch": 0.00014158879283060984, + "grad_norm": 0.0806988924741745, + "kl": 0.0008828463032841682, + "learning_rate": 4.238227146814405e-07, + "loss": 0.0111, + "step": 51 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001443650436704257, + "grad_norm": 0.07537931948900223, + "kl": 0.0007655246299691498, + "learning_rate": 4.321329639889197e-07, + "loss": 0.0111, + "step": 52 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001471412945102416, + "grad_norm": 0.06568142026662827, + "kl": 0.0007329773507080972, + "learning_rate": 4.4044321329639896e-07, + "loss": 0.0112, + "step": 53 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00014991754535005747, + "grad_norm": 0.08050818741321564, + "kl": 0.0007917624025139958, + "learning_rate": 4.487534626038781e-07, + "loss": 0.0115, + "step": 54 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00015269379618987336, + "grad_norm": 0.07148700952529907, + "kl": 0.0008034448546823114, + "learning_rate": 4.570637119113573e-07, + "loss": 0.0113, + "step": 55 + }, + { + "clip_ratio": 0.00021934154210612178, + "epoch": 0.00015547004702968923, + "grad_norm": 0.07921700179576874, + "kl": 0.0008946515154093504, + "learning_rate": 4.6537396121883653e-07, + "loss": 0.0113, + "step": 56 + }, + { + "clip_ratio": 8.692628762219101e-05, + "epoch": 0.0001582462978695051, + "grad_norm": 0.07876580208539963, + "kl": 0.0009213399316649884, + "learning_rate": 4.736842105263158e-07, + "loss": 0.011, + "step": 57 + }, + { + "clip_ratio": 0.0002706017985474318, + "epoch": 0.00016102254870932099, + "grad_norm": 0.06707654148340225, + "kl": 0.0008947286114562303, + "learning_rate": 4.81994459833795e-07, + "loss": 0.0109, + "step": 58 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00016379879954913685, + "grad_norm": 0.08683553338050842, + "kl": 0.0007533867028541863, + "learning_rate": 4.903047091412742e-07, + "loss": 0.0113, + "step": 59 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00016657505038895275, + "grad_norm": 0.07259709388017654, + "kl": 0.0008512145723216236, + "learning_rate": 4.986149584487534e-07, + "loss": 0.0115, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.2291717529297, + "epoch": 0.0001693513012287686, + "grad_norm": 0.10226056724786758, + "kl": 0.0007961629016790539, + "learning_rate": 5.069252077562327e-07, + "loss": 0.0107, + "reward": 0.3229166716337204, + "reward_std": 0.3417354077100754, + "rewards/countdown_reward_func": 0.3229166716337204, + "step": 61, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001721275520685845, + "grad_norm": 0.1283038705587387, + "kl": 0.0008402638195548207, + "learning_rate": 5.15235457063712e-07, + "loss": 0.0097, + "step": 62 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00017490380290840037, + "grad_norm": 0.10426519066095352, + "kl": 0.0008127306937240064, + "learning_rate": 5.235457063711912e-07, + "loss": 0.0104, + "step": 63 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00017768005374821627, + "grad_norm": 0.09159515053033829, + "kl": 0.0008558765111956745, + "learning_rate": 5.318559556786704e-07, + "loss": 0.0104, + "step": 64 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00018045630458803213, + "grad_norm": 0.09254452586174011, + "kl": 0.00104894393007271, + "learning_rate": 5.401662049861496e-07, + "loss": 0.0107, + "step": 65 + }, + { + "clip_ratio": 0.00045153952669352293, + "epoch": 0.00018323255542784803, + "grad_norm": 0.10837826132774353, + "kl": 0.0008054885256569833, + "learning_rate": 5.484764542936288e-07, + "loss": 0.0104, + "step": 66 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001860088062676639, + "grad_norm": 0.09977608174085617, + "kl": 0.0007916014001239091, + "learning_rate": 5.567867036011081e-07, + "loss": 0.011, + "step": 67 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00018878505710747979, + "grad_norm": 0.1362018585205078, + "kl": 0.0008146155159920454, + "learning_rate": 5.650969529085873e-07, + "loss": 0.0102, + "step": 68 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00019156130794729565, + "grad_norm": 0.09612994641065598, + "kl": 0.0008106507884804159, + "learning_rate": 5.734072022160665e-07, + "loss": 0.0099, + "step": 69 + }, + { + "clip_ratio": 0.00017850531003205106, + "epoch": 0.00019433755878711155, + "grad_norm": 0.10862355679273605, + "kl": 0.0008707395172677934, + "learning_rate": 5.817174515235457e-07, + "loss": 0.0095, + "step": 70 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0001971138096269274, + "grad_norm": 0.09866558760404587, + "kl": 0.0008735440496820956, + "learning_rate": 5.900277008310249e-07, + "loss": 0.0106, + "step": 71 + }, + { + "clip_ratio": 9.110787505051121e-05, + "epoch": 0.00019989006046674328, + "grad_norm": 0.09790490567684174, + "kl": 0.0008786998223513365, + "learning_rate": 5.983379501385042e-07, + "loss": 0.0105, + "step": 72 + }, + { + "clip_ratio": 0.0001945525291375816, + "completion_length": 232.2291717529297, + "epoch": 0.00020266631130655917, + "grad_norm": 0.09406734257936478, + "kl": 0.0010395684512332082, + "learning_rate": 6.066481994459835e-07, + "loss": 0.0024, + "reward": 0.23750000447034836, + "reward_std": 0.2697988599538803, + "rewards/countdown_reward_func": 0.23750000447034836, + "step": 73, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00020544256214637504, + "grad_norm": 0.11244494467973709, + "kl": 0.0007681146962568164, + "learning_rate": 6.149584487534627e-07, + "loss": 0.0034, + "step": 74 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00020821881298619093, + "grad_norm": 0.15458862483501434, + "kl": 0.0007610543980263174, + "learning_rate": 6.232686980609418e-07, + "loss": 0.0025, + "step": 75 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002109950638260068, + "grad_norm": 0.07957284897565842, + "kl": 0.0009243077947758138, + "learning_rate": 6.31578947368421e-07, + "loss": 0.0031, + "step": 76 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002137713146658227, + "grad_norm": 0.1434016078710556, + "kl": 0.0009022459562402219, + "learning_rate": 6.398891966759003e-07, + "loss": 0.0033, + "step": 77 + }, + { + "clip_ratio": 0.0002997264382429421, + "epoch": 0.00021654756550563856, + "grad_norm": 0.07511159032583237, + "kl": 0.0009360458934679627, + "learning_rate": 6.481994459833795e-07, + "loss": 0.0025, + "step": 78 + }, + { + "clip_ratio": 0.0002821488888002932, + "epoch": 0.00021932381634545445, + "grad_norm": 0.10179822146892548, + "kl": 0.0009871571965049952, + "learning_rate": 6.565096952908587e-07, + "loss": 0.0025, + "step": 79 + }, + { + "clip_ratio": 9.72762645687908e-05, + "epoch": 0.00022210006718527032, + "grad_norm": 0.10691147297620773, + "kl": 0.0007517710037063807, + "learning_rate": 6.648199445983379e-07, + "loss": 0.0032, + "step": 80 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002248763180250862, + "grad_norm": 0.1533641219139099, + "kl": 0.0008048239687923342, + "learning_rate": 6.731301939058171e-07, + "loss": 0.0022, + "step": 81 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00022765256886490208, + "grad_norm": 0.08312182873487473, + "kl": 0.0009179475018754601, + "learning_rate": 6.814404432132964e-07, + "loss": 0.0032, + "step": 82 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00023042881970471797, + "grad_norm": 0.11414949595928192, + "kl": 0.0008502569980919361, + "learning_rate": 6.897506925207757e-07, + "loss": 0.0024, + "step": 83 + }, + { + "clip_ratio": 0.0001669251942075789, + "epoch": 0.00023320507054453384, + "grad_norm": 0.07714688032865524, + "kl": 0.0008958573453128338, + "learning_rate": 6.980609418282549e-07, + "loss": 0.0027, + "step": 84 + }, + { + "clip_ratio": 0.0002005309797823429, + "completion_length": 219.5416717529297, + "epoch": 0.00023598132138434973, + "grad_norm": 0.08387522399425507, + "kl": 0.0009274522599298507, + "learning_rate": 7.063711911357341e-07, + "loss": 0.0284, + "reward": 0.1666666716337204, + "reward_std": 0.16661179438233376, + "rewards/countdown_reward_func": 0.1666666641831398, + "step": 85, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 9.077705180970952e-05, + "epoch": 0.0002387575722241656, + "grad_norm": 0.07613258063793182, + "kl": 0.0010448671237099916, + "learning_rate": 7.146814404432133e-07, + "loss": 0.028, + "step": 86 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00024153382306398146, + "grad_norm": 0.11073552072048187, + "kl": 0.000871124560944736, + "learning_rate": 7.229916897506925e-07, + "loss": 0.0284, + "step": 87 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00024431007390379733, + "grad_norm": 0.08452224731445312, + "kl": 0.0010483066726010293, + "learning_rate": 7.313019390581718e-07, + "loss": 0.0288, + "step": 88 + }, + { + "clip_ratio": 0.00011130899656563997, + "epoch": 0.0002470863247436132, + "grad_norm": 0.0861300677061081, + "kl": 0.0009267764107789844, + "learning_rate": 7.39612188365651e-07, + "loss": 0.028, + "step": 89 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002498625755834291, + "grad_norm": 0.07036253809928894, + "kl": 0.0008911852783057839, + "learning_rate": 7.479224376731302e-07, + "loss": 0.028, + "step": 90 + }, + { + "clip_ratio": 0.0003118399763479829, + "epoch": 0.000252638826423245, + "grad_norm": 0.09484658390283585, + "kl": 0.0009728774311952293, + "learning_rate": 7.562326869806093e-07, + "loss": 0.0283, + "step": 91 + }, + { + "clip_ratio": 0.00011130899656563997, + "epoch": 0.00025541507726306085, + "grad_norm": 0.07554644346237183, + "kl": 0.0008746770035941154, + "learning_rate": 7.645429362880887e-07, + "loss": 0.0278, + "step": 92 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00025819132810287674, + "grad_norm": 0.1146901547908783, + "kl": 0.0008961272542364895, + "learning_rate": 7.728531855955679e-07, + "loss": 0.0286, + "step": 93 + }, + { + "clip_ratio": 8.191350207198411e-05, + "epoch": 0.00026096757894269264, + "grad_norm": 0.08794571459293365, + "kl": 0.0010003510979004204, + "learning_rate": 7.811634349030472e-07, + "loss": 0.0281, + "step": 94 + }, + { + "clip_ratio": 0.00023231577506521717, + "epoch": 0.00026374382978250853, + "grad_norm": 0.08675868809223175, + "kl": 0.0009201938519254327, + "learning_rate": 7.894736842105263e-07, + "loss": 0.028, + "step": 95 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00026652008062232437, + "grad_norm": 0.07415983825922012, + "kl": 0.0008623613393865526, + "learning_rate": 7.977839335180056e-07, + "loss": 0.028, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.6041717529297, + "epoch": 0.00026929633146214026, + "grad_norm": 0.05498122051358223, + "kl": 0.000935161137022078, + "learning_rate": 8.060941828254847e-07, + "loss": 0.0019, + "reward": 0.18958335369825363, + "reward_std": 0.20226776599884033, + "rewards/countdown_reward_func": 0.18958335369825363, + "step": 97, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.741259080125019e-05, + "epoch": 0.00027207258230195616, + "grad_norm": 0.0805509015917778, + "kl": 0.0009494524856563658, + "learning_rate": 8.14404432132964e-07, + "loss": 0.0017, + "step": 98 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00027484883314177205, + "grad_norm": 0.06182171031832695, + "kl": 0.0010764948674477637, + "learning_rate": 8.227146814404432e-07, + "loss": 0.0018, + "step": 99 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002776250839815879, + "grad_norm": 0.05677403509616852, + "kl": 0.000932041322812438, + "learning_rate": 8.310249307479226e-07, + "loss": 0.0022, + "step": 100 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002804013348214038, + "grad_norm": 0.06866513192653656, + "kl": 0.0009856745309662074, + "learning_rate": 8.393351800554017e-07, + "loss": 0.0018, + "step": 101 + }, + { + "clip_ratio": 0.00016465802764287218, + "epoch": 0.0002831775856612197, + "grad_norm": 0.0790778174996376, + "kl": 0.0009655553149059415, + "learning_rate": 8.47645429362881e-07, + "loss": 0.0018, + "step": 102 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002859538365010355, + "grad_norm": 0.06253324449062347, + "kl": 0.0010523943637963384, + "learning_rate": 8.559556786703601e-07, + "loss": 0.0017, + "step": 103 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0002887300873408514, + "grad_norm": 0.08336114138364792, + "kl": 0.0009701263043098152, + "learning_rate": 8.642659279778394e-07, + "loss": 0.002, + "step": 104 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002915063381806673, + "grad_norm": 0.05647846683859825, + "kl": 0.0010485479142516851, + "learning_rate": 8.725761772853186e-07, + "loss": 0.0017, + "step": 105 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0002942825890204832, + "grad_norm": 0.07278945297002792, + "kl": 0.0010502847435418516, + "learning_rate": 8.808864265927979e-07, + "loss": 0.0019, + "step": 106 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00029705883986029904, + "grad_norm": 0.06493644416332245, + "kl": 0.0009983716008719057, + "learning_rate": 8.89196675900277e-07, + "loss": 0.0018, + "step": 107 + }, + { + "clip_ratio": 0.00026072279433719814, + "epoch": 0.00029983509070011493, + "grad_norm": 0.07456071674823761, + "kl": 0.0010671942727640271, + "learning_rate": 8.975069252077562e-07, + "loss": 0.0021, + "step": 108 + }, + { + "clip_ratio": 0.00034843204775825143, + "completion_length": 220.12500762939453, + "epoch": 0.0003026113415399308, + "grad_norm": 0.09195102006196976, + "kl": 0.001304664183408022, + "learning_rate": 9.058171745152355e-07, + "loss": -0.0043, + "reward": 0.260416679084301, + "reward_std": 0.29160892963409424, + "rewards/countdown_reward_func": 0.2604166641831398, + "step": 109, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00044802867341786623, + "epoch": 0.0003053875923797467, + "grad_norm": 0.11659281700849533, + "kl": 0.001220056030433625, + "learning_rate": 9.141274238227146e-07, + "loss": -0.0037, + "step": 110 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00030816384321956256, + "grad_norm": 0.11090529710054398, + "kl": 0.0011154624517075717, + "learning_rate": 9.22437673130194e-07, + "loss": -0.0036, + "step": 111 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00031094009405937845, + "grad_norm": 0.11202968657016754, + "kl": 0.0010931476717814803, + "learning_rate": 9.307479224376731e-07, + "loss": -0.0036, + "step": 112 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00031371634489919434, + "grad_norm": 0.11670778691768646, + "kl": 0.001031600550049916, + "learning_rate": 9.390581717451524e-07, + "loss": -0.0045, + "step": 113 + }, + { + "clip_ratio": 8.710801193956286e-05, + "epoch": 0.0003164925957390102, + "grad_norm": 0.10784825682640076, + "kl": 0.0010588545119389892, + "learning_rate": 9.473684210526316e-07, + "loss": -0.0044, + "step": 114 + }, + { + "clip_ratio": 0.00035092979669570923, + "epoch": 0.0003192688465788261, + "grad_norm": 0.12178794294595718, + "kl": 0.0014504955615848303, + "learning_rate": 9.55678670360111e-07, + "loss": -0.0033, + "step": 115 + }, + { + "clip_ratio": 0.00010288065823260695, + "epoch": 0.00032204509741864197, + "grad_norm": 0.092310331761837, + "kl": 0.0012508891522884369, + "learning_rate": 9.6398891966759e-07, + "loss": -0.0034, + "step": 116 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00032482134825845786, + "grad_norm": 0.10470046103000641, + "kl": 0.0011784560047090054, + "learning_rate": 9.722991689750693e-07, + "loss": -0.0037, + "step": 117 + }, + { + "clip_ratio": 8.710801193956286e-05, + "epoch": 0.0003275975990982737, + "grad_norm": 0.10416633635759354, + "kl": 0.0012174599687568843, + "learning_rate": 9.806094182825484e-07, + "loss": -0.0036, + "step": 118 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0003303738499380896, + "grad_norm": 0.12231718003749847, + "kl": 0.0010898615000769496, + "learning_rate": 9.889196675900277e-07, + "loss": -0.0035, + "step": 119 + }, + { + "clip_ratio": 0.00019340052676852793, + "epoch": 0.0003331501007779055, + "grad_norm": 0.11489807814359665, + "kl": 0.001152284734416753, + "learning_rate": 9.972299168975068e-07, + "loss": -0.0045, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.6041717529297, + "epoch": 0.0003359263516177214, + "grad_norm": 0.05373651906847954, + "kl": 0.0010495753376744688, + "learning_rate": 1.0055401662049862e-06, + "loss": -0.014, + "reward": 0.19166668504476547, + "reward_std": 0.15476077795028687, + "rewards/countdown_reward_func": 0.19166666641831398, + "step": 121, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0003387026024575372, + "grad_norm": 0.07888471335172653, + "kl": 0.0014498817035928369, + "learning_rate": 1.0138504155124655e-06, + "loss": -0.0136, + "step": 122 + }, + { + "clip_ratio": 9.491267701378092e-05, + "epoch": 0.0003414788532973531, + "grad_norm": 0.11604306101799011, + "kl": 0.0012312422040849924, + "learning_rate": 1.0221606648199446e-06, + "loss": -0.0136, + "step": 123 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000344255104137169, + "grad_norm": 0.059598926454782486, + "kl": 0.001186943962238729, + "learning_rate": 1.030470914127424e-06, + "loss": -0.0142, + "step": 124 + }, + { + "clip_ratio": 9.307520667789504e-05, + "epoch": 0.0003470313549769849, + "grad_norm": 0.08821563422679901, + "kl": 0.0014322291244752705, + "learning_rate": 1.0387811634349032e-06, + "loss": -0.0145, + "step": 125 + }, + { + "clip_ratio": 9.491267701378092e-05, + "epoch": 0.00034980760581680074, + "grad_norm": 0.08867970108985901, + "kl": 0.0012800320982933044, + "learning_rate": 1.0470914127423823e-06, + "loss": -0.014, + "step": 126 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00035258385665661664, + "grad_norm": 0.05335162952542305, + "kl": 0.0010596674110274762, + "learning_rate": 1.0554016620498616e-06, + "loss": -0.0142, + "step": 127 + }, + { + "clip_ratio": 0.00040634788456372917, + "epoch": 0.00035536010749643253, + "grad_norm": 0.08012160658836365, + "kl": 0.0014414938050322235, + "learning_rate": 1.0637119113573407e-06, + "loss": -0.0138, + "step": 128 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00035813635833624837, + "grad_norm": 0.13474638760089874, + "kl": 0.0013173840707167983, + "learning_rate": 1.0720221606648198e-06, + "loss": -0.0142, + "step": 129 + }, + { + "clip_ratio": 0.00018023690790869296, + "epoch": 0.00036091260917606426, + "grad_norm": 0.05427733436226845, + "kl": 0.0011062846169807017, + "learning_rate": 1.0803324099722992e-06, + "loss": -0.0142, + "step": 130 + }, + { + "clip_ratio": 9.307520667789504e-05, + "epoch": 0.00036368886001588016, + "grad_norm": 0.0898558497428894, + "kl": 0.0013623088016174734, + "learning_rate": 1.0886426592797783e-06, + "loss": -0.0141, + "step": 131 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00036646511085569605, + "grad_norm": 0.07763069868087769, + "kl": 0.0013295715907588601, + "learning_rate": 1.0969529085872576e-06, + "loss": -0.0142, + "step": 132 + }, + { + "clip_ratio": 0.00016427145601483062, + "completion_length": 248.33333587646484, + "epoch": 0.0003692413616955119, + "grad_norm": 0.07501526921987534, + "kl": 0.0011433768086135387, + "learning_rate": 1.1052631578947369e-06, + "loss": 0.0024, + "reward": 0.2458333671092987, + "reward_std": 0.23012542724609375, + "rewards/countdown_reward_func": 0.2458333522081375, + "step": 133, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0001751334930304438, + "epoch": 0.0003720176125353278, + "grad_norm": 0.12185314297676086, + "kl": 0.0012197635951451957, + "learning_rate": 1.1135734072022162e-06, + "loss": 0.0026, + "step": 134 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0003747938633751437, + "grad_norm": 0.08777681738138199, + "kl": 0.0011197520070709288, + "learning_rate": 1.1218836565096953e-06, + "loss": 0.0021, + "step": 135 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00037757011421495957, + "grad_norm": 0.08989999443292618, + "kl": 0.0010376300197094679, + "learning_rate": 1.1301939058171746e-06, + "loss": 0.0023, + "step": 136 + }, + { + "clip_ratio": 8.223684562835842e-05, + "epoch": 0.0003803463650547754, + "grad_norm": 0.08074529469013214, + "kl": 0.0010572479804977775, + "learning_rate": 1.1385041551246537e-06, + "loss": 0.0019, + "step": 137 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0003831226158945913, + "grad_norm": 0.08292272686958313, + "kl": 0.0011392622254788876, + "learning_rate": 1.146814404432133e-06, + "loss": 0.0022, + "step": 138 + }, + { + "clip_ratio": 8.289124525617808e-05, + "epoch": 0.0003858988667344072, + "grad_norm": 0.0767727866768837, + "kl": 0.001106983982026577, + "learning_rate": 1.1551246537396122e-06, + "loss": 0.0019, + "step": 139 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0003886751175742231, + "grad_norm": 0.0795488953590393, + "kl": 0.0012870717328041792, + "learning_rate": 1.1634349030470915e-06, + "loss": 0.003, + "step": 140 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00039145136841403893, + "grad_norm": 0.0856185331940651, + "kl": 0.001159544801339507, + "learning_rate": 1.1717451523545706e-06, + "loss": 0.0018, + "step": 141 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0003942276192538548, + "grad_norm": 0.08445297926664352, + "kl": 0.0011639554286375642, + "learning_rate": 1.1800554016620499e-06, + "loss": 0.0025, + "step": 142 + }, + { + "clip_ratio": 8.223684562835842e-05, + "epoch": 0.0003970038700936707, + "grad_norm": 0.07916339486837387, + "kl": 0.0011218629661016166, + "learning_rate": 1.188365650969529e-06, + "loss": 0.0025, + "step": 143 + }, + { + "clip_ratio": 0.00024620501790195704, + "epoch": 0.00039978012093348656, + "grad_norm": 0.0729660838842392, + "kl": 0.0012778243399225175, + "learning_rate": 1.1966759002770083e-06, + "loss": 0.0024, + "step": 144 + }, + { + "clip_ratio": 8.967001485871151e-05, + "completion_length": 232.02083587646484, + "epoch": 0.00040255637177330245, + "grad_norm": 0.16677485406398773, + "kl": 0.0013882183120585978, + "learning_rate": 1.2049861495844876e-06, + "loss": 0.014, + "reward": 0.30416667461395264, + "reward_std": 0.36280614137649536, + "rewards/countdown_reward_func": 0.30416667461395264, + "step": 145, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00010926573304459453, + "epoch": 0.00040533262261311834, + "grad_norm": 0.17154261469841003, + "kl": 0.0015535103739239275, + "learning_rate": 1.213296398891967e-06, + "loss": 0.0145, + "step": 146 + }, + { + "clip_ratio": 8.526603050995618e-05, + "epoch": 0.00040810887345293424, + "grad_norm": 0.11442252993583679, + "kl": 0.001147409901022911, + "learning_rate": 1.221606648199446e-06, + "loss": 0.015, + "step": 147 + }, + { + "clip_ratio": 8.967001485871151e-05, + "epoch": 0.0004108851242927501, + "grad_norm": 0.11827956140041351, + "kl": 0.0013873514835722744, + "learning_rate": 1.2299168975069254e-06, + "loss": 0.0147, + "step": 148 + }, + { + "clip_ratio": 0.0008980906713986769, + "epoch": 0.00041366137513256597, + "grad_norm": 0.10520009696483612, + "kl": 0.001313833869062364, + "learning_rate": 1.2382271468144045e-06, + "loss": 0.0145, + "step": 149 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00041643762597238186, + "grad_norm": 0.11322156339883804, + "kl": 0.0012296418426558375, + "learning_rate": 1.2465373961218836e-06, + "loss": 0.0146, + "step": 150 + }, + { + "clip_ratio": 0.00025922466011252254, + "epoch": 0.00041921387681219776, + "grad_norm": 0.16956517100334167, + "kl": 0.0012730106245726347, + "learning_rate": 1.2548476454293629e-06, + "loss": 0.0134, + "step": 151 + }, + { + "clip_ratio": 0.00010926573304459453, + "epoch": 0.0004219901276520136, + "grad_norm": 0.1768382489681244, + "kl": 0.0016011170810088515, + "learning_rate": 1.263157894736842e-06, + "loss": 0.0138, + "step": 152 + }, + { + "clip_ratio": 8.428860746789724e-05, + "epoch": 0.0004247663784918295, + "grad_norm": 0.11108796298503876, + "kl": 0.0012242573429830372, + "learning_rate": 1.2714681440443213e-06, + "loss": 0.0142, + "step": 153 + }, + { + "clip_ratio": 0.00016696537932148203, + "epoch": 0.0004275426293316454, + "grad_norm": 0.13032646477222443, + "kl": 0.0015075987321324646, + "learning_rate": 1.2797783933518006e-06, + "loss": 0.014, + "step": 154 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004303188801714613, + "grad_norm": 0.09640121459960938, + "kl": 0.0013893723953515291, + "learning_rate": 1.28808864265928e-06, + "loss": 0.0138, + "step": 155 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004330951310112771, + "grad_norm": 0.10653083771467209, + "kl": 0.0013818400911986828, + "learning_rate": 1.296398891966759e-06, + "loss": 0.014, + "step": 156 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 235.75, + "epoch": 0.000435871381851093, + "grad_norm": 0.05519864708185196, + "kl": 0.001432242221198976, + "learning_rate": 1.3047091412742383e-06, + "loss": 0.0092, + "reward": 0.24583333730697632, + "reward_std": 0.12264448031783104, + "rewards/countdown_reward_func": 0.24583333730697632, + "step": 157, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004386476326909089, + "grad_norm": 0.09035968035459518, + "kl": 0.0013828090741299093, + "learning_rate": 1.3130193905817175e-06, + "loss": 0.0092, + "step": 158 + }, + { + "clip_ratio": 0.0005962533032288775, + "epoch": 0.00044142388353072474, + "grad_norm": 0.1271812468767166, + "kl": 0.0021160971373319626, + "learning_rate": 1.3213296398891968e-06, + "loss": 0.0093, + "step": 159 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00044420013437054064, + "grad_norm": 0.0863696038722992, + "kl": 0.001588326005730778, + "learning_rate": 1.3296398891966759e-06, + "loss": 0.0096, + "step": 160 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.00044697638521035653, + "grad_norm": 0.061348333954811096, + "kl": 0.0016502806101925671, + "learning_rate": 1.3379501385041552e-06, + "loss": 0.0092, + "step": 161 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004497526360501724, + "grad_norm": 0.04854971170425415, + "kl": 0.0016185293206945062, + "learning_rate": 1.3462603878116343e-06, + "loss": 0.0094, + "step": 162 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00045252888688998826, + "grad_norm": 0.060902826488018036, + "kl": 0.0025005778297781944, + "learning_rate": 1.3545706371191136e-06, + "loss": 0.009, + "step": 163 + }, + { + "clip_ratio": 0.00011261261533945799, + "epoch": 0.00045530513772980416, + "grad_norm": 0.08311949670314789, + "kl": 0.0018348235171288252, + "learning_rate": 1.3628808864265927e-06, + "loss": 0.0094, + "step": 164 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00045808138856962005, + "grad_norm": 0.1301003396511078, + "kl": 0.0029168009059503675, + "learning_rate": 1.371191135734072e-06, + "loss": 0.01, + "step": 165 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00046085763940943594, + "grad_norm": 0.07615955919027328, + "kl": 0.002113637514412403, + "learning_rate": 1.3795013850415513e-06, + "loss": 0.009, + "step": 166 + }, + { + "clip_ratio": 9.67492233030498e-05, + "epoch": 0.0004636338902492518, + "grad_norm": 0.061112839728593826, + "kl": 0.001958071778062731, + "learning_rate": 1.3878116343490307e-06, + "loss": 0.0092, + "step": 167 + }, + { + "clip_ratio": 0.00017946927982848138, + "epoch": 0.0004664101410890677, + "grad_norm": 0.04938596114516258, + "kl": 0.002263071248307824, + "learning_rate": 1.3961218836565098e-06, + "loss": 0.0092, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.87500762939453, + "epoch": 0.00046918639192888357, + "grad_norm": 0.08904451131820679, + "kl": 0.0021521300077438354, + "learning_rate": 1.404432132963989e-06, + "loss": -0.0054, + "reward": 0.3541666716337204, + "reward_std": 0.3927241712808609, + "rewards/countdown_reward_func": 0.3541666716337204, + "step": 169, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.00047196264276869946, + "grad_norm": 0.11747531592845917, + "kl": 0.0023746880469843745, + "learning_rate": 1.4127423822714682e-06, + "loss": -0.0057, + "step": 170 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004747388936085153, + "grad_norm": 0.10242423415184021, + "kl": 0.0022907587699592113, + "learning_rate": 1.4210526315789473e-06, + "loss": -0.0055, + "step": 171 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004775151444483312, + "grad_norm": 0.09900445491075516, + "kl": 0.0021405539009720087, + "learning_rate": 1.4293628808864266e-06, + "loss": -0.0054, + "step": 172 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004802913952881471, + "grad_norm": 0.13611117005348206, + "kl": 0.0026756724109873176, + "learning_rate": 1.4376731301939057e-06, + "loss": -0.0062, + "step": 173 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00048306764612796293, + "grad_norm": 0.11884411424398422, + "kl": 0.0026626097969710827, + "learning_rate": 1.445983379501385e-06, + "loss": -0.0055, + "step": 174 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004858438969677788, + "grad_norm": 0.09348145127296448, + "kl": 0.002659423043951392, + "learning_rate": 1.4542936288088643e-06, + "loss": -0.0048, + "step": 175 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.0004886201478075947, + "grad_norm": 0.11439940333366394, + "kl": 0.0028921624179929495, + "learning_rate": 1.4626038781163436e-06, + "loss": -0.0065, + "step": 176 + }, + { + "clip_ratio": 9.476876584812999e-05, + "epoch": 0.0004913963986474106, + "grad_norm": 0.21524833142757416, + "kl": 0.002880470361560583, + "learning_rate": 1.4709141274238228e-06, + "loss": -0.0057, + "step": 177 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004941726494872264, + "grad_norm": 0.10695669800043106, + "kl": 0.0028178965440019965, + "learning_rate": 1.479224376731302e-06, + "loss": -0.006, + "step": 178 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004969489003270424, + "grad_norm": 0.12360948324203491, + "kl": 0.0034589068964123726, + "learning_rate": 1.4875346260387812e-06, + "loss": -0.006, + "step": 179 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0004997251511668582, + "grad_norm": 0.1427268236875534, + "kl": 0.0033028185134753585, + "learning_rate": 1.4958448753462605e-06, + "loss": -0.006, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.1041717529297, + "epoch": 0.0005025014020066741, + "grad_norm": 0.06570961326360703, + "kl": 0.0035237747943028808, + "learning_rate": 1.5041551246537396e-06, + "loss": 0.0151, + "reward": 0.166666679084301, + "reward_std": 0.19649019464850426, + "rewards/countdown_reward_func": 0.1666666716337204, + "step": 181, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00050527765284649, + "grad_norm": 0.07061938941478729, + "kl": 0.003632499254308641, + "learning_rate": 1.5124653739612187e-06, + "loss": 0.0151, + "step": 182 + }, + { + "clip_ratio": 0.00017770155682228506, + "epoch": 0.0005080539036863059, + "grad_norm": 0.09591417759656906, + "kl": 0.004101799800992012, + "learning_rate": 1.5207756232686982e-06, + "loss": 0.0149, + "step": 183 + }, + { + "clip_ratio": 8.169934881152585e-05, + "epoch": 0.0005108301545261217, + "grad_norm": 0.0730750560760498, + "kl": 0.003928871126845479, + "learning_rate": 1.5290858725761773e-06, + "loss": 0.015, + "step": 184 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005136064053659377, + "grad_norm": 0.08014266192913055, + "kl": 0.003984199371188879, + "learning_rate": 1.5373961218836564e-06, + "loss": 0.0144, + "step": 185 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0005163826562057535, + "grad_norm": 0.08353135734796524, + "kl": 0.004139116033911705, + "learning_rate": 1.5457063711911357e-06, + "loss": 0.0146, + "step": 186 + }, + { + "clip_ratio": 0.00018080235895467922, + "epoch": 0.0005191589070455693, + "grad_norm": 0.07052478194236755, + "kl": 0.004334121011197567, + "learning_rate": 1.554016620498615e-06, + "loss": 0.0151, + "step": 187 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005219351578853853, + "grad_norm": 0.12264339625835419, + "kl": 0.004643098916858435, + "learning_rate": 1.5623268698060944e-06, + "loss": 0.0148, + "step": 188 + }, + { + "clip_ratio": 8.7596352386754e-05, + "epoch": 0.0005247114087252011, + "grad_norm": 0.10199768096208572, + "kl": 0.00538562866859138, + "learning_rate": 1.5706371191135735e-06, + "loss": 0.014, + "step": 189 + }, + { + "clip_ratio": 8.169934881152585e-05, + "epoch": 0.0005274876595650171, + "grad_norm": 0.07175012677907944, + "kl": 0.005290654953569174, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.0147, + "step": 190 + }, + { + "clip_ratio": 0.00017524592112749815, + "epoch": 0.0005302639104048329, + "grad_norm": 0.08007644861936569, + "kl": 0.0052713199984282255, + "learning_rate": 1.5872576177285321e-06, + "loss": 0.0146, + "step": 191 + }, + { + "clip_ratio": 8.538251131540164e-05, + "epoch": 0.0005330401612446487, + "grad_norm": 0.08058490604162216, + "kl": 0.005752292228862643, + "learning_rate": 1.5955678670360112e-06, + "loss": 0.0142, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.25000762939453, + "epoch": 0.0005358164120844647, + "grad_norm": 0.07866574823856354, + "kl": 0.006615105550736189, + "learning_rate": 1.6038781163434903e-06, + "loss": -0.005, + "reward": 0.1458333507180214, + "reward_std": 0.1595480851829052, + "rewards/countdown_reward_func": 0.1458333432674408, + "step": 193, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005385926629242805, + "grad_norm": 0.10931258648633957, + "kl": 0.006460321601480246, + "learning_rate": 1.6121883656509694e-06, + "loss": -0.0049, + "step": 194 + }, + { + "clip_ratio": 8.37240440887399e-05, + "epoch": 0.0005413689137640964, + "grad_norm": 0.08174652606248856, + "kl": 0.0075355947483330965, + "learning_rate": 1.6204986149584487e-06, + "loss": -0.0045, + "step": 195 + }, + { + "clip_ratio": 8.37240440887399e-05, + "epoch": 0.0005441451646039123, + "grad_norm": 0.08151189982891083, + "kl": 0.006791774649173021, + "learning_rate": 1.628808864265928e-06, + "loss": -0.0046, + "step": 196 + }, + { + "clip_ratio": 9.077705180970952e-05, + "epoch": 0.0005469214154437282, + "grad_norm": 0.06075465306639671, + "kl": 0.007927711587399244, + "learning_rate": 1.6371191135734074e-06, + "loss": -0.005, + "step": 197 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005496976662835441, + "grad_norm": 0.06800781190395355, + "kl": 0.007068223087117076, + "learning_rate": 1.6454293628808865e-06, + "loss": -0.005, + "step": 198 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005524739171233599, + "grad_norm": 0.08043520897626877, + "kl": 0.008283360861241817, + "learning_rate": 1.6537396121883656e-06, + "loss": -0.0051, + "step": 199 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005552501679631758, + "grad_norm": 0.08258956670761108, + "kl": 0.007602859288454056, + "learning_rate": 1.662049861495845e-06, + "loss": -0.0056, + "step": 200 + }, + { + "clip_ratio": 8.37240440887399e-05, + "epoch": 0.0005580264188029917, + "grad_norm": 0.08197806775569916, + "kl": 0.009075871668756008, + "learning_rate": 1.6703601108033242e-06, + "loss": -0.0042, + "step": 201 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005608026696428076, + "grad_norm": 0.08293389528989792, + "kl": 0.0072558128740638494, + "learning_rate": 1.6786703601108033e-06, + "loss": -0.0044, + "step": 202 + }, + { + "clip_ratio": 0.00018364480638410896, + "epoch": 0.0005635789204826234, + "grad_norm": 0.0675339549779892, + "kl": 0.008414975367486477, + "learning_rate": 1.6869806094182824e-06, + "loss": -0.0051, + "step": 203 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005663551713224394, + "grad_norm": 0.0665825828909874, + "kl": 0.007153096608817577, + "learning_rate": 1.695290858725762e-06, + "loss": -0.0048, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.5416717529297, + "epoch": 0.0005691314221622552, + "grad_norm": 0.0898197814822197, + "kl": 0.0062689268961548805, + "learning_rate": 1.703601108033241e-06, + "loss": -0.0212, + "reward": 0.24375002086162567, + "reward_std": 0.23638741672039032, + "rewards/countdown_reward_func": 0.24375001341104507, + "step": 205, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0007762892200844362, + "epoch": 0.000571907673002071, + "grad_norm": 0.11292041838169098, + "kl": 0.0070912938099354506, + "learning_rate": 1.7119113573407201e-06, + "loss": -0.0204, + "step": 206 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000574683923841887, + "grad_norm": 0.1421259045600891, + "kl": 0.006881088018417358, + "learning_rate": 1.7202216066481995e-06, + "loss": -0.0206, + "step": 207 + }, + { + "clip_ratio": 0.00012413108197506517, + "epoch": 0.0005774601746817028, + "grad_norm": 0.12615558505058289, + "kl": 0.006374599179252982, + "learning_rate": 1.7285318559556788e-06, + "loss": -0.022, + "step": 208 + }, + { + "clip_ratio": 0.0001857691750046797, + "epoch": 0.0005802364255215188, + "grad_norm": 0.11529010534286499, + "kl": 0.007537527941167355, + "learning_rate": 1.736842105263158e-06, + "loss": -0.0217, + "step": 209 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0005830126763613346, + "grad_norm": 0.12297216802835464, + "kl": 0.006081829313188791, + "learning_rate": 1.7451523545706372e-06, + "loss": -0.0219, + "step": 210 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0005857889272011504, + "grad_norm": 0.09268545359373093, + "kl": 0.005436737323179841, + "learning_rate": 1.7534626038781163e-06, + "loss": -0.0217, + "step": 211 + }, + { + "clip_ratio": 0.00048481223348062485, + "epoch": 0.0005885651780409664, + "grad_norm": 0.09842047840356827, + "kl": 0.005601341603323817, + "learning_rate": 1.7617728531855958e-06, + "loss": -0.021, + "step": 212 + }, + { + "clip_ratio": 0.00019936203898396343, + "epoch": 0.0005913414288807822, + "grad_norm": 0.13735024631023407, + "kl": 0.005223254906013608, + "learning_rate": 1.770083102493075e-06, + "loss": -0.022, + "step": 213 + }, + { + "clip_ratio": 0.00022600556985707954, + "epoch": 0.0005941176797205981, + "grad_norm": 0.11505624651908875, + "kl": 0.004508420126512647, + "learning_rate": 1.778393351800554e-06, + "loss": -0.0228, + "step": 214 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000596893930560414, + "grad_norm": 0.10706788301467896, + "kl": 0.005065717967227101, + "learning_rate": 1.7867036011080331e-06, + "loss": -0.022, + "step": 215 + }, + { + "clip_ratio": 8.6088155512698e-05, + "epoch": 0.0005996701814002299, + "grad_norm": 0.11780065298080444, + "kl": 0.004196330206468701, + "learning_rate": 1.7950138504155125e-06, + "loss": -0.0226, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.7291717529297, + "epoch": 0.0006024464322400457, + "grad_norm": 0.07949130237102509, + "kl": 0.004571998957544565, + "learning_rate": 1.8033240997229918e-06, + "loss": 0.0072, + "reward": 0.24583334475755692, + "reward_std": 0.22264151275157928, + "rewards/countdown_reward_func": 0.24583334475755692, + "step": 217, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006052226830798616, + "grad_norm": 0.10650928318500519, + "kl": 0.004555474733933806, + "learning_rate": 1.811634349030471e-06, + "loss": 0.0074, + "step": 218 + }, + { + "clip_ratio": 9.95222944766283e-05, + "epoch": 0.0006079989339196775, + "grad_norm": 0.10155003517866135, + "kl": 0.0035600599367171526, + "learning_rate": 1.8199445983379502e-06, + "loss": 0.0071, + "step": 219 + }, + { + "clip_ratio": 0.00018531362002249807, + "epoch": 0.0006107751847594934, + "grad_norm": 0.09107557684183121, + "kl": 0.0034167662961408496, + "learning_rate": 1.8282548476454293e-06, + "loss": 0.0078, + "step": 220 + }, + { + "clip_ratio": 9.95222944766283e-05, + "epoch": 0.0006135514355993093, + "grad_norm": 0.13129286468029022, + "kl": 0.003628110629506409, + "learning_rate": 1.8365650969529088e-06, + "loss": 0.008, + "step": 221 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0006163276864391251, + "grad_norm": 0.07515281438827515, + "kl": 0.0033600571332499385, + "learning_rate": 1.844875346260388e-06, + "loss": 0.0079, + "step": 222 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0006191039372789411, + "grad_norm": 0.07991602271795273, + "kl": 0.0036217205924913287, + "learning_rate": 1.853185595567867e-06, + "loss": 0.0073, + "step": 223 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006218801881187569, + "grad_norm": 0.1082308441400528, + "kl": 0.0038258974673226476, + "learning_rate": 1.8614958448753461e-06, + "loss": 0.0074, + "step": 224 + }, + { + "clip_ratio": 0.00037994710146449506, + "epoch": 0.0006246564389585727, + "grad_norm": 0.09276928007602692, + "kl": 0.0031863467302173376, + "learning_rate": 1.8698060941828257e-06, + "loss": 0.0073, + "step": 225 + }, + { + "clip_ratio": 8.25082533992827e-05, + "epoch": 0.0006274326897983887, + "grad_norm": 0.09415683150291443, + "kl": 0.002969763823784888, + "learning_rate": 1.8781163434903048e-06, + "loss": 0.0079, + "step": 226 + }, + { + "clip_ratio": 0.00026924171834252775, + "epoch": 0.0006302089406382045, + "grad_norm": 0.12183138728141785, + "kl": 0.003270600689575076, + "learning_rate": 1.8864265927977839e-06, + "loss": 0.0078, + "step": 227 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006329851914780204, + "grad_norm": 0.07732009142637253, + "kl": 0.002818679786287248, + "learning_rate": 1.8947368421052632e-06, + "loss": 0.0079, + "step": 228 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 227.75000762939453, + "epoch": 0.0006357614423178363, + "grad_norm": 0.10973645746707916, + "kl": 0.003377788234502077, + "learning_rate": 1.9030470914127425e-06, + "loss": 0.0024, + "reward": 0.2458333522081375, + "reward_std": 0.3024734854698181, + "rewards/countdown_reward_func": 0.24583334475755692, + "step": 229, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0001726210830383934, + "epoch": 0.0006385376931576522, + "grad_norm": 0.0772981122136116, + "kl": 0.003124785842373967, + "learning_rate": 1.911357340720222e-06, + "loss": 0.0019, + "step": 230 + }, + { + "clip_ratio": 9.286775457439944e-05, + "epoch": 0.0006413139439974681, + "grad_norm": 0.2130415439605713, + "kl": 0.003285923390649259, + "learning_rate": 1.919667590027701e-06, + "loss": 0.003, + "step": 231 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006440901948372839, + "grad_norm": 0.09580881893634796, + "kl": 0.002900973428040743, + "learning_rate": 1.92797783933518e-06, + "loss": 0.0023, + "step": 232 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006468664456770998, + "grad_norm": 0.09705420583486557, + "kl": 0.003031375934369862, + "learning_rate": 1.9362880886426595e-06, + "loss": 0.002, + "step": 233 + }, + { + "clip_ratio": 9.177679748972878e-05, + "epoch": 0.0006496426965169157, + "grad_norm": 0.08564502000808716, + "kl": 0.0033782614627853036, + "learning_rate": 1.9445983379501387e-06, + "loss": 0.0024, + "step": 234 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006524189473567316, + "grad_norm": 0.11757489293813705, + "kl": 0.003013497218489647, + "learning_rate": 1.9529085872576178e-06, + "loss": 0.0018, + "step": 235 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006551951981965474, + "grad_norm": 0.08156201988458633, + "kl": 0.0028692481573671103, + "learning_rate": 1.961218836565097e-06, + "loss": 0.0022, + "step": 236 + }, + { + "clip_ratio": 9.124087227974087e-05, + "epoch": 0.0006579714490363634, + "grad_norm": 0.20828205347061157, + "kl": 0.00310861156322062, + "learning_rate": 1.969529085872576e-06, + "loss": 0.0024, + "step": 237 + }, + { + "clip_ratio": 0.0011520737316459417, + "epoch": 0.0006607476998761792, + "grad_norm": 0.09965456277132034, + "kl": 0.0033284203382208943, + "learning_rate": 1.9778393351800555e-06, + "loss": 0.0024, + "step": 238 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006635239507159951, + "grad_norm": 0.08843494206666946, + "kl": 0.0030406563309952617, + "learning_rate": 1.9861495844875346e-06, + "loss": 0.0018, + "step": 239 + }, + { + "clip_ratio": 0.00018355359497945756, + "epoch": 0.000666300201555811, + "grad_norm": 0.08795179426670074, + "kl": 0.0030062462901696563, + "learning_rate": 1.9944598337950137e-06, + "loss": 0.0022, + "step": 240 + }, + { + "clip_ratio": 8.698677993379533e-05, + "completion_length": 239.81250762939453, + "epoch": 0.0006690764523956268, + "grad_norm": 0.06673049926757812, + "kl": 0.002854794613085687, + "learning_rate": 2.002770083102493e-06, + "loss": -0.0041, + "reward": 0.1875000149011612, + "reward_std": 0.16496698185801506, + "rewards/countdown_reward_func": 0.1875000149011612, + "step": 241, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006718527032354428, + "grad_norm": 0.05837683007121086, + "kl": 0.0029842500807717443, + "learning_rate": 2.0110803324099723e-06, + "loss": -0.0044, + "step": 242 + }, + { + "clip_ratio": 8.698677993379533e-05, + "epoch": 0.0006746289540752586, + "grad_norm": 0.05153853818774223, + "kl": 0.0027709035202860832, + "learning_rate": 2.0193905817174514e-06, + "loss": -0.0041, + "step": 243 + }, + { + "clip_ratio": 0.00016836699796840549, + "epoch": 0.0006774052049150744, + "grad_norm": 0.09485623240470886, + "kl": 0.003231698414310813, + "learning_rate": 2.027700831024931e-06, + "loss": -0.0041, + "step": 244 + }, + { + "clip_ratio": 9.104151831706986e-05, + "epoch": 0.0006801814557548904, + "grad_norm": 0.06381552666425705, + "kl": 0.003178777056746185, + "learning_rate": 2.03601108033241e-06, + "loss": -0.0044, + "step": 245 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006829577065947062, + "grad_norm": 0.06970212608575821, + "kl": 0.0039439547108486295, + "learning_rate": 2.044321329639889e-06, + "loss": -0.0044, + "step": 246 + }, + { + "clip_ratio": 0.0007053310982882977, + "epoch": 0.0006857339574345221, + "grad_norm": 0.05486287549138069, + "kl": 0.0030296844197437167, + "learning_rate": 2.0526315789473687e-06, + "loss": -0.0041, + "step": 247 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000688510208274338, + "grad_norm": 0.05788696929812431, + "kl": 0.0034874266711995006, + "learning_rate": 2.060941828254848e-06, + "loss": -0.0043, + "step": 248 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006912864591141539, + "grad_norm": 0.05053585022687912, + "kl": 0.003134583937935531, + "learning_rate": 2.069252077562327e-06, + "loss": -0.0045, + "step": 249 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006940627099539698, + "grad_norm": 0.08815469592809677, + "kl": 0.0033259709598496556, + "learning_rate": 2.0775623268698064e-06, + "loss": -0.0048, + "step": 250 + }, + { + "clip_ratio": 9.104151831706986e-05, + "epoch": 0.0006968389607937856, + "grad_norm": 0.06786980479955673, + "kl": 0.00334134662989527, + "learning_rate": 2.0858725761772855e-06, + "loss": -0.0048, + "step": 251 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0006996152116336015, + "grad_norm": 0.06675417721271515, + "kl": 0.0038671550573781133, + "learning_rate": 2.0941828254847646e-06, + "loss": -0.0048, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.33334350585938, + "epoch": 0.0007023914624734174, + "grad_norm": 0.10852495580911636, + "kl": 0.003565722843632102, + "learning_rate": 2.1024930747922437e-06, + "loss": 0.0025, + "reward": 0.24791669100522995, + "reward_std": 0.22883932292461395, + "rewards/countdown_reward_func": 0.24791669100522995, + "step": 253, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00017266585200559348, + "epoch": 0.0007051677133132333, + "grad_norm": 0.10487343370914459, + "kl": 0.0033710359130054712, + "learning_rate": 2.1108033240997233e-06, + "loss": 0.0034, + "step": 254 + }, + { + "clip_ratio": 0.00030323654209496453, + "epoch": 0.0007079439641530491, + "grad_norm": 0.10246977210044861, + "kl": 0.003350336686708033, + "learning_rate": 2.1191135734072024e-06, + "loss": 0.0024, + "step": 255 + }, + { + "clip_ratio": 0.0002672308764886111, + "epoch": 0.0007107202149928651, + "grad_norm": 0.11946409195661545, + "kl": 0.0036674703005701303, + "learning_rate": 2.1274238227146815e-06, + "loss": 0.0023, + "step": 256 + }, + { + "clip_ratio": 0.0001684882226982154, + "epoch": 0.0007134964658326809, + "grad_norm": 0.0756080374121666, + "kl": 0.0034829488722607493, + "learning_rate": 2.1357340720221606e-06, + "loss": 0.0024, + "step": 257 + }, + { + "clip_ratio": 0.00016693805810064077, + "epoch": 0.0007162727166724967, + "grad_norm": 0.11272068321704865, + "kl": 0.0037394753890112042, + "learning_rate": 2.1440443213296397e-06, + "loss": 0.0026, + "step": 258 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007190489675123127, + "grad_norm": 0.10757721215486526, + "kl": 0.003679752117022872, + "learning_rate": 2.152354570637119e-06, + "loss": 0.0022, + "step": 259 + }, + { + "clip_ratio": 0.00017266585200559348, + "epoch": 0.0007218252183521285, + "grad_norm": 0.07714775204658508, + "kl": 0.003283996251411736, + "learning_rate": 2.1606648199445983e-06, + "loss": 0.0029, + "step": 260 + }, + { + "clip_ratio": 0.00020028267317684367, + "epoch": 0.0007246014691919445, + "grad_norm": 0.08071676641702652, + "kl": 0.0033569439547136426, + "learning_rate": 2.1689750692520774e-06, + "loss": 0.0022, + "step": 261 + }, + { + "clip_ratio": 0.0005443122354336083, + "epoch": 0.0007273777200317603, + "grad_norm": 0.11737988144159317, + "kl": 0.004172776127234101, + "learning_rate": 2.1772853185595565e-06, + "loss": 0.0027, + "step": 262 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007301539708715762, + "grad_norm": 0.0743941143155098, + "kl": 0.003631085390225053, + "learning_rate": 2.185595567867036e-06, + "loss": 0.0024, + "step": 263 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0007329302217113921, + "grad_norm": 0.10840493440628052, + "kl": 0.003976256353780627, + "learning_rate": 2.193905817174515e-06, + "loss": 0.0021, + "step": 264 + }, + { + "clip_ratio": 0.00016276042151730508, + "completion_length": 217.4791717529297, + "epoch": 0.0007357064725512079, + "grad_norm": 0.08730115741491318, + "kl": 0.0036497570108622313, + "learning_rate": 2.2022160664819947e-06, + "loss": -0.0134, + "reward": 0.24375002086162567, + "reward_std": 0.19520405679941177, + "rewards/countdown_reward_func": 0.24375002086162567, + "step": 265, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007384827233910238, + "grad_norm": 0.07678765058517456, + "kl": 0.003502070438116789, + "learning_rate": 2.2105263157894738e-06, + "loss": -0.0136, + "step": 266 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007412589742308397, + "grad_norm": 0.10239151120185852, + "kl": 0.004613874014467001, + "learning_rate": 2.218836565096953e-06, + "loss": -0.0133, + "step": 267 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007440352250706556, + "grad_norm": 0.09318691492080688, + "kl": 0.003522416460327804, + "learning_rate": 2.2271468144044324e-06, + "loss": -0.0137, + "step": 268 + }, + { + "clip_ratio": 0.00012007684563286602, + "epoch": 0.0007468114759104715, + "grad_norm": 0.10666222870349884, + "kl": 0.0034163352102041245, + "learning_rate": 2.2354570637119115e-06, + "loss": -0.0136, + "step": 269 + }, + { + "clip_ratio": 0.00018601190822664648, + "epoch": 0.0007495877267502874, + "grad_norm": 0.07045263051986694, + "kl": 0.003822776139713824, + "learning_rate": 2.2437673130193906e-06, + "loss": -0.0135, + "step": 270 + }, + { + "clip_ratio": 0.00016324006719514728, + "epoch": 0.0007523639775901032, + "grad_norm": 0.08979963511228561, + "kl": 0.003612265922129154, + "learning_rate": 2.25207756232687e-06, + "loss": -0.0136, + "step": 271 + }, + { + "clip_ratio": 9.300595411332324e-05, + "epoch": 0.0007551402284299191, + "grad_norm": 0.07281750440597534, + "kl": 0.0034800268476828933, + "learning_rate": 2.2603878116343493e-06, + "loss": -0.014, + "step": 272 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000757916479269735, + "grad_norm": 0.1016596257686615, + "kl": 0.004669018788263202, + "learning_rate": 2.2686980609418284e-06, + "loss": -0.0135, + "step": 273 + }, + { + "clip_ratio": 0.00021114865376148373, + "epoch": 0.0007606927301095508, + "grad_norm": 0.09079304337501526, + "kl": 0.0033616855507716537, + "learning_rate": 2.2770083102493075e-06, + "loss": -0.0134, + "step": 274 + }, + { + "clip_ratio": 0.00019769607024500147, + "epoch": 0.0007634689809493668, + "grad_norm": 0.11171045154333115, + "kl": 0.0033946483163163066, + "learning_rate": 2.285318559556787e-06, + "loss": -0.0136, + "step": 275 + }, + { + "clip_ratio": 0.00032530390308238566, + "epoch": 0.0007662452317891826, + "grad_norm": 0.06686889380216599, + "kl": 0.004112225957214832, + "learning_rate": 2.293628808864266e-06, + "loss": -0.0133, + "step": 276 + }, + { + "clip_ratio": 0.000663017388433218, + "completion_length": 232.50000762939453, + "epoch": 0.0007690214826289984, + "grad_norm": 0.09006670117378235, + "kl": 0.003324171178974211, + "learning_rate": 2.301939058171745e-06, + "loss": 0.0007, + "reward": 0.3541666865348816, + "reward_std": 0.34993430972099304, + "rewards/countdown_reward_func": 0.3541666865348816, + "step": 277, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007717977334688144, + "grad_norm": 0.10477007925510406, + "kl": 0.003485492314212024, + "learning_rate": 2.3102493074792243e-06, + "loss": -0.0001, + "step": 278 + }, + { + "clip_ratio": 0.0001145737842307426, + "epoch": 0.0007745739843086302, + "grad_norm": 0.11382175236940384, + "kl": 0.003511702874675393, + "learning_rate": 2.3185595567867034e-06, + "loss": 0.0, + "step": 279 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0007773502351484462, + "grad_norm": 0.09704895317554474, + "kl": 0.0034232381731271744, + "learning_rate": 2.326869806094183e-06, + "loss": 0.0006, + "step": 280 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000780126485988262, + "grad_norm": 0.11391977220773697, + "kl": 0.0033790983725339174, + "learning_rate": 2.335180055401662e-06, + "loss": 0.0002, + "step": 281 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007829027368280779, + "grad_norm": 0.3949395418167114, + "kl": 0.0036244012881070375, + "learning_rate": 2.343490304709141e-06, + "loss": 0.0005, + "step": 282 + }, + { + "clip_ratio": 0.0006535947904922068, + "epoch": 0.0007856789876678938, + "grad_norm": 0.10197897255420685, + "kl": 0.00319200346712023, + "learning_rate": 2.3518005540166202e-06, + "loss": 0.0001, + "step": 283 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0007884552385077096, + "grad_norm": 0.10884040594100952, + "kl": 0.0036248579854145646, + "learning_rate": 2.3601108033240998e-06, + "loss": -0.0003, + "step": 284 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007912314893475255, + "grad_norm": 0.1121264100074768, + "kl": 0.003715887665748596, + "learning_rate": 2.368421052631579e-06, + "loss": -0.0012, + "step": 285 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007940077401873414, + "grad_norm": 0.10213881731033325, + "kl": 0.003711581346578896, + "learning_rate": 2.376731301939058e-06, + "loss": -0.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0007967839910271573, + "grad_norm": 0.15096428990364075, + "kl": 0.004820869769901037, + "learning_rate": 2.3850415512465375e-06, + "loss": -0.0011, + "step": 287 + }, + { + "clip_ratio": 0.0001633986976230517, + "epoch": 0.0007995602418669731, + "grad_norm": 0.12257353216409683, + "kl": 0.004069758579134941, + "learning_rate": 2.3933518005540166e-06, + "loss": -0.0001, + "step": 288 + }, + { + "clip_ratio": 0.00025150904548354447, + "completion_length": 229.6041717529297, + "epoch": 0.0008023364927067891, + "grad_norm": 0.1052643209695816, + "kl": 0.004225482931360602, + "learning_rate": 2.401662049861496e-06, + "loss": 0.0125, + "reward": 0.22500000149011612, + "reward_std": 0.2582135424017906, + "rewards/countdown_reward_func": 0.22500000149011612, + "step": 289, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0008051127435466049, + "grad_norm": 0.08905075490474701, + "kl": 0.004490195075049996, + "learning_rate": 2.4099722991689752e-06, + "loss": 0.0132, + "step": 290 + }, + { + "clip_ratio": 0.0005068937316536903, + "epoch": 0.0008078889943864208, + "grad_norm": 0.07550480216741562, + "kl": 0.0038030524738132954, + "learning_rate": 2.4182825484764543e-06, + "loss": 0.012, + "step": 291 + }, + { + "clip_ratio": 0.00012575452274177223, + "epoch": 0.0008106652452262367, + "grad_norm": 0.08875437080860138, + "kl": 0.00425003154668957, + "learning_rate": 2.426592797783934e-06, + "loss": 0.0121, + "step": 292 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0008134414960660525, + "grad_norm": 0.08870893716812134, + "kl": 0.004312893608585, + "learning_rate": 2.434903047091413e-06, + "loss": 0.0117, + "step": 293 + }, + { + "clip_ratio": 0.0008138020639307797, + "epoch": 0.0008162177469058685, + "grad_norm": 0.08667095005512238, + "kl": 0.004525796044617891, + "learning_rate": 2.443213296398892e-06, + "loss": 0.0117, + "step": 294 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0008189939977456843, + "grad_norm": 0.10581125319004059, + "kl": 0.004761378280818462, + "learning_rate": 2.451523545706371e-06, + "loss": 0.0118, + "step": 295 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0008217702485855002, + "grad_norm": 0.08822281658649445, + "kl": 0.005203166743740439, + "learning_rate": 2.4598337950138507e-06, + "loss": 0.0123, + "step": 296 + }, + { + "clip_ratio": 0.0010381632746430114, + "epoch": 0.0008245464994253161, + "grad_norm": 0.07447604089975357, + "kl": 0.00508764130063355, + "learning_rate": 2.46814404432133e-06, + "loss": 0.012, + "step": 297 + }, + { + "clip_ratio": 0.00037726358277723193, + "epoch": 0.0008273227502651319, + "grad_norm": 0.087750643491745, + "kl": 0.005412213504314423, + "learning_rate": 2.476454293628809e-06, + "loss": 0.0111, + "step": 298 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0008300990011049479, + "grad_norm": 0.08693079650402069, + "kl": 0.005746564595028758, + "learning_rate": 2.484764542936288e-06, + "loss": 0.0114, + "step": 299 + }, + { + "clip_ratio": 0.0007324218604480848, + "epoch": 0.0008328752519447637, + "grad_norm": 0.09118447452783585, + "kl": 0.0058568131644278765, + "learning_rate": 2.493074792243767e-06, + "loss": 0.0116, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.0, + "epoch": 0.0008356515027845796, + "grad_norm": 0.09235308319330215, + "kl": 0.006070051807910204, + "learning_rate": 2.5013850415512467e-06, + "loss": -0.0069, + "reward": 0.20000001043081284, + "reward_std": 0.2610399127006531, + "rewards/countdown_reward_func": 0.20000001043081284, + "step": 301, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 8.361203799722716e-05, + "epoch": 0.0008384277536243955, + "grad_norm": 0.08406054228544235, + "kl": 0.0060172725934535265, + "learning_rate": 2.5096952908587258e-06, + "loss": -0.0069, + "step": 302 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0008412040044642114, + "grad_norm": 0.0954718291759491, + "kl": 0.006230462109670043, + "learning_rate": 2.518005540166205e-06, + "loss": -0.0066, + "step": 303 + }, + { + "clip_ratio": 0.00010575295891612768, + "epoch": 0.0008439802553040272, + "grad_norm": 0.09710827469825745, + "kl": 0.00680284365080297, + "learning_rate": 2.526315789473684e-06, + "loss": -0.007, + "step": 304 + }, + { + "clip_ratio": 0.0002602904787636362, + "epoch": 0.0008467565061438431, + "grad_norm": 0.09757263213396072, + "kl": 0.007144181756302714, + "learning_rate": 2.5346260387811635e-06, + "loss": -0.0062, + "step": 305 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000849532756983659, + "grad_norm": 0.07679014652967453, + "kl": 0.008339704247191548, + "learning_rate": 2.5429362880886426e-06, + "loss": -0.0076, + "step": 306 + }, + { + "clip_ratio": 8.361203799722716e-05, + "epoch": 0.0008523090078234748, + "grad_norm": 0.09126242995262146, + "kl": 0.0085701416246593, + "learning_rate": 2.5512465373961217e-06, + "loss": -0.0071, + "step": 307 + }, + { + "clip_ratio": 8.361203799722716e-05, + "epoch": 0.0008550852586632908, + "grad_norm": 0.08423109352588654, + "kl": 0.007637211121618748, + "learning_rate": 2.5595567867036012e-06, + "loss": -0.0073, + "step": 308 + }, + { + "clip_ratio": 8.833922038320452e-05, + "epoch": 0.0008578615095031066, + "grad_norm": 0.09705601632595062, + "kl": 0.007593115558847785, + "learning_rate": 2.5678670360110803e-06, + "loss": -0.0067, + "step": 309 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0008606377603429226, + "grad_norm": 0.09621502459049225, + "kl": 0.008113941876217723, + "learning_rate": 2.57617728531856e-06, + "loss": -0.0075, + "step": 310 + }, + { + "clip_ratio": 0.00020451023010537028, + "epoch": 0.0008634140111827384, + "grad_norm": 0.10358273983001709, + "kl": 0.008504221215844154, + "learning_rate": 2.584487534626039e-06, + "loss": -0.0073, + "step": 311 + }, + { + "clip_ratio": 0.00016722407599445432, + "epoch": 0.0008661902620225542, + "grad_norm": 0.08098059147596359, + "kl": 0.009506989270448685, + "learning_rate": 2.592797783933518e-06, + "loss": -0.0075, + "step": 312 + }, + { + "clip_ratio": 0.00016914748994167894, + "completion_length": 226.37500762939453, + "epoch": 0.0008689665128623702, + "grad_norm": 0.0626518577337265, + "kl": 0.008252686355262995, + "learning_rate": 2.6011080332409976e-06, + "loss": 0.0071, + "reward": 0.2291666716337204, + "reward_std": 0.21667250245809555, + "rewards/countdown_reward_func": 0.2291666641831398, + "step": 313, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.000871742763702186, + "grad_norm": 0.0984596535563469, + "kl": 0.010352411307394505, + "learning_rate": 2.6094182825484767e-06, + "loss": 0.0072, + "step": 314 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0008745190145420019, + "grad_norm": 0.09516298025846481, + "kl": 0.011239033192396164, + "learning_rate": 2.617728531855956e-06, + "loss": 0.0077, + "step": 315 + }, + { + "clip_ratio": 9.09090886125341e-05, + "epoch": 0.0008772952653818178, + "grad_norm": 0.10682272166013718, + "kl": 0.010467468295246363, + "learning_rate": 2.626038781163435e-06, + "loss": 0.0069, + "step": 316 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0008800715162216336, + "grad_norm": 0.08754424750804901, + "kl": 0.010327389929443598, + "learning_rate": 2.6343490304709144e-06, + "loss": 0.0072, + "step": 317 + }, + { + "clip_ratio": 9.09090886125341e-05, + "epoch": 0.0008828477670614495, + "grad_norm": 0.12904123961925507, + "kl": 0.010680126026272774, + "learning_rate": 2.6426592797783935e-06, + "loss": 0.0072, + "step": 318 + }, + { + "clip_ratio": 0.00025372125674039125, + "epoch": 0.0008856240179012654, + "grad_norm": 0.06477981805801392, + "kl": 0.009791890624910593, + "learning_rate": 2.6509695290858726e-06, + "loss": 0.0066, + "step": 319 + }, + { + "clip_ratio": 0.00040258895023725927, + "epoch": 0.0008884002687410813, + "grad_norm": 0.185244619846344, + "kl": 0.010858574416488409, + "learning_rate": 2.6592797783933517e-06, + "loss": 0.0068, + "step": 320 + }, + { + "clip_ratio": 9.968101949198171e-05, + "epoch": 0.0008911765195808972, + "grad_norm": 0.08614979684352875, + "kl": 0.014307011850178242, + "learning_rate": 2.667590027700831e-06, + "loss": 0.0067, + "step": 321 + }, + { + "clip_ratio": 0.00027272728038951755, + "epoch": 0.0008939527704207131, + "grad_norm": 0.10753199458122253, + "kl": 0.013140483759343624, + "learning_rate": 2.6759002770083104e-06, + "loss": 0.0054, + "step": 322 + }, + { + "clip_ratio": 0.00029475959308911115, + "epoch": 0.0008967290212605289, + "grad_norm": 0.10408802330493927, + "kl": 0.01228410704061389, + "learning_rate": 2.6842105263157895e-06, + "loss": 0.0061, + "step": 323 + }, + { + "clip_ratio": 0.00019936203898396343, + "epoch": 0.0008995052721003448, + "grad_norm": 0.12455835938453674, + "kl": 0.014058600179851055, + "learning_rate": 2.6925207756232686e-06, + "loss": 0.0061, + "step": 324 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 237.64583587646484, + "epoch": 0.0009022815229401607, + "grad_norm": 0.08178848773241043, + "kl": 0.011353591922670603, + "learning_rate": 2.7008310249307477e-06, + "loss": 0.0089, + "reward": 0.1937500163912773, + "reward_std": 0.1958785615861416, + "rewards/countdown_reward_func": 0.1937500163912773, + "step": 325, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0006667007837677374, + "epoch": 0.0009050577737799765, + "grad_norm": 0.08745010197162628, + "kl": 0.015723693184554577, + "learning_rate": 2.709141274238227e-06, + "loss": 0.0086, + "step": 326 + }, + { + "clip_ratio": 0.00018234216258861125, + "epoch": 0.0009078340246197925, + "grad_norm": 0.08209048956632614, + "kl": 0.013638208620250225, + "learning_rate": 2.7174515235457063e-06, + "loss": 0.0081, + "step": 327 + }, + { + "clip_ratio": 8.741259080125019e-05, + "epoch": 0.0009106102754596083, + "grad_norm": 0.07701068371534348, + "kl": 0.014465087559074163, + "learning_rate": 2.7257617728531854e-06, + "loss": 0.0088, + "step": 328 + }, + { + "clip_ratio": 0.00025351856311317533, + "epoch": 0.0009133865262994242, + "grad_norm": 0.08792005479335785, + "kl": 0.014732198789715767, + "learning_rate": 2.734072022160665e-06, + "loss": 0.0089, + "step": 329 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009161627771392401, + "grad_norm": 0.07523926347494125, + "kl": 0.014703777618706226, + "learning_rate": 2.742382271468144e-06, + "loss": 0.0082, + "step": 330 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009189390279790559, + "grad_norm": 0.08062107861042023, + "kl": 0.013506517745554447, + "learning_rate": 2.7506925207756236e-06, + "loss": 0.0086, + "step": 331 + }, + { + "clip_ratio": 0.0006983240018598735, + "epoch": 0.0009217152788188719, + "grad_norm": 0.07194234430789948, + "kl": 0.019093542359769344, + "learning_rate": 2.7590027700831027e-06, + "loss": 0.0082, + "step": 332 + }, + { + "clip_ratio": 0.00025065265799639747, + "epoch": 0.0009244915296586877, + "grad_norm": 0.08615561574697495, + "kl": 0.015138544142246246, + "learning_rate": 2.7673130193905818e-06, + "loss": 0.0077, + "step": 333 + }, + { + "clip_ratio": 8.741259080125019e-05, + "epoch": 0.0009272677804985036, + "grad_norm": 0.09492068737745285, + "kl": 0.01689472934231162, + "learning_rate": 2.7756232686980613e-06, + "loss": 0.0082, + "step": 334 + }, + { + "clip_ratio": 0.0004922889638692141, + "epoch": 0.0009300440313383195, + "grad_norm": 0.10922621935606003, + "kl": 0.017271476797759533, + "learning_rate": 2.7839335180055404e-06, + "loss": 0.007, + "step": 335 + }, + { + "clip_ratio": 0.00010048231342807412, + "epoch": 0.0009328202821781354, + "grad_norm": 0.0761309564113617, + "kl": 0.01699853641912341, + "learning_rate": 2.7922437673130195e-06, + "loss": 0.0081, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.08334350585938, + "epoch": 0.0009355965330179512, + "grad_norm": 0.05785919353365898, + "kl": 0.01666484959423542, + "learning_rate": 2.8005540166204986e-06, + "loss": 0.01, + "reward": 0.15416667610406876, + "reward_std": 0.1091257855296135, + "rewards/countdown_reward_func": 0.15416667610406876, + "step": 337, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00018317173817194998, + "epoch": 0.0009383727838577671, + "grad_norm": 0.05232284963130951, + "kl": 0.01712951622903347, + "learning_rate": 2.808864265927978e-06, + "loss": 0.0099, + "step": 338 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000941149034697583, + "grad_norm": 0.09636948257684708, + "kl": 0.019584951922297478, + "learning_rate": 2.8171745152354573e-06, + "loss": 0.0101, + "step": 339 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009439252855373989, + "grad_norm": 0.06418822705745697, + "kl": 0.0171346515417099, + "learning_rate": 2.8254847645429364e-06, + "loss": 0.0094, + "step": 340 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.0009467015363772148, + "grad_norm": 0.05186130478978157, + "kl": 0.018314712680876255, + "learning_rate": 2.8337950138504155e-06, + "loss": 0.0096, + "step": 341 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009494777872170306, + "grad_norm": 0.05359478294849396, + "kl": 0.022239549085497856, + "learning_rate": 2.8421052631578946e-06, + "loss": 0.0094, + "step": 342 + }, + { + "clip_ratio": 8.60289073898457e-05, + "epoch": 0.0009522540380568466, + "grad_norm": 0.05925218388438225, + "kl": 0.020320788025856018, + "learning_rate": 2.850415512465374e-06, + "loss": 0.0096, + "step": 343 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0009550302888966624, + "grad_norm": 0.05631886422634125, + "kl": 0.02036405447870493, + "learning_rate": 2.858725761772853e-06, + "loss": 0.0094, + "step": 344 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0009578065397364782, + "grad_norm": 0.1005195826292038, + "kl": 0.023968802765011787, + "learning_rate": 2.8670360110803323e-06, + "loss": 0.0093, + "step": 345 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009605827905762942, + "grad_norm": 0.06793226301670074, + "kl": 0.02041178196668625, + "learning_rate": 2.8753462603878114e-06, + "loss": 0.0087, + "step": 346 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00096335904141611, + "grad_norm": 0.048542365431785583, + "kl": 0.021388364024460316, + "learning_rate": 2.883656509695291e-06, + "loss": 0.0093, + "step": 347 + }, + { + "clip_ratio": 0.0002645519489306025, + "epoch": 0.0009661352922559259, + "grad_norm": 0.08362864702939987, + "kl": 0.027280261740088463, + "learning_rate": 2.89196675900277e-06, + "loss": 0.0093, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.02084350585938, + "epoch": 0.0009689115430957418, + "grad_norm": 0.0892510712146759, + "kl": 0.02360483817756176, + "learning_rate": 2.900277008310249e-06, + "loss": -0.0109, + "reward": 0.16875001043081284, + "reward_std": 0.1314988099038601, + "rewards/countdown_reward_func": 0.16875001043081284, + "step": 349, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009716877939355576, + "grad_norm": 0.09001202881336212, + "kl": 0.023638173937797546, + "learning_rate": 2.9085872576177287e-06, + "loss": -0.0107, + "step": 350 + }, + { + "clip_ratio": 0.0001959434594027698, + "epoch": 0.0009744640447753736, + "grad_norm": 0.113184355199337, + "kl": 0.02315935306251049, + "learning_rate": 2.9168975069252078e-06, + "loss": -0.0112, + "step": 351 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009772402956151893, + "grad_norm": 0.07550473511219025, + "kl": 0.02309281285852194, + "learning_rate": 2.9252077562326873e-06, + "loss": -0.0108, + "step": 352 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009800165464550054, + "grad_norm": 0.11282236129045486, + "kl": 0.024476034566760063, + "learning_rate": 2.9335180055401664e-06, + "loss": -0.0112, + "step": 353 + }, + { + "clip_ratio": 0.00018355359497945756, + "epoch": 0.0009827927972948212, + "grad_norm": 0.11352071166038513, + "kl": 0.02151069976389408, + "learning_rate": 2.9418282548476455e-06, + "loss": -0.0114, + "step": 354 + }, + { + "clip_ratio": 0.0, + "epoch": 0.000985569048134637, + "grad_norm": 0.08133033663034439, + "kl": 0.021808774210512638, + "learning_rate": 2.950138504155125e-06, + "loss": -0.0114, + "step": 355 + }, + { + "clip_ratio": 0.00010416666918899864, + "epoch": 0.000988345298974453, + "grad_norm": 0.08895107358694077, + "kl": 0.02102847583591938, + "learning_rate": 2.958448753462604e-06, + "loss": -0.0113, + "step": 356 + }, + { + "clip_ratio": 9.177679748972878e-05, + "epoch": 0.0009911215498142687, + "grad_norm": 0.11091752350330353, + "kl": 0.01951050851494074, + "learning_rate": 2.9667590027700832e-06, + "loss": -0.0118, + "step": 357 + }, + { + "clip_ratio": 0.00028880476020276546, + "epoch": 0.0009938978006540848, + "grad_norm": 0.08083935081958771, + "kl": 0.019494274631142616, + "learning_rate": 2.9750692520775623e-06, + "loss": -0.0118, + "step": 358 + }, + { + "clip_ratio": 0.00018355359497945756, + "epoch": 0.0009966740514939006, + "grad_norm": 0.09999022632837296, + "kl": 0.019018066115677357, + "learning_rate": 2.983379501385042e-06, + "loss": -0.012, + "step": 359 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0009994503023337165, + "grad_norm": 0.09490374475717545, + "kl": 0.01723548863083124, + "learning_rate": 2.991689750692521e-06, + "loss": -0.013, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.5625, + "epoch": 0.0010022265531735323, + "grad_norm": 0.09549269825220108, + "kl": 0.019508808851242065, + "learning_rate": 3e-06, + "loss": 0.0015, + "reward": 0.2250000238418579, + "reward_std": 0.23222807794809341, + "rewards/countdown_reward_func": 0.22500000894069672, + "step": 361, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00017572575598023832, + "epoch": 0.0010050028040133482, + "grad_norm": 0.10312195122241974, + "kl": 0.01787219289690256, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 362 + }, + { + "clip_ratio": 0.0007824726053513587, + "epoch": 0.001007779054853164, + "grad_norm": 0.12467264384031296, + "kl": 0.017863546032458544, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 363 + }, + { + "clip_ratio": 8.37240440887399e-05, + "epoch": 0.00101055530569298, + "grad_norm": 0.2627834379673004, + "kl": 0.01767959538847208, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 364 + }, + { + "clip_ratio": 9.038322605192661e-05, + "epoch": 0.0010133315565327959, + "grad_norm": 0.0790000930428505, + "kl": 0.01718911249190569, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 365 + }, + { + "clip_ratio": 0.0001808318484108895, + "epoch": 0.0010161078073726117, + "grad_norm": 0.07705968618392944, + "kl": 0.01755279116332531, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 366 + }, + { + "clip_ratio": 0.0001741072628647089, + "epoch": 0.0010188840582124276, + "grad_norm": 0.10953141748905182, + "kl": 0.01750816684216261, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 367 + }, + { + "clip_ratio": 0.0004197447851765901, + "epoch": 0.0010216603090522434, + "grad_norm": 0.2015552967786789, + "kl": 0.017061928287148476, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 368 + }, + { + "clip_ratio": 0.000489045400172472, + "epoch": 0.0010244365598920595, + "grad_norm": 0.1368420273065567, + "kl": 0.017408848274499178, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 369 + }, + { + "clip_ratio": 0.00018153311975765973, + "epoch": 0.0010272128107318753, + "grad_norm": 0.09191857278347015, + "kl": 0.016698247753083706, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 370 + }, + { + "clip_ratio": 8.394895849050954e-05, + "epoch": 0.0010299890615716911, + "grad_norm": 0.07735387235879898, + "kl": 0.017519176937639713, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 371 + }, + { + "clip_ratio": 0.0002578312996774912, + "epoch": 0.001032765312411507, + "grad_norm": 0.07390379160642624, + "kl": 0.018353909254074097, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 372 + }, + { + "clip_ratio": 8.196721319109201e-05, + "completion_length": 228.68750762939453, + "epoch": 0.0010355415632513228, + "grad_norm": 0.10989905893802643, + "kl": 0.017692445777356625, + "learning_rate": 3e-06, + "loss": -0.0103, + "reward": 0.2500000223517418, + "reward_std": 0.2080453634262085, + "rewards/countdown_reward_func": 0.2500000223517418, + "step": 373, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00010279605339746922, + "epoch": 0.0010383178140911387, + "grad_norm": 0.15112467110157013, + "kl": 0.03264212794601917, + "learning_rate": 3e-06, + "loss": -0.0084, + "step": 374 + }, + { + "clip_ratio": 0.0011789826894528233, + "epoch": 0.0010410940649309547, + "grad_norm": 0.1216062679886818, + "kl": 0.021678635850548744, + "learning_rate": 3e-06, + "loss": -0.0091, + "step": 375 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.0010438703157707706, + "grad_norm": 0.08477499336004257, + "kl": 0.016693778336048126, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 376 + }, + { + "clip_ratio": 0.0002832566970027983, + "epoch": 0.0010466465666105864, + "grad_norm": 0.10167127847671509, + "kl": 0.017779462039470673, + "learning_rate": 3e-06, + "loss": -0.0093, + "step": 377 + }, + { + "clip_ratio": 9.184423106489703e-05, + "epoch": 0.0010494228174504022, + "grad_norm": 0.09984228760004044, + "kl": 0.017343958839774132, + "learning_rate": 3e-06, + "loss": -0.0096, + "step": 378 + }, + { + "clip_ratio": 0.00017322444182354957, + "epoch": 0.001052199068290218, + "grad_norm": 0.10843722522258759, + "kl": 0.018563530407845974, + "learning_rate": 3e-06, + "loss": -0.0104, + "step": 379 + }, + { + "clip_ratio": 0.0003475236881058663, + "epoch": 0.0010549753191300341, + "grad_norm": 0.137297123670578, + "kl": 0.03769372217357159, + "learning_rate": 3e-06, + "loss": -0.0097, + "step": 380 + }, + { + "clip_ratio": 0.0006742323748767376, + "epoch": 0.00105775156996985, + "grad_norm": 0.13010026514530182, + "kl": 0.024436946026980877, + "learning_rate": 3e-06, + "loss": -0.0105, + "step": 381 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0010605278208096658, + "grad_norm": 0.09109175205230713, + "kl": 0.01653394289314747, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 382 + }, + { + "clip_ratio": 0.0001830161054385826, + "epoch": 0.0010633040716494816, + "grad_norm": 0.08129965513944626, + "kl": 0.017563740722835064, + "learning_rate": 3e-06, + "loss": -0.01, + "step": 383 + }, + { + "clip_ratio": 0.000265068665612489, + "epoch": 0.0010660803224892975, + "grad_norm": 0.09801369905471802, + "kl": 0.01689669769257307, + "learning_rate": 3e-06, + "loss": -0.0115, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.64583587646484, + "epoch": 0.0010688565733291133, + "grad_norm": 0.07832996547222137, + "kl": 0.02487264759838581, + "learning_rate": 3e-06, + "loss": 0.0146, + "reward": 0.19166668504476547, + "reward_std": 0.15476077422499657, + "rewards/countdown_reward_func": 0.19166668504476547, + "step": 385, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 8.716875890968367e-05, + "epoch": 0.0010716328241689294, + "grad_norm": 0.07169502228498459, + "kl": 0.016012447886168957, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 386 + }, + { + "clip_ratio": 9.005763422464952e-05, + "epoch": 0.0010744090750087452, + "grad_norm": 0.08705933392047882, + "kl": 0.016524842474609613, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 387 + }, + { + "clip_ratio": 9.61538462433964e-05, + "epoch": 0.001077185325848561, + "grad_norm": 0.0732106938958168, + "kl": 0.015910383313894272, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 388 + }, + { + "clip_ratio": 0.0002589142677607015, + "epoch": 0.001079961576688377, + "grad_norm": 0.07911605387926102, + "kl": 0.01897307112812996, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 389 + }, + { + "clip_ratio": 0.00017143784498330206, + "epoch": 0.0010827378275281927, + "grad_norm": 0.0664311945438385, + "kl": 0.016379999462515116, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 390 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0010855140783680088, + "grad_norm": 0.08451702445745468, + "kl": 0.0259824451059103, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 391 + }, + { + "clip_ratio": 9.005763422464952e-05, + "epoch": 0.0010882903292078246, + "grad_norm": 0.07148966938257217, + "kl": 0.016325827687978745, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 392 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0010910665800476405, + "grad_norm": 0.09877816587686539, + "kl": 0.01728895679116249, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 393 + }, + { + "clip_ratio": 0.0001923076924867928, + "epoch": 0.0010938428308874563, + "grad_norm": 0.08034543693065643, + "kl": 0.016465777531266212, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 394 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0010966190817272722, + "grad_norm": 0.0796988308429718, + "kl": 0.020429577212780714, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 395 + }, + { + "clip_ratio": 0.0005143135640537366, + "epoch": 0.0010993953325670882, + "grad_norm": 0.06786444783210754, + "kl": 0.01705406652763486, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 396 + }, + { + "clip_ratio": 0.00019317050464451313, + "completion_length": 222.3541717529297, + "epoch": 0.001102171583406904, + "grad_norm": 0.1809883564710617, + "kl": 0.02103044930845499, + "learning_rate": 3e-06, + "loss": 0.0201, + "reward": 0.32500000298023224, + "reward_std": 0.28887902945280075, + "rewards/countdown_reward_func": 0.32499998807907104, + "step": 397, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003668378631118685, + "epoch": 0.0011049478342467199, + "grad_norm": 0.13064327836036682, + "kl": 0.019325342029333115, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 398 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011077240850865357, + "grad_norm": 0.134730726480484, + "kl": 0.018743818625807762, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 399 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011105003359263516, + "grad_norm": 0.11716154217720032, + "kl": 0.03218943625688553, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 400 + }, + { + "clip_ratio": 0.0003666364573291503, + "epoch": 0.0011132765867661674, + "grad_norm": 0.1279788464307785, + "kl": 0.02369430661201477, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 401 + }, + { + "clip_ratio": 0.00018321751849725842, + "epoch": 0.0011160528376059835, + "grad_norm": 0.1514863669872284, + "kl": 0.02853427268564701, + "learning_rate": 3e-06, + "loss": 0.0172, + "step": 402 + }, + { + "clip_ratio": 0.0005473888304550201, + "epoch": 0.0011188290884457993, + "grad_norm": 0.17688550055027008, + "kl": 0.02879752404987812, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 403 + }, + { + "clip_ratio": 0.0003765894507523626, + "epoch": 0.0011216053392856151, + "grad_norm": 0.1264808475971222, + "kl": 0.028363430872559547, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 404 + }, + { + "clip_ratio": 0.0002071251074085012, + "epoch": 0.001124381590125431, + "grad_norm": 0.13313868641853333, + "kl": 0.028233185410499573, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 405 + }, + { + "clip_ratio": 0.00029883457318646833, + "epoch": 0.0011271578409652468, + "grad_norm": 0.11772707849740982, + "kl": 0.0557091049849987, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 406 + }, + { + "clip_ratio": 0.0008593437087256461, + "epoch": 0.0011299340918050629, + "grad_norm": 0.1197652816772461, + "kl": 0.03832645434886217, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 407 + }, + { + "clip_ratio": 0.0008315023733302951, + "epoch": 0.0011327103426448787, + "grad_norm": 0.11358743906021118, + "kl": 0.049091488122940063, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 408 + }, + { + "clip_ratio": 0.00010860121983569115, + "completion_length": 212.20833587646484, + "epoch": 0.0011354865934846946, + "grad_norm": 0.13000725209712982, + "kl": 0.03431596327573061, + "learning_rate": 3e-06, + "loss": -0.0011, + "reward": 0.239583358168602, + "reward_std": 0.3074723482131958, + "rewards/countdown_reward_func": 0.2395833507180214, + "step": 409, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.00010860121983569115, + "epoch": 0.0011382628443245104, + "grad_norm": 0.12621691823005676, + "kl": 0.03901347145438194, + "learning_rate": 3e-06, + "loss": -0.0018, + "step": 410 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011410390951643262, + "grad_norm": 0.15552707016468048, + "kl": 0.05270291492342949, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 411 + }, + { + "clip_ratio": 0.0003161708955303766, + "epoch": 0.001143815346004142, + "grad_norm": 0.1499052792787552, + "kl": 0.04181257076561451, + "learning_rate": 3e-06, + "loss": -0.0025, + "step": 412 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011465915968439581, + "grad_norm": 0.11208264529705048, + "kl": 0.04561302810907364, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 413 + }, + { + "clip_ratio": 8.514986257068813e-05, + "epoch": 0.001149367847683774, + "grad_norm": 0.117032989859581, + "kl": 0.0457566250115633, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 414 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011521440985235898, + "grad_norm": 0.1544887125492096, + "kl": 0.04844648204743862, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 415 + }, + { + "clip_ratio": 0.00010860121983569115, + "epoch": 0.0011549203493634056, + "grad_norm": 0.1259896457195282, + "kl": 0.05372907593846321, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 416 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011576966002032215, + "grad_norm": 0.15457068383693695, + "kl": 0.06770433485507965, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 417 + }, + { + "clip_ratio": 0.00022890920809004456, + "epoch": 0.0011604728510430375, + "grad_norm": 0.1197003573179245, + "kl": 0.0525658018887043, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 418 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011632491018828534, + "grad_norm": 0.10684943944215775, + "kl": 0.053456977009773254, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 419 + }, + { + "clip_ratio": 0.00021355503849918023, + "epoch": 0.0011660253527226692, + "grad_norm": 0.10173308104276657, + "kl": 0.05415925942361355, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.62500762939453, + "epoch": 0.001168801603562485, + "grad_norm": 0.11933105438947678, + "kl": 0.06486517563462257, + "learning_rate": 3e-06, + "loss": 0.0027, + "reward": 0.27916668355464935, + "reward_std": 0.29565654695034027, + "rewards/countdown_reward_func": 0.27916668355464935, + "step": 421, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001171577854402301, + "grad_norm": 0.20322704315185547, + "kl": 0.08543343096971512, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 422 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011743541052421167, + "grad_norm": 0.19525644183158875, + "kl": 0.13222120702266693, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 423 + }, + { + "clip_ratio": 0.0002695980292628519, + "epoch": 0.0011771303560819328, + "grad_norm": 0.13731662929058075, + "kl": 0.0626319907605648, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 424 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011799066069217486, + "grad_norm": 0.13071469962596893, + "kl": 0.05833687447011471, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 425 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011826828577615645, + "grad_norm": 0.23276741802692413, + "kl": 0.06872095167636871, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 426 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0011854591086013803, + "grad_norm": 0.11540886759757996, + "kl": 0.053551726043224335, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 427 + }, + { + "clip_ratio": 0.00016356298874597996, + "epoch": 0.0011882353594411962, + "grad_norm": 0.19618336856365204, + "kl": 0.06640568189322948, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 428 + }, + { + "clip_ratio": 8.218277798732743e-05, + "epoch": 0.0011910116102810122, + "grad_norm": 0.18861764669418335, + "kl": 0.1051701121032238, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 429 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.001193787861120828, + "grad_norm": 0.14318421483039856, + "kl": 0.04604991525411606, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 430 + }, + { + "clip_ratio": 0.0003451758166193031, + "epoch": 0.0011965641119606439, + "grad_norm": 0.14576469361782074, + "kl": 0.04200077801942825, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 431 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0011993403628004597, + "grad_norm": 0.2222023755311966, + "kl": 0.04660602658987045, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.81250762939453, + "epoch": 0.0012021166136402756, + "grad_norm": 0.08921542763710022, + "kl": 0.04309102147817612, + "learning_rate": 3e-06, + "loss": -0.0024, + "reward": 0.15208333730697632, + "reward_std": 0.11422888934612274, + "rewards/countdown_reward_func": 0.15208332985639572, + "step": 433, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0012048928644800914, + "grad_norm": 0.1347406804561615, + "kl": 0.04063236340880394, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 434 + }, + { + "clip_ratio": 0.0002881626787711866, + "epoch": 0.0012076691153199075, + "grad_norm": 0.11881622672080994, + "kl": 0.0401871744543314, + "learning_rate": 3e-06, + "loss": -0.0035, + "step": 435 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0012104453661597233, + "grad_norm": 0.06465653330087662, + "kl": 0.03999011963605881, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 436 + }, + { + "clip_ratio": 9.13075273274444e-05, + "epoch": 0.0012132216169995391, + "grad_norm": 0.07897289842367172, + "kl": 0.03400629200041294, + "learning_rate": 3e-06, + "loss": -0.0031, + "step": 437 + }, + { + "clip_ratio": 0.00045705895172432065, + "epoch": 0.001215997867839355, + "grad_norm": 0.07360372692346573, + "kl": 0.03901753947138786, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 438 + }, + { + "clip_ratio": 0.000265190057689324, + "epoch": 0.0012187741186791708, + "grad_norm": 0.07743567228317261, + "kl": 0.030041148886084557, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 439 + }, + { + "clip_ratio": 0.000703962694387883, + "epoch": 0.0012215503695189869, + "grad_norm": 0.13309620320796967, + "kl": 0.029125919565558434, + "learning_rate": 3e-06, + "loss": -0.0056, + "step": 440 + }, + { + "clip_ratio": 0.0005505228764377534, + "epoch": 0.0012243266203588027, + "grad_norm": 0.0934651717543602, + "kl": 0.028855517506599426, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 441 + }, + { + "clip_ratio": 0.00017098593525588512, + "epoch": 0.0012271028711986186, + "grad_norm": 0.060086771845817566, + "kl": 0.028184207156300545, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 442 + }, + { + "clip_ratio": 0.002318943908903748, + "epoch": 0.0012298791220384344, + "grad_norm": 0.07434255629777908, + "kl": 0.024150204844772816, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 443 + }, + { + "clip_ratio": 0.0017171713116113096, + "epoch": 0.0012326553728782502, + "grad_norm": 0.06684679538011551, + "kl": 0.028608722612261772, + "learning_rate": 3e-06, + "loss": -0.0045, + "step": 444 + }, + { + "clip_ratio": 0.00026747502852231264, + "completion_length": 234.14584350585938, + "epoch": 0.001235431623718066, + "grad_norm": 0.07466662675142288, + "kl": 0.024345185607671738, + "learning_rate": 3e-06, + "loss": 0.0042, + "reward": 0.16875001043081284, + "reward_std": 0.1619665026664734, + "rewards/countdown_reward_func": 0.16875001043081284, + "step": 445, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0012382078745578821, + "grad_norm": 0.0632314682006836, + "kl": 0.02845650538802147, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 446 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001240984125397698, + "grad_norm": 0.064049132168293, + "kl": 0.0239101629704237, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 447 + }, + { + "clip_ratio": 0.0002663619234226644, + "epoch": 0.0012437603762375138, + "grad_norm": 0.11490896344184875, + "kl": 0.02280531730502844, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 448 + }, + { + "clip_ratio": 0.00027501455042511225, + "epoch": 0.0012465366270773296, + "grad_norm": 0.0676225870847702, + "kl": 0.02224800456315279, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 449 + }, + { + "clip_ratio": 0.000460914452560246, + "epoch": 0.0012493128779171455, + "grad_norm": 0.07294591516256332, + "kl": 0.023516996763646603, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 450 + }, + { + "clip_ratio": 0.00017831669538281858, + "epoch": 0.0012520891287569615, + "grad_norm": 0.07453124225139618, + "kl": 0.021801339462399483, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 451 + }, + { + "clip_ratio": 0.0003460512016317807, + "epoch": 0.0012548653795967774, + "grad_norm": 0.06052190437912941, + "kl": 0.026570623740553856, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 452 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0012576416304365932, + "grad_norm": 0.06681171804666519, + "kl": 0.022533809766173363, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 453 + }, + { + "clip_ratio": 0.00017356310854665935, + "epoch": 0.001260417881276409, + "grad_norm": 0.1189865693449974, + "kl": 0.02223300840705633, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 454 + }, + { + "clip_ratio": 0.00018221575010102242, + "epoch": 0.001263194132116225, + "grad_norm": 0.06577113270759583, + "kl": 0.021816913969814777, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 455 + }, + { + "clip_ratio": 0.0005418355867732316, + "epoch": 0.0012659703829560407, + "grad_norm": 0.13180898129940033, + "kl": 0.0227721044793725, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 456 + }, + { + "clip_ratio": 0.00020764119108207524, + "completion_length": 221.70833587646484, + "epoch": 0.0012687466337958568, + "grad_norm": 0.109416663646698, + "kl": 0.022198159247636795, + "learning_rate": 3e-06, + "loss": 0.0299, + "reward": 0.32500001788139343, + "reward_std": 0.28183095902204514, + "rewards/countdown_reward_func": 0.32500001788139343, + "step": 457, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0012715228846356726, + "grad_norm": 0.12022383511066437, + "kl": 0.02293353620916605, + "learning_rate": 3e-06, + "loss": 0.0297, + "step": 458 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0012742991354754885, + "grad_norm": 0.10432354360818863, + "kl": 0.024400770664215088, + "learning_rate": 3e-06, + "loss": 0.0295, + "step": 459 + }, + { + "clip_ratio": 9.13075273274444e-05, + "epoch": 0.0012770753863153043, + "grad_norm": 0.10826432704925537, + "kl": 0.022371195256710052, + "learning_rate": 3e-06, + "loss": 0.0295, + "step": 460 + }, + { + "clip_ratio": 0.00018136516155209392, + "epoch": 0.0012798516371551201, + "grad_norm": 0.1814108043909073, + "kl": 0.02964179776608944, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 461 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0012826278879949362, + "grad_norm": 0.1098814606666565, + "kl": 0.02524806559085846, + "learning_rate": 3e-06, + "loss": 0.0288, + "step": 462 + }, + { + "clip_ratio": 0.00020764119108207524, + "epoch": 0.001285404138834752, + "grad_norm": 0.10370974242687225, + "kl": 0.023861460387706757, + "learning_rate": 3e-06, + "loss": 0.0284, + "step": 463 + }, + { + "clip_ratio": 0.00010382059554103762, + "epoch": 0.0012881803896745679, + "grad_norm": 0.1190827265381813, + "kl": 0.024492272175848484, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 464 + }, + { + "clip_ratio": 0.00046262028627097607, + "epoch": 0.0012909566405143837, + "grad_norm": 0.10446801036596298, + "kl": 0.027533382177352905, + "learning_rate": 3e-06, + "loss": 0.0282, + "step": 465 + }, + { + "clip_ratio": 0.0005552970251301304, + "epoch": 0.0012937328913541996, + "grad_norm": 0.10887821763753891, + "kl": 0.024930739775300026, + "learning_rate": 3e-06, + "loss": 0.0271, + "step": 466 + }, + { + "clip_ratio": 0.0009011498477775604, + "epoch": 0.0012965091421940156, + "grad_norm": 0.15684081614017487, + "kl": 0.03675767965614796, + "learning_rate": 3e-06, + "loss": 0.026, + "step": 467 + }, + { + "clip_ratio": 0.0004027693357784301, + "epoch": 0.0012992853930338315, + "grad_norm": 0.11377524584531784, + "kl": 0.03045613970607519, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.1875, + "epoch": 0.0013020616438736473, + "grad_norm": 0.10519543290138245, + "kl": 0.03355495072901249, + "learning_rate": 3e-06, + "loss": 0.0046, + "reward": 0.20625001192092896, + "reward_std": 0.2200612723827362, + "rewards/countdown_reward_func": 0.20625001192092896, + "step": 469, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013048378947134631, + "grad_norm": 0.09295057505369186, + "kl": 0.03722470626235008, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 470 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001307614145553279, + "grad_norm": 0.07510315626859665, + "kl": 0.035189252346754074, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 471 + }, + { + "clip_ratio": 0.000260960339801386, + "epoch": 0.0013103903963930948, + "grad_norm": 0.1088418960571289, + "kl": 0.03208626061677933, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 472 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013131666472329109, + "grad_norm": 0.08343033492565155, + "kl": 0.03320677764713764, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 473 + }, + { + "clip_ratio": 0.0002764667078736238, + "epoch": 0.0013159428980727267, + "grad_norm": 0.11299256235361099, + "kl": 0.0330524817109108, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 474 + }, + { + "clip_ratio": 0.00027988169313175604, + "epoch": 0.0013187191489125426, + "grad_norm": 0.10915958881378174, + "kl": 0.04382970742881298, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 475 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013214953997523584, + "grad_norm": 0.08877313882112503, + "kl": 0.04925047419965267, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 476 + }, + { + "clip_ratio": 0.00017536517407279462, + "epoch": 0.0013242716505921742, + "grad_norm": 0.08160615712404251, + "kl": 0.0415896400809288, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 477 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013270479014319903, + "grad_norm": 0.14993789792060852, + "kl": 0.036379581317305565, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 478 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013298241522718061, + "grad_norm": 0.08193667978048325, + "kl": 0.03808531537652016, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 479 + }, + { + "clip_ratio": 0.0002638619553181343, + "epoch": 0.001332600403111622, + "grad_norm": 0.10145576298236847, + "kl": 0.0362465288490057, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.0625, + "epoch": 0.0013353766539514378, + "grad_norm": 0.11346805095672607, + "kl": 0.038885802030563354, + "learning_rate": 3e-06, + "loss": 0.0224, + "reward": 0.19166667014360428, + "reward_std": 0.20098165795207024, + "rewards/countdown_reward_func": 0.19166667014360428, + "step": 481, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013381529047912536, + "grad_norm": 0.0885414257645607, + "kl": 0.03647511638700962, + "learning_rate": 3e-06, + "loss": 0.0223, + "step": 482 + }, + { + "clip_ratio": 0.0002071251074085012, + "epoch": 0.0013409291556310695, + "grad_norm": 0.09309430420398712, + "kl": 0.03728037141263485, + "learning_rate": 3e-06, + "loss": 0.0219, + "step": 483 + }, + { + "clip_ratio": 0.00018494276446290314, + "epoch": 0.0013437054064708855, + "grad_norm": 0.08568699657917023, + "kl": 0.053651321679353714, + "learning_rate": 3e-06, + "loss": 0.0231, + "step": 484 + }, + { + "clip_ratio": 0.00026833474839804694, + "epoch": 0.0013464816573107014, + "grad_norm": 0.12132450193166733, + "kl": 0.04722470976412296, + "learning_rate": 3e-06, + "loss": 0.0222, + "step": 485 + }, + { + "clip_ratio": 8.532423089491203e-05, + "epoch": 0.0013492579081505172, + "grad_norm": 0.09739983081817627, + "kl": 0.03846907988190651, + "learning_rate": 3e-06, + "loss": 0.0218, + "step": 486 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.001352034158990333, + "grad_norm": 0.1051330417394638, + "kl": 0.04565967619419098, + "learning_rate": 3e-06, + "loss": 0.0213, + "step": 487 + }, + { + "clip_ratio": 0.0003686312338686548, + "epoch": 0.001354810409830149, + "grad_norm": 0.08285031467676163, + "kl": 0.04506035894155502, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 488 + }, + { + "clip_ratio": 0.0008285004296340048, + "epoch": 0.001357586660669965, + "grad_norm": 0.0878533124923706, + "kl": 0.04655962623655796, + "learning_rate": 3e-06, + "loss": 0.0206, + "step": 489 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013603629115097808, + "grad_norm": 0.07896918803453445, + "kl": 0.06781511753797531, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 490 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013631391623495966, + "grad_norm": 0.10892756283283234, + "kl": 0.06482304260134697, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 491 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0013659154131894125, + "grad_norm": 0.09697525948286057, + "kl": 0.052130842581391335, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.4791717529297, + "epoch": 0.0013686916640292283, + "grad_norm": 0.12033633142709732, + "kl": 0.05308363772928715, + "learning_rate": 3e-06, + "loss": -0.0028, + "reward": 0.21041666716337204, + "reward_std": 0.17074457183480263, + "rewards/countdown_reward_func": 0.21041666716337204, + "step": 493, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0003349836479173973, + "epoch": 0.0013714679148690441, + "grad_norm": 0.09749346226453781, + "kl": 0.05684215575456619, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 494 + }, + { + "clip_ratio": 9.025270992424339e-05, + "epoch": 0.0013742441657088602, + "grad_norm": 0.08872581273317337, + "kl": 0.05968480557203293, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 495 + }, + { + "clip_ratio": 0.00028852681134594604, + "epoch": 0.001377020416548676, + "grad_norm": 0.11848699301481247, + "kl": 0.06252800300717354, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 496 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013797966673884919, + "grad_norm": 0.24924242496490479, + "kl": 0.06530779227614403, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 497 + }, + { + "clip_ratio": 0.0003621914656832814, + "epoch": 0.0013825729182283077, + "grad_norm": 0.11978671699762344, + "kl": 0.06549267843365669, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 498 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013853491690681236, + "grad_norm": 0.08640985190868378, + "kl": 0.06214660406112671, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 499 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013881254199079396, + "grad_norm": 0.10044127702713013, + "kl": 0.061088208109140396, + "learning_rate": 3e-06, + "loss": -0.0035, + "step": 500 + }, + { + "clip_ratio": 9.025270992424339e-05, + "epoch": 0.0013909016707477555, + "grad_norm": 0.09724526852369308, + "kl": 0.06105843186378479, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 501 + }, + { + "clip_ratio": 0.00047674778033979237, + "epoch": 0.0013936779215875713, + "grad_norm": 0.1106707900762558, + "kl": 0.05963746830821037, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 502 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0013964541724273871, + "grad_norm": 0.28701546788215637, + "kl": 0.058546338230371475, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 503 + }, + { + "clip_ratio": 0.0002860495515051298, + "epoch": 0.001399230423267203, + "grad_norm": 0.11430586874485016, + "kl": 0.05745413526892662, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 504 + }, + { + "clip_ratio": 0.0004231043276377022, + "completion_length": 207.95833587646484, + "epoch": 0.0014020066741070188, + "grad_norm": 0.1265731304883957, + "kl": 0.06725966557860374, + "learning_rate": 3e-06, + "loss": -0.0148, + "reward": 0.10833334177732468, + "reward_std": 0.0599165465682745, + "rewards/countdown_reward_func": 0.10833334177732468, + "step": 505, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.0001892800792120397, + "epoch": 0.0014047829249468349, + "grad_norm": 0.07790236175060272, + "kl": 0.06213269755244255, + "learning_rate": 3e-06, + "loss": -0.0152, + "step": 506 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0014075591757866507, + "grad_norm": 0.10863670706748962, + "kl": 0.05851333029568195, + "learning_rate": 3e-06, + "loss": -0.0152, + "step": 507 + }, + { + "clip_ratio": 0.00011870844900840893, + "epoch": 0.0014103354266264666, + "grad_norm": 0.11995543539524078, + "kl": 0.0762905403971672, + "learning_rate": 3e-06, + "loss": -0.0152, + "step": 508 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0014131116774662824, + "grad_norm": 0.10893931984901428, + "kl": 0.07419506646692753, + "learning_rate": 3e-06, + "loss": -0.0151, + "step": 509 + }, + { + "clip_ratio": 0.0002796600092551671, + "epoch": 0.0014158879283060982, + "grad_norm": 0.11752152442932129, + "kl": 0.05459407716989517, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 510 + }, + { + "clip_ratio": 0.00028186137205921113, + "epoch": 0.0014186641791459143, + "grad_norm": 0.10197150707244873, + "kl": 0.0478304848074913, + "learning_rate": 3e-06, + "loss": -0.0171, + "step": 511 + }, + { + "clip_ratio": 0.0010716330725699663, + "epoch": 0.0014214404299857301, + "grad_norm": 0.06922204792499542, + "kl": 0.042900703847408295, + "learning_rate": 3e-06, + "loss": -0.0171, + "step": 512 + }, + { + "clip_ratio": 0.00039922940777614713, + "epoch": 0.001424216680825546, + "grad_norm": 0.09703180193901062, + "kl": 0.03880976140499115, + "learning_rate": 3e-06, + "loss": -0.0185, + "step": 513 + }, + { + "clip_ratio": 0.0022654032800346613, + "epoch": 0.0014269929316653618, + "grad_norm": 0.10023833066225052, + "kl": 0.04965279810130596, + "learning_rate": 3e-06, + "loss": -0.0183, + "step": 514 + }, + { + "clip_ratio": 0.0015507703792536631, + "epoch": 0.0014297691825051776, + "grad_norm": 0.0990394875407219, + "kl": 0.04965635947883129, + "learning_rate": 3e-06, + "loss": -0.0188, + "step": 515 + }, + { + "clip_ratio": 0.0020738598541356623, + "epoch": 0.0014325454333449935, + "grad_norm": 0.1003248319029808, + "kl": 0.03512744698673487, + "learning_rate": 3e-06, + "loss": -0.0204, + "step": 516 + }, + { + "clip_ratio": 0.00010129659494850785, + "completion_length": 233.4166717529297, + "epoch": 0.0014353216841848095, + "grad_norm": 0.09848001599311829, + "kl": 0.026790697127580643, + "learning_rate": 3e-06, + "loss": -0.0064, + "reward": 0.2250000238418579, + "reward_std": 0.2915927767753601, + "rewards/countdown_reward_func": 0.22500000894069672, + "step": 517, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0002025931898970157, + "epoch": 0.0014380979350246254, + "grad_norm": 0.10145186632871628, + "kl": 0.024240675382316113, + "learning_rate": 3e-06, + "loss": -0.0064, + "step": 518 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0014408741858644412, + "grad_norm": 0.09955999255180359, + "kl": 0.02282124198973179, + "learning_rate": 3e-06, + "loss": -0.0068, + "step": 519 + }, + { + "clip_ratio": 0.0001803751802071929, + "epoch": 0.001443650436704257, + "grad_norm": 0.11280816793441772, + "kl": 0.024487541057169437, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 520 + }, + { + "clip_ratio": 0.0003646711993496865, + "epoch": 0.001446426687544073, + "grad_norm": 0.10844244807958603, + "kl": 0.025233074091374874, + "learning_rate": 3e-06, + "loss": -0.0072, + "step": 521 + }, + { + "clip_ratio": 8.722958591533825e-05, + "epoch": 0.001449202938383889, + "grad_norm": 0.16373442113399506, + "kl": 0.024108163081109524, + "learning_rate": 3e-06, + "loss": -0.0069, + "step": 522 + }, + { + "clip_ratio": 0.0006587411771761253, + "epoch": 0.0014519791892237048, + "grad_norm": 0.09821134060621262, + "kl": 0.022494456730782986, + "learning_rate": 3e-06, + "loss": -0.0072, + "step": 523 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0014547554400635206, + "grad_norm": 0.1120600625872612, + "kl": 0.021051418967545033, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 524 + }, + { + "clip_ratio": 0.00027575576677918434, + "epoch": 0.0014575316909033365, + "grad_norm": 0.09990929067134857, + "kl": 0.020683609880506992, + "learning_rate": 3e-06, + "loss": -0.0081, + "step": 525 + }, + { + "clip_ratio": 0.0008699278841959313, + "epoch": 0.0014603079417431523, + "grad_norm": 0.1135360524058342, + "kl": 0.02212886419147253, + "learning_rate": 3e-06, + "loss": -0.0077, + "step": 526 + }, + { + "clip_ratio": 0.0009819942351896316, + "epoch": 0.0014630841925829681, + "grad_norm": 0.09932764619588852, + "kl": 0.02397053875029087, + "learning_rate": 3e-06, + "loss": -0.008, + "step": 527 + }, + { + "clip_ratio": 0.0010553163010627031, + "epoch": 0.0014658604434227842, + "grad_norm": 0.17817182838916779, + "kl": 0.02317346353083849, + "learning_rate": 3e-06, + "loss": -0.008, + "step": 528 + }, + { + "clip_ratio": 0.00024488919734722003, + "completion_length": 238.56250762939453, + "epoch": 0.0014686366942626, + "grad_norm": 0.06560882925987244, + "kl": 0.025266059674322605, + "learning_rate": 3e-06, + "loss": -0.0058, + "reward": 0.19166668504476547, + "reward_std": 0.15476077422499657, + "rewards/countdown_reward_func": 0.19166668504476547, + "step": 529, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0014714129451024159, + "grad_norm": 0.06739655882120132, + "kl": 0.02375541441142559, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 530 + }, + { + "clip_ratio": 0.00017831669538281858, + "epoch": 0.0014741891959422317, + "grad_norm": 0.10260723531246185, + "kl": 0.028817658312618732, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 531 + }, + { + "clip_ratio": 0.00017053855845006183, + "epoch": 0.0014769654467820476, + "grad_norm": 0.0644664391875267, + "kl": 0.028914256952703, + "learning_rate": 3e-06, + "loss": -0.0053, + "step": 532 + }, + { + "clip_ratio": 9.03179170563817e-05, + "epoch": 0.0014797416976218636, + "grad_norm": 0.11235247552394867, + "kl": 0.024057154543697834, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 533 + }, + { + "clip_ratio": 0.0001695947503321804, + "epoch": 0.0014825179484616795, + "grad_norm": 0.20861512422561646, + "kl": 0.027661575004458427, + "learning_rate": 3e-06, + "loss": -0.0074, + "step": 534 + }, + { + "clip_ratio": 0.00025097496109083295, + "epoch": 0.0014852941993014953, + "grad_norm": 0.06489825993776321, + "kl": 0.026025223545730114, + "learning_rate": 3e-06, + "loss": -0.0062, + "step": 535 + }, + { + "clip_ratio": 0.0007896195165812969, + "epoch": 0.0014880704501413111, + "grad_norm": 0.07131274044513702, + "kl": 0.025113885290920734, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 536 + }, + { + "clip_ratio": 0.000509865116328001, + "epoch": 0.001490846700981127, + "grad_norm": 0.10190945118665695, + "kl": 0.028591866604983807, + "learning_rate": 3e-06, + "loss": -0.0073, + "step": 537 + }, + { + "clip_ratio": 0.0011356433387845755, + "epoch": 0.001493622951820943, + "grad_norm": 0.07074970752000809, + "kl": 0.027634769678115845, + "learning_rate": 3e-06, + "loss": -0.0058, + "step": 538 + }, + { + "clip_ratio": 0.000357331897248514, + "epoch": 0.0014963992026607589, + "grad_norm": 0.09467112272977829, + "kl": 0.02522993925958872, + "learning_rate": 3e-06, + "loss": -0.0074, + "step": 539 + }, + { + "clip_ratio": 0.0012593485007528216, + "epoch": 0.0014991754535005747, + "grad_norm": 0.22211527824401855, + "kl": 0.027548625133931637, + "learning_rate": 3e-06, + "loss": -0.0112, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.39584350585938, + "epoch": 0.0015019517043403905, + "grad_norm": 0.07119705528020859, + "kl": 0.02431309036910534, + "learning_rate": 3e-06, + "loss": 0.0065, + "reward": 0.30000002682209015, + "reward_std": 0.22795327007770538, + "rewards/countdown_reward_func": 0.30000001192092896, + "step": 541, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00016297261754516512, + "epoch": 0.0015047279551802064, + "grad_norm": 0.09539937227964401, + "kl": 0.025972411036491394, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 542 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015075042060200222, + "grad_norm": 0.1664063185453415, + "kl": 0.024256471544504166, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 543 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015102804568598383, + "grad_norm": 0.07563801109790802, + "kl": 0.02507698815315962, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 544 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015130567076996541, + "grad_norm": 0.07607442140579224, + "kl": 0.02424045093357563, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 545 + }, + { + "clip_ratio": 0.0006382791907526553, + "epoch": 0.00151583295853947, + "grad_norm": 0.15590515732765198, + "kl": 0.02763733733445406, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 546 + }, + { + "clip_ratio": 0.0003597766626626253, + "epoch": 0.0015186092093792858, + "grad_norm": 0.07136835902929306, + "kl": 0.025670908391475677, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 547 + }, + { + "clip_ratio": 0.0004282992740627378, + "epoch": 0.0015213854602191016, + "grad_norm": 0.08294139057397842, + "kl": 0.027309386059641838, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 548 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015241617110589177, + "grad_norm": 0.1601613610982895, + "kl": 0.025316720828413963, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 549 + }, + { + "clip_ratio": 0.0002441406322759576, + "epoch": 0.0015269379618987335, + "grad_norm": 0.0745542123913765, + "kl": 0.025896431878209114, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 550 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015297142127385494, + "grad_norm": 0.06619153916835785, + "kl": 0.024953342974185944, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 551 + }, + { + "clip_ratio": 0.000708240841049701, + "epoch": 0.0015324904635783652, + "grad_norm": 0.13466133177280426, + "kl": 0.028307722881436348, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.81250762939453, + "epoch": 0.001535266714418181, + "grad_norm": 0.09829782694578171, + "kl": 0.02855612989515066, + "learning_rate": 3e-06, + "loss": 0.0059, + "reward": 0.19166667014360428, + "reward_std": 0.19716466218233109, + "rewards/countdown_reward_func": 0.19166666269302368, + "step": 553, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00018876852846005931, + "epoch": 0.001538042965257997, + "grad_norm": 0.07418268173933029, + "kl": 0.026963720098137856, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 554 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.001540819216097813, + "grad_norm": 0.07878899574279785, + "kl": 0.028570099733769894, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 555 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015435954669376288, + "grad_norm": 0.10893654823303223, + "kl": 0.027858249843120575, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 556 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015463717177774446, + "grad_norm": 0.07773488014936447, + "kl": 0.02769723255187273, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 557 + }, + { + "clip_ratio": 0.00019869584502885118, + "epoch": 0.0015491479686172605, + "grad_norm": 0.08286737650632858, + "kl": 0.02732114028185606, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 558 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015519242194570763, + "grad_norm": 0.08795492351055145, + "kl": 0.027023627422749996, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 559 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015547004702968924, + "grad_norm": 0.07500362396240234, + "kl": 0.02461559884250164, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 560 + }, + { + "clip_ratio": 0.00018876852846005931, + "epoch": 0.0015574767211367082, + "grad_norm": 0.08417128771543503, + "kl": 0.026552588678896427, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 561 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001560252971976524, + "grad_norm": 0.07278092950582504, + "kl": 0.024690870195627213, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 562 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0015630292228163399, + "grad_norm": 0.07505667209625244, + "kl": 0.02481877151876688, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 563 + }, + { + "clip_ratio": 0.00010738831770140678, + "epoch": 0.0015658054736561557, + "grad_norm": 0.08289463818073273, + "kl": 0.024328703992068768, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 564 + }, + { + "clip_ratio": 0.00019328358030179515, + "completion_length": 228.62500762939453, + "epoch": 0.0015685817244959716, + "grad_norm": 0.09321510046720505, + "kl": 0.02534264326095581, + "learning_rate": 3e-06, + "loss": -0.0105, + "reward": 0.27916668355464935, + "reward_std": 0.28499096632003784, + "rewards/countdown_reward_func": 0.27916668355464935, + "step": 565, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015713579753357876, + "grad_norm": 0.09789825975894928, + "kl": 0.023500449024140835, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 566 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015741342261756035, + "grad_norm": 0.12602142989635468, + "kl": 0.024465853348374367, + "learning_rate": 3e-06, + "loss": -0.01, + "step": 567 + }, + { + "clip_ratio": 0.00026942871045321226, + "epoch": 0.0015769104770154193, + "grad_norm": 0.07588708400726318, + "kl": 0.021980220451951027, + "learning_rate": 3e-06, + "loss": -0.0094, + "step": 568 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015796867278552351, + "grad_norm": 0.10751640051603317, + "kl": 0.02414284460246563, + "learning_rate": 3e-06, + "loss": -0.0093, + "step": 569 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001582462978695051, + "grad_norm": 0.12433457374572754, + "kl": 0.02209012396633625, + "learning_rate": 3e-06, + "loss": -0.0097, + "step": 570 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001585239229534867, + "grad_norm": 0.1603914499282837, + "kl": 0.02457761950790882, + "learning_rate": 3e-06, + "loss": -0.0102, + "step": 571 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015880154803746829, + "grad_norm": 0.09691650420427322, + "kl": 0.023993924260139465, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 572 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0015907917312144987, + "grad_norm": 0.13051147758960724, + "kl": 0.02514663338661194, + "learning_rate": 3e-06, + "loss": -0.0105, + "step": 573 + }, + { + "clip_ratio": 0.00018027036276180297, + "epoch": 0.0015935679820543145, + "grad_norm": 0.14570623636245728, + "kl": 0.023281600326299667, + "learning_rate": 3e-06, + "loss": -0.0104, + "step": 574 + }, + { + "clip_ratio": 8.698677993379533e-05, + "epoch": 0.0015963442328941304, + "grad_norm": 0.10116558521986008, + "kl": 0.026070833206176758, + "learning_rate": 3e-06, + "loss": -0.0109, + "step": 575 + }, + { + "clip_ratio": 8.698677993379533e-05, + "epoch": 0.0015991204837339462, + "grad_norm": 0.1298682689666748, + "kl": 0.02464818675071001, + "learning_rate": 3e-06, + "loss": -0.0109, + "step": 576 + }, + { + "clip_ratio": 0.0005351027357392013, + "completion_length": 240.83333587646484, + "epoch": 0.0016018967345737623, + "grad_norm": 0.05101247504353523, + "kl": 0.02285961899906397, + "learning_rate": 3e-06, + "loss": 0.0031, + "reward": 0.2083333507180214, + "reward_std": 0.16015682369470596, + "rewards/countdown_reward_func": 0.2083333507180214, + "step": 577, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016046729854135781, + "grad_norm": 0.07094918191432953, + "kl": 0.023334643803536892, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 578 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001607449236253394, + "grad_norm": 0.047830481082201004, + "kl": 0.0244672242552042, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 579 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016102254870932098, + "grad_norm": 0.06464120745658875, + "kl": 0.024777245707809925, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 580 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0016130017379330256, + "grad_norm": 0.062469832599163055, + "kl": 0.02657880913466215, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 581 + }, + { + "clip_ratio": 0.0007138257715268992, + "epoch": 0.0016157779887728417, + "grad_norm": 0.07282475382089615, + "kl": 0.026876126416027546, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 582 + }, + { + "clip_ratio": 0.00042808218859136105, + "epoch": 0.0016185542396126575, + "grad_norm": 0.057990189641714096, + "kl": 0.025540747679769993, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 583 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0016213304904524734, + "grad_norm": 0.07290374487638474, + "kl": 0.02545579057186842, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 584 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016241067412922892, + "grad_norm": 0.04659357666969299, + "kl": 0.026573507115244865, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 585 + }, + { + "clip_ratio": 0.0006510416860692203, + "epoch": 0.001626882992132105, + "grad_norm": 0.05872870981693268, + "kl": 0.02712887153029442, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 586 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001629659242971921, + "grad_norm": 0.05923086777329445, + "kl": 0.02828708291053772, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 587 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.001632435493811737, + "grad_norm": 0.07497607171535492, + "kl": 0.02832121029496193, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.33333587646484, + "epoch": 0.0016352117446515528, + "grad_norm": 0.12389404326677322, + "kl": 0.028777985833585262, + "learning_rate": 3e-06, + "loss": 0.0163, + "reward": 0.3583333417773247, + "reward_std": 0.36252936720848083, + "rewards/countdown_reward_func": 0.3583333268761635, + "step": 589, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016379879954913686, + "grad_norm": 0.16146422922611237, + "kl": 0.029465525411069393, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 590 + }, + { + "clip_ratio": 0.0002460629912093282, + "epoch": 0.0016407642463311845, + "grad_norm": 0.11740246415138245, + "kl": 0.02861588355153799, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 591 + }, + { + "clip_ratio": 8.674531272845343e-05, + "epoch": 0.0016435404971710003, + "grad_norm": 0.11579007655382156, + "kl": 0.02733410894870758, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 592 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016463167480108164, + "grad_norm": 0.09282111376523972, + "kl": 0.029231702908873558, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 593 + }, + { + "clip_ratio": 0.0002602359454613179, + "epoch": 0.0016490929988506322, + "grad_norm": 0.15394672751426697, + "kl": 0.029871191829442978, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 594 + }, + { + "clip_ratio": 8.202099706977606e-05, + "epoch": 0.001651869249690448, + "grad_norm": 0.12174469977617264, + "kl": 0.027274416759610176, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 595 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016546455005302639, + "grad_norm": 0.20378051698207855, + "kl": 0.026893497444689274, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 596 + }, + { + "clip_ratio": 0.00016404199413955212, + "epoch": 0.0016574217513700797, + "grad_norm": 0.11612839996814728, + "kl": 0.027020023204386234, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 597 + }, + { + "clip_ratio": 8.555784006603062e-05, + "epoch": 0.0016601980022098958, + "grad_norm": 0.10161249339580536, + "kl": 0.025793558917939663, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 598 + }, + { + "clip_ratio": 8.555784006603062e-05, + "epoch": 0.0016629742530497116, + "grad_norm": 0.09491390734910965, + "kl": 0.027327225543558598, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 599 + }, + { + "clip_ratio": 0.000265891409071628, + "epoch": 0.0016657505038895275, + "grad_norm": 0.13563629984855652, + "kl": 0.027993053197860718, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 600 + }, + { + "clip_ratio": 8.322236681124195e-05, + "completion_length": 238.75000762939453, + "epoch": 0.0016685267547293433, + "grad_norm": 0.09000701457262039, + "kl": 0.028913519345223904, + "learning_rate": 3e-06, + "loss": 0.0272, + "reward": 0.2083333432674408, + "reward_std": 0.24819570034742355, + "rewards/countdown_reward_func": 0.2083333283662796, + "step": 601, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016713030055691591, + "grad_norm": 0.08314099907875061, + "kl": 0.025995067320764065, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 602 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001674079256408975, + "grad_norm": 0.1207355260848999, + "kl": 0.03479589242488146, + "learning_rate": 3e-06, + "loss": 0.0271, + "step": 603 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001676855507248791, + "grad_norm": 0.11306949704885483, + "kl": 0.030153939500451088, + "learning_rate": 3e-06, + "loss": 0.0267, + "step": 604 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0016796317580886069, + "grad_norm": 0.08935578912496567, + "kl": 0.028079458512365818, + "learning_rate": 3e-06, + "loss": 0.0258, + "step": 605 + }, + { + "clip_ratio": 0.00024782493710517883, + "epoch": 0.0016824080089284227, + "grad_norm": 0.0978025496006012, + "kl": 0.029070213437080383, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 606 + }, + { + "clip_ratio": 0.0005404279145295732, + "epoch": 0.0016851842597682385, + "grad_norm": 0.0878811702132225, + "kl": 0.03100433386862278, + "learning_rate": 3e-06, + "loss": 0.025, + "step": 607 + }, + { + "clip_ratio": 0.00018288222781848162, + "epoch": 0.0016879605106080544, + "grad_norm": 0.09215917438268661, + "kl": 0.02795230783522129, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 608 + }, + { + "clip_ratio": 9.144111390924081e-05, + "epoch": 0.0016907367614478704, + "grad_norm": 0.12471023947000504, + "kl": 0.03957472741603851, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 609 + }, + { + "clip_ratio": 8.322236681124195e-05, + "epoch": 0.0016935130122876863, + "grad_norm": 0.1100461557507515, + "kl": 0.0361151285469532, + "learning_rate": 3e-06, + "loss": 0.0234, + "step": 610 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0016962892631275021, + "grad_norm": 0.08403483778238297, + "kl": 0.03325536102056503, + "learning_rate": 3e-06, + "loss": 0.0233, + "step": 611 + }, + { + "clip_ratio": 0.0007063246885081753, + "epoch": 0.001699065513967318, + "grad_norm": 0.09051875025033951, + "kl": 0.034811416640877724, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 612 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 243.9791717529297, + "epoch": 0.0017018417648071338, + "grad_norm": 0.09752815216779709, + "kl": 0.0324931014329195, + "learning_rate": 3e-06, + "loss": 0.0035, + "reward": 0.20625001937150955, + "reward_std": 0.2455231510102749, + "rewards/countdown_reward_func": 0.20625000447034836, + "step": 613, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00018611777340993285, + "epoch": 0.0017046180156469496, + "grad_norm": 0.10532078891992569, + "kl": 0.03395223990082741, + "learning_rate": 3e-06, + "loss": 0.0041, + "step": 614 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017073942664867657, + "grad_norm": 0.0800807774066925, + "kl": 0.034072598442435265, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 615 + }, + { + "clip_ratio": 0.00025160193035844713, + "epoch": 0.0017101705173265815, + "grad_norm": 0.11874743551015854, + "kl": 0.03573352470993996, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 616 + }, + { + "clip_ratio": 8.164597966242582e-05, + "epoch": 0.0017129467681663974, + "grad_norm": 0.08836469054222107, + "kl": 0.06380523927509785, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 617 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017157230190062132, + "grad_norm": 0.08665862679481506, + "kl": 0.045055605471134186, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 618 + }, + { + "clip_ratio": 0.0003530426474753767, + "epoch": 0.001718499269846029, + "grad_norm": 0.12727470695972443, + "kl": 0.040752191096544266, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 619 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017212755206858451, + "grad_norm": 0.11716008186340332, + "kl": 0.042310649529099464, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 620 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.001724051771525661, + "grad_norm": 0.07381536811590195, + "kl": 0.041947562247514725, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 621 + }, + { + "clip_ratio": 0.0001702217195997946, + "epoch": 0.0017268280223654768, + "grad_norm": 0.09706465154886246, + "kl": 0.042636461555957794, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 622 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0017296042732052926, + "grad_norm": 0.08298773318529129, + "kl": 0.07458402588963509, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 623 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0017323805240451085, + "grad_norm": 0.08479683846235275, + "kl": 0.05383196100592613, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.9166717529297, + "epoch": 0.0017351567748849243, + "grad_norm": 0.09818287193775177, + "kl": 0.049956170842051506, + "learning_rate": 3e-06, + "loss": 0.0324, + "reward": 0.20625001937150955, + "reward_std": 0.20693624019622803, + "rewards/countdown_reward_func": 0.20625000447034836, + "step": 625, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003646973054856062, + "epoch": 0.0017379330257247404, + "grad_norm": 0.11653796583414078, + "kl": 0.0570333506911993, + "learning_rate": 3e-06, + "loss": 0.0323, + "step": 626 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0017407092765645562, + "grad_norm": 0.10893271118402481, + "kl": 0.05227653309702873, + "learning_rate": 3e-06, + "loss": 0.0315, + "step": 627 + }, + { + "clip_ratio": 0.00045520756975747645, + "epoch": 0.001743485527404372, + "grad_norm": 0.1160278171300888, + "kl": 0.05303039960563183, + "learning_rate": 3e-06, + "loss": 0.0313, + "step": 628 + }, + { + "clip_ratio": 8.159269054885954e-05, + "epoch": 0.0017462617782441879, + "grad_norm": 0.10207342356443405, + "kl": 0.05501401796936989, + "learning_rate": 3e-06, + "loss": 0.0314, + "step": 629 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017490380290840037, + "grad_norm": 0.08500054478645325, + "kl": 0.06589866057038307, + "learning_rate": 3e-06, + "loss": 0.0306, + "step": 630 + }, + { + "clip_ratio": 9.117432637140155e-05, + "epoch": 0.0017518142799238198, + "grad_norm": 0.09498704224824905, + "kl": 0.061905499547719955, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 631 + }, + { + "clip_ratio": 0.00027352297911420465, + "epoch": 0.0017545905307636356, + "grad_norm": 0.10427255183458328, + "kl": 0.07243792712688446, + "learning_rate": 3e-06, + "loss": 0.0298, + "step": 632 + }, + { + "clip_ratio": 0.00017255453713005409, + "epoch": 0.0017573667816034515, + "grad_norm": 0.11053306609392166, + "kl": 0.07044854387640953, + "learning_rate": 3e-06, + "loss": 0.0295, + "step": 633 + }, + { + "clip_ratio": 0.00010434056457597762, + "epoch": 0.0017601430324432673, + "grad_norm": 0.11823248863220215, + "kl": 0.07041215151548386, + "learning_rate": 3e-06, + "loss": 0.0289, + "step": 634 + }, + { + "clip_ratio": 0.0005901292752241716, + "epoch": 0.0017629192832830831, + "grad_norm": 0.09512235224246979, + "kl": 0.07614094018936157, + "learning_rate": 3e-06, + "loss": 0.0288, + "step": 635 + }, + { + "clip_ratio": 0.00029720802558586, + "epoch": 0.001765695534122899, + "grad_norm": 0.07722548395395279, + "kl": 0.09275055304169655, + "learning_rate": 3e-06, + "loss": 0.0284, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.89583587646484, + "epoch": 0.001768471784962715, + "grad_norm": 0.0991094708442688, + "kl": 0.10344970971345901, + "learning_rate": 3e-06, + "loss": 0.0205, + "reward": 0.19166666269302368, + "reward_std": 0.1672205552458763, + "rewards/countdown_reward_func": 0.19166666269302368, + "step": 637, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 8.896797226043418e-05, + "epoch": 0.0017712480358025309, + "grad_norm": 0.10992806404829025, + "kl": 0.10585097968578339, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 638 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017740242866423467, + "grad_norm": 0.07519710063934326, + "kl": 0.1175805889070034, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 639 + }, + { + "clip_ratio": 0.00017600550199858844, + "epoch": 0.0017768005374821625, + "grad_norm": 0.15158335864543915, + "kl": 0.1538476049900055, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 640 + }, + { + "clip_ratio": 9.462528396397829e-05, + "epoch": 0.0017795767883219784, + "grad_norm": 0.07322803884744644, + "kl": 0.14574335515499115, + "learning_rate": 3e-06, + "loss": 0.021, + "step": 641 + }, + { + "clip_ratio": 0.0005492736818268895, + "epoch": 0.0017823530391617944, + "grad_norm": 0.12571430206298828, + "kl": 0.15892429277300835, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 642 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017851292900016103, + "grad_norm": 0.0937129482626915, + "kl": 0.14000995457172394, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 643 + }, + { + "clip_ratio": 0.00017034818301908672, + "epoch": 0.0017879055408414261, + "grad_norm": 0.10581913590431213, + "kl": 0.13933642953634262, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 644 + }, + { + "clip_ratio": 0.00018925056792795658, + "epoch": 0.001790681791681242, + "grad_norm": 0.09151846915483475, + "kl": 0.14892030507326126, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 645 + }, + { + "clip_ratio": 0.0002573857200331986, + "epoch": 0.0017934580425210578, + "grad_norm": 0.16274204850196838, + "kl": 0.1857452318072319, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 646 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0017962342933608736, + "grad_norm": 0.07416386157274246, + "kl": 0.17207730561494827, + "learning_rate": 3e-06, + "loss": 0.0196, + "step": 647 + }, + { + "clip_ratio": 0.0008967332323663868, + "epoch": 0.0017990105442006897, + "grad_norm": 0.11084357649087906, + "kl": 0.17409475147724152, + "learning_rate": 3e-06, + "loss": 0.02, + "step": 648 + }, + { + "clip_ratio": 9.999999747378752e-05, + "completion_length": 226.64584350585938, + "epoch": 0.0018017867950405055, + "grad_norm": 0.15701425075531006, + "kl": 0.16254248470067978, + "learning_rate": 3e-06, + "loss": 0.0062, + "reward": 0.2770833596587181, + "reward_std": 0.3268149420619011, + "rewards/countdown_reward_func": 0.2770833596587181, + "step": 649, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.999999747378752e-05, + "epoch": 0.0018045630458803214, + "grad_norm": 0.14769934117794037, + "kl": 0.15972411632537842, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 650 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0018073392967201372, + "grad_norm": 0.15065068006515503, + "kl": 0.15974202752113342, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 651 + }, + { + "clip_ratio": 0.00018544086924521253, + "epoch": 0.001810115547559953, + "grad_norm": 0.25048333406448364, + "kl": 0.1500309482216835, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 652 + }, + { + "clip_ratio": 8.638562576379627e-05, + "epoch": 0.0018128917983997691, + "grad_norm": 0.16509313881397247, + "kl": 0.14627444744110107, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 653 + }, + { + "clip_ratio": 0.00018638561596162617, + "epoch": 0.001815668049239585, + "grad_norm": 0.2661731243133545, + "kl": 0.1369135081768036, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 654 + }, + { + "clip_ratio": 9.999999747378752e-05, + "epoch": 0.0018184443000794008, + "grad_norm": 0.14947877824306488, + "kl": 0.13618455082178116, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 655 + }, + { + "clip_ratio": 0.00019307520415168256, + "epoch": 0.0018212205509192166, + "grad_norm": 0.16721273958683014, + "kl": 0.12803955748677254, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 656 + }, + { + "clip_ratio": 0.00010548523277975619, + "epoch": 0.0018239968017590325, + "grad_norm": 0.13844816386699677, + "kl": 0.12379540503025055, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 657 + }, + { + "clip_ratio": 0.0003572673595044762, + "epoch": 0.0018267730525988483, + "grad_norm": 0.24204890429973602, + "kl": 0.10973387956619263, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 658 + }, + { + "clip_ratio": 0.0006522906478494406, + "epoch": 0.0018295493034386644, + "grad_norm": 0.1522047072649002, + "kl": 0.10548713058233261, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 659 + }, + { + "clip_ratio": 0.001921730930916965, + "epoch": 0.0018323255542784802, + "grad_norm": 0.25724494457244873, + "kl": 0.09767122194170952, + "learning_rate": 3e-06, + "loss": -0.0013, + "step": 660 + }, + { + "clip_ratio": 9.051412052940577e-05, + "completion_length": 218.7916717529297, + "epoch": 0.001835101805118296, + "grad_norm": 0.11165298521518707, + "kl": 0.0836155079305172, + "learning_rate": 3e-06, + "loss": 0.007, + "reward": 0.2604166716337204, + "reward_std": 0.26486171036958694, + "rewards/countdown_reward_func": 0.2604166716337204, + "step": 661, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0018378780559581119, + "grad_norm": 0.10689469426870346, + "kl": 0.09136947244405746, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 662 + }, + { + "clip_ratio": 0.00019087179680354893, + "epoch": 0.0018406543067979277, + "grad_norm": 0.12443461269140244, + "kl": 0.07553870230913162, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 663 + }, + { + "clip_ratio": 8.383634849451482e-05, + "epoch": 0.0018434305576377438, + "grad_norm": 0.1088162511587143, + "kl": 0.07327612116932869, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 664 + }, + { + "clip_ratio": 0.0001824945939006284, + "epoch": 0.0018462068084775596, + "grad_norm": 0.09485284239053726, + "kl": 0.07073798030614853, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 665 + }, + { + "clip_ratio": 0.00030053697992116213, + "epoch": 0.0018489830593173755, + "grad_norm": 0.13278953731060028, + "kl": 0.06900656968355179, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 666 + }, + { + "clip_ratio": 9.865824540611356e-05, + "epoch": 0.0018517593101571913, + "grad_norm": 0.10497219115495682, + "kl": 0.06802161037921906, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 667 + }, + { + "clip_ratio": 0.00021725406986661255, + "epoch": 0.0018545355609970071, + "grad_norm": 0.11255109310150146, + "kl": 0.07535571232438087, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 668 + }, + { + "clip_ratio": 0.0005888359155505896, + "epoch": 0.0018573118118368232, + "grad_norm": 0.10095056146383286, + "kl": 0.06463643535971642, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 669 + }, + { + "clip_ratio": 0.0005477168742800131, + "epoch": 0.001860088062676639, + "grad_norm": 0.09998640418052673, + "kl": 0.06509601883590221, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 670 + }, + { + "clip_ratio": 0.0001824945939006284, + "epoch": 0.0018628643135164549, + "grad_norm": 0.1530102789402008, + "kl": 0.06189018301665783, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 671 + }, + { + "clip_ratio": 0.0002959747507702559, + "epoch": 0.0018656405643562707, + "grad_norm": 0.13745903968811035, + "kl": 0.06357544660568237, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 672 + }, + { + "clip_ratio": 8.486082515446469e-05, + "completion_length": 220.12500762939453, + "epoch": 0.0018684168151960865, + "grad_norm": 0.17126166820526123, + "kl": 0.060750387609004974, + "learning_rate": 3e-06, + "loss": -0.0094, + "reward": 0.26250001788139343, + "reward_std": 0.2778630629181862, + "rewards/countdown_reward_func": 0.26250001043081284, + "step": 673, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0018711930660359024, + "grad_norm": 0.11247212439775467, + "kl": 0.059839921072125435, + "learning_rate": 3e-06, + "loss": -0.0094, + "step": 674 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0018739693168757184, + "grad_norm": 0.2667171359062195, + "kl": 0.06426364183425903, + "learning_rate": 3e-06, + "loss": -0.0092, + "step": 675 + }, + { + "clip_ratio": 9.36329597607255e-05, + "epoch": 0.0018767455677155343, + "grad_norm": 0.1640663743019104, + "kl": 0.06390438973903656, + "learning_rate": 3e-06, + "loss": -0.0087, + "step": 676 + }, + { + "clip_ratio": 0.0001101321613532491, + "epoch": 0.0018795218185553501, + "grad_norm": 0.11860151588916779, + "kl": 0.054045552387833595, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 677 + }, + { + "clip_ratio": 0.00019816032727248967, + "epoch": 0.001882298069395166, + "grad_norm": 0.1296575963497162, + "kl": 0.059720540419220924, + "learning_rate": 3e-06, + "loss": -0.0096, + "step": 678 + }, + { + "clip_ratio": 0.00033944330061785877, + "epoch": 0.0018850743202349818, + "grad_norm": 0.1466340571641922, + "kl": 0.05133458413183689, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 679 + }, + { + "clip_ratio": 8.486082515446469e-05, + "epoch": 0.0018878505710747979, + "grad_norm": 0.10148210823535919, + "kl": 0.049093831330537796, + "learning_rate": 3e-06, + "loss": -0.0103, + "step": 680 + }, + { + "clip_ratio": 9.476876584812999e-05, + "epoch": 0.0018906268219146137, + "grad_norm": 0.20377697050571442, + "kl": 0.05094917304813862, + "learning_rate": 3e-06, + "loss": -0.0118, + "step": 681 + }, + { + "clip_ratio": 0.0003458706341916695, + "epoch": 0.0018934030727544295, + "grad_norm": 0.15611247718334198, + "kl": 0.050493909046053886, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 682 + }, + { + "clip_ratio": 0.0002511018610675819, + "epoch": 0.0018961793235942454, + "grad_norm": 0.1299794614315033, + "kl": 0.04327445663511753, + "learning_rate": 3e-06, + "loss": -0.0112, + "step": 683 + }, + { + "clip_ratio": 0.0003479021688690409, + "epoch": 0.0018989555744340612, + "grad_norm": 0.12855900824069977, + "kl": 0.04604472406208515, + "learning_rate": 3e-06, + "loss": -0.0124, + "step": 684 + }, + { + "clip_ratio": 0.00042530003702268004, + "completion_length": 235.7916717529297, + "epoch": 0.001901731825273877, + "grad_norm": 0.13944292068481445, + "kl": 0.04754863306879997, + "learning_rate": 3e-06, + "loss": 0.0191, + "reward": 0.24791667610406876, + "reward_std": 0.23748211562633514, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 685, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001904508076113693, + "grad_norm": 0.10677601397037506, + "kl": 0.041842538863420486, + "learning_rate": 3e-06, + "loss": 0.0195, + "step": 686 + }, + { + "clip_ratio": 0.0, + "epoch": 0.001907284326953509, + "grad_norm": 0.08806996792554855, + "kl": 0.04203086532652378, + "learning_rate": 3e-06, + "loss": 0.019, + "step": 687 + }, + { + "clip_ratio": 0.00018215480668004602, + "epoch": 0.0019100605777933248, + "grad_norm": 0.1156989261507988, + "kl": 0.042531004175543785, + "learning_rate": 3e-06, + "loss": 0.0186, + "step": 688 + }, + { + "clip_ratio": 0.00018691782315727323, + "epoch": 0.0019128368286331406, + "grad_norm": 0.09394989162683487, + "kl": 0.041263919323682785, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 689 + }, + { + "clip_ratio": 0.00026606619940139353, + "epoch": 0.0019156130794729565, + "grad_norm": 0.10816143453121185, + "kl": 0.041256049647927284, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 690 + }, + { + "clip_ratio": 0.0005038198505644687, + "epoch": 0.0019183893303127725, + "grad_norm": 0.16718047857284546, + "kl": 0.04300625994801521, + "learning_rate": 3e-06, + "loss": 0.0206, + "step": 691 + }, + { + "clip_ratio": 0.0001649922487558797, + "epoch": 0.0019211655811525884, + "grad_norm": 0.10943221300840378, + "kl": 0.04007406160235405, + "learning_rate": 3e-06, + "loss": 0.0195, + "step": 692 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019239418319924042, + "grad_norm": 0.08448076248168945, + "kl": 0.040019482374191284, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 693 + }, + { + "clip_ratio": 0.00025433551491005346, + "epoch": 0.00192671808283222, + "grad_norm": 0.11321469396352768, + "kl": 0.04307270236313343, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 694 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019294943336720359, + "grad_norm": 0.09902040660381317, + "kl": 0.042439911514520645, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 695 + }, + { + "clip_ratio": 0.0001846859959186986, + "epoch": 0.0019322705845118517, + "grad_norm": 0.11367753148078918, + "kl": 0.04280451126396656, + "learning_rate": 3e-06, + "loss": 0.0197, + "step": 696 + }, + { + "clip_ratio": 0.00010229132749373093, + "completion_length": 230.8541717529297, + "epoch": 0.0019350468353516678, + "grad_norm": 0.13595792651176453, + "kl": 0.04034610837697983, + "learning_rate": 3e-06, + "loss": 0.0264, + "reward": 0.26250001788139343, + "reward_std": 0.2784983515739441, + "rewards/countdown_reward_func": 0.26250000298023224, + "step": 697, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019378230861914836, + "grad_norm": 0.1038818508386612, + "kl": 0.040640873834490776, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 698 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0019405993370312995, + "grad_norm": 0.0944518968462944, + "kl": 0.04026073403656483, + "learning_rate": 3e-06, + "loss": 0.0269, + "step": 699 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019433755878711153, + "grad_norm": 0.10623365640640259, + "kl": 0.03956114687025547, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 700 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019461518387109311, + "grad_norm": 0.09136571735143661, + "kl": 0.044885121285915375, + "learning_rate": 3e-06, + "loss": 0.0263, + "step": 701 + }, + { + "clip_ratio": 8.915834769140929e-05, + "epoch": 0.0019489280895507472, + "grad_norm": 0.12446179240942001, + "kl": 0.04785134270787239, + "learning_rate": 3e-06, + "loss": 0.0254, + "step": 702 + }, + { + "clip_ratio": 0.0003792484931182116, + "epoch": 0.001951704340390563, + "grad_norm": 0.1446102112531662, + "kl": 0.04690235108137131, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 703 + }, + { + "clip_ratio": 8.915834769140929e-05, + "epoch": 0.0019544805912303786, + "grad_norm": 0.11323653906583786, + "kl": 0.048219822347164154, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 704 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.001957256842070195, + "grad_norm": 0.09300023317337036, + "kl": 0.04829951003193855, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 705 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019600330929100108, + "grad_norm": 0.09304996579885483, + "kl": 0.04665645770728588, + "learning_rate": 3e-06, + "loss": 0.0243, + "step": 706 + }, + { + "clip_ratio": 0.0004711005021817982, + "epoch": 0.0019628093437498266, + "grad_norm": 0.10053187608718872, + "kl": 0.056041302159428596, + "learning_rate": 3e-06, + "loss": 0.0249, + "step": 707 + }, + { + "clip_ratio": 8.915834769140929e-05, + "epoch": 0.0019655855945896424, + "grad_norm": 0.11689490079879761, + "kl": 0.05928555130958557, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 708 + }, + { + "clip_ratio": 0.0001020408162730746, + "completion_length": 217.87500762939453, + "epoch": 0.0019683618454294583, + "grad_norm": 0.12107165902853012, + "kl": 0.059857327491045, + "learning_rate": 3e-06, + "loss": 0.0018, + "reward": 0.2875000312924385, + "reward_std": 0.26614009588956833, + "rewards/countdown_reward_func": 0.2875000312924385, + "step": 709, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.001971138096269274, + "grad_norm": 0.1230737566947937, + "kl": 0.06688201427459717, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 710 + }, + { + "clip_ratio": 0.0003445422335062176, + "epoch": 0.00197391434710909, + "grad_norm": 0.09549138695001602, + "kl": 0.06407090276479721, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 711 + }, + { + "clip_ratio": 0.00010593220213195309, + "epoch": 0.001976690597948906, + "grad_norm": 0.1953248232603073, + "kl": 0.06954507902264595, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 712 + }, + { + "clip_ratio": 0.000314092067128513, + "epoch": 0.0019794668487887216, + "grad_norm": 0.09191843122243881, + "kl": 0.0671706348657608, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 713 + }, + { + "clip_ratio": 9.191176650347188e-05, + "epoch": 0.0019822430996285375, + "grad_norm": 0.12914390861988068, + "kl": 0.06849226728081703, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 714 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019850193504683533, + "grad_norm": 0.1148599311709404, + "kl": 0.06904346123337746, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 715 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0019877956013081696, + "grad_norm": 0.12667132914066315, + "kl": 0.07613038271665573, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 716 + }, + { + "clip_ratio": 0.00017349517293041572, + "epoch": 0.0019905718521479854, + "grad_norm": 0.09524544328451157, + "kl": 0.07107832655310631, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 717 + }, + { + "clip_ratio": 0.00010593220213195309, + "epoch": 0.0019933481029878013, + "grad_norm": 0.13948820531368256, + "kl": 0.07579323649406433, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 718 + }, + { + "clip_ratio": 0.00030192390113370493, + "epoch": 0.001996124353827617, + "grad_norm": 0.0964130386710167, + "kl": 0.07115180417895317, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 719 + }, + { + "clip_ratio": 0.00019251657067798078, + "epoch": 0.001998900604667433, + "grad_norm": 0.13039907813072205, + "kl": 0.07175230234861374, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 720 + }, + { + "clip_ratio": 0.00028670569736277685, + "completion_length": 223.6875, + "epoch": 0.002001676855507249, + "grad_norm": 0.22225071489810944, + "kl": 0.07658716291189194, + "learning_rate": 3e-06, + "loss": 0.0017, + "reward": 0.15208333730697632, + "reward_std": 0.14035604149103165, + "rewards/countdown_reward_func": 0.15208333730697632, + "step": 721, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0020044531063470646, + "grad_norm": 0.07475583255290985, + "kl": 0.06551392935216427, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 722 + }, + { + "clip_ratio": 9.742790280142799e-05, + "epoch": 0.0020072293571868805, + "grad_norm": 0.08553145080804825, + "kl": 0.06234482862055302, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 723 + }, + { + "clip_ratio": 0.00019808593060588464, + "epoch": 0.0020100056080266963, + "grad_norm": 0.05904084071516991, + "kl": 0.06513764709234238, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 724 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002012781858866512, + "grad_norm": 0.07201807200908661, + "kl": 0.06482937932014465, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 725 + }, + { + "clip_ratio": 9.984025382436812e-05, + "epoch": 0.002015558109706328, + "grad_norm": 0.07085428386926651, + "kl": 0.06466732174158096, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 726 + }, + { + "clip_ratio": 0.00018783042469294742, + "epoch": 0.0020183343605461443, + "grad_norm": 0.24391846358776093, + "kl": 0.06686633080244064, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 727 + }, + { + "clip_ratio": 0.00016469038382638246, + "epoch": 0.00202111061138596, + "grad_norm": 0.08265461027622223, + "kl": 0.056683897972106934, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 728 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002023886862225776, + "grad_norm": 0.08346160501241684, + "kl": 0.05314911529421806, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 729 + }, + { + "clip_ratio": 0.00019712094945134595, + "epoch": 0.0020266631130655918, + "grad_norm": 0.057865407317876816, + "kl": 0.05542047321796417, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 730 + }, + { + "clip_ratio": 0.0002478523238096386, + "epoch": 0.0020294393639054076, + "grad_norm": 0.07208742946386337, + "kl": 0.054563652724027634, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 731 + }, + { + "clip_ratio": 0.0003381839778739959, + "epoch": 0.0020322156147452235, + "grad_norm": 0.06954418867826462, + "kl": 0.05391604080796242, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.27083587646484, + "epoch": 0.0020349918655850393, + "grad_norm": 0.12923622131347656, + "kl": 0.054487695917487144, + "learning_rate": 3e-06, + "loss": 0.0177, + "reward": 0.24791669100522995, + "reward_std": 0.3050043284893036, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 733, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0001197317978949286, + "epoch": 0.002037768116424855, + "grad_norm": 0.10722116380929947, + "kl": 0.05108374170958996, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 734 + }, + { + "clip_ratio": 0.0001197317978949286, + "epoch": 0.002040544367264671, + "grad_norm": 0.11042473465204239, + "kl": 0.052645549178123474, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 735 + }, + { + "clip_ratio": 9.104151831706986e-05, + "epoch": 0.002043320618104487, + "grad_norm": 0.1352359801530838, + "kl": 0.049059027805924416, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 736 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0020460968689443026, + "grad_norm": 0.11228242516517639, + "kl": 0.056157197803258896, + "learning_rate": 3e-06, + "loss": 0.0182, + "step": 737 + }, + { + "clip_ratio": 0.00029813701985403895, + "epoch": 0.002048873119784119, + "grad_norm": 0.09875979274511337, + "kl": 0.04752412252128124, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 738 + }, + { + "clip_ratio": 8.922198321670294e-05, + "epoch": 0.0020516493706239348, + "grad_norm": 0.12920497357845306, + "kl": 0.04895847663283348, + "learning_rate": 3e-06, + "loss": 0.0167, + "step": 739 + }, + { + "clip_ratio": 0.0003326056757941842, + "epoch": 0.0020544256214637506, + "grad_norm": 0.10440492630004883, + "kl": 0.04799576476216316, + "learning_rate": 3e-06, + "loss": 0.0173, + "step": 740 + }, + { + "clip_ratio": 0.00021150236716493964, + "epoch": 0.0020572018723035664, + "grad_norm": 0.10824400186538696, + "kl": 0.050282422453165054, + "learning_rate": 3e-06, + "loss": 0.0179, + "step": 741 + }, + { + "clip_ratio": 0.0002083023136947304, + "epoch": 0.0020599781231433823, + "grad_norm": 0.17329491674900055, + "kl": 0.04743002541363239, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 742 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002062754373983198, + "grad_norm": 0.11108177900314331, + "kl": 0.05471383221447468, + "learning_rate": 3e-06, + "loss": 0.017, + "step": 743 + }, + { + "clip_ratio": 0.00021150236716493964, + "epoch": 0.002065530624823014, + "grad_norm": 0.09756101667881012, + "kl": 0.048802973702549934, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 744 + }, + { + "clip_ratio": 8.650519157527015e-05, + "completion_length": 231.375, + "epoch": 0.00206830687566283, + "grad_norm": 0.11797621846199036, + "kl": 0.05034934915602207, + "learning_rate": 3e-06, + "loss": 0.0265, + "reward": 0.32500001043081284, + "reward_std": 0.33128294348716736, + "rewards/countdown_reward_func": 0.32500001043081284, + "step": 745, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0020710831265026456, + "grad_norm": 0.1360631287097931, + "kl": 0.049324722960591316, + "learning_rate": 3e-06, + "loss": 0.0259, + "step": 746 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0020738593773424615, + "grad_norm": 0.11022765189409256, + "kl": 0.04972606897354126, + "learning_rate": 3e-06, + "loss": 0.0263, + "step": 747 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0020766356281822773, + "grad_norm": 0.11825685203075409, + "kl": 0.05200809799134731, + "learning_rate": 3e-06, + "loss": 0.0255, + "step": 748 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0020794118790220936, + "grad_norm": 0.15757912397384644, + "kl": 0.05278201401233673, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 749 + }, + { + "clip_ratio": 9.335325012216344e-05, + "epoch": 0.0020821881298619094, + "grad_norm": 0.12597443163394928, + "kl": 0.05840662308037281, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 750 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0020849643807017253, + "grad_norm": 0.12119497358798981, + "kl": 0.061403946951031685, + "learning_rate": 3e-06, + "loss": 0.0255, + "step": 751 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002087740631541541, + "grad_norm": 0.15548032522201538, + "kl": 0.06312582828104496, + "learning_rate": 3e-06, + "loss": 0.024, + "step": 752 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002090516882381357, + "grad_norm": 0.09560614824295044, + "kl": 0.06370921805500984, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 753 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.002093293133221173, + "grad_norm": 0.11291124671697617, + "kl": 0.06948983296751976, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 754 + }, + { + "clip_ratio": 9.328358282800764e-05, + "epoch": 0.0020960693840609886, + "grad_norm": 0.14481361210346222, + "kl": 0.07263422012329102, + "learning_rate": 3e-06, + "loss": 0.0218, + "step": 755 + }, + { + "clip_ratio": 9.13075273274444e-05, + "epoch": 0.0020988456349008045, + "grad_norm": 0.11197732388973236, + "kl": 0.08130589872598648, + "learning_rate": 3e-06, + "loss": 0.0229, + "step": 756 + }, + { + "clip_ratio": 9.144111390924081e-05, + "completion_length": 233.5416717529297, + "epoch": 0.0021016218857406203, + "grad_norm": 0.08032705634832382, + "kl": 0.0910085029900074, + "learning_rate": 3e-06, + "loss": 0.0084, + "reward": 0.20625000447034836, + "reward_std": 0.245523139834404, + "rewards/countdown_reward_func": 0.20625000447034836, + "step": 757, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00019638649246189743, + "epoch": 0.002104398136580436, + "grad_norm": 0.08426212519407272, + "kl": 0.0998266153037548, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 758 + }, + { + "clip_ratio": 0.0001748835711623542, + "epoch": 0.002107174387420252, + "grad_norm": 0.09099453687667847, + "kl": 0.09878429025411606, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 759 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0021099506382600683, + "grad_norm": 0.10690543055534363, + "kl": 0.10915743559598923, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 760 + }, + { + "clip_ratio": 0.00026571148191578686, + "epoch": 0.002112726889099884, + "grad_norm": 0.08678600192070007, + "kl": 0.11478978767991066, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 761 + }, + { + "clip_ratio": 0.0001645819575060159, + "epoch": 0.0021155031399397, + "grad_norm": 0.09271499514579773, + "kl": 0.1079954281449318, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 762 + }, + { + "clip_ratio": 9.144111390924081e-05, + "epoch": 0.0021182793907795158, + "grad_norm": 0.08949034661054611, + "kl": 0.11407085880637169, + "learning_rate": 3e-06, + "loss": 0.0076, + "step": 763 + }, + { + "clip_ratio": 0.00028337843832559884, + "epoch": 0.0021210556416193316, + "grad_norm": 0.09156984835863113, + "kl": 0.12308426946401596, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 764 + }, + { + "clip_ratio": 0.0001748835711623542, + "epoch": 0.0021238318924591475, + "grad_norm": 0.08866655081510544, + "kl": 0.11775670573115349, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 765 + }, + { + "clip_ratio": 9.819324623094872e-05, + "epoch": 0.0021266081432989633, + "grad_norm": 0.10422717034816742, + "kl": 0.12401585280895233, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 766 + }, + { + "clip_ratio": 9.259259240934625e-05, + "epoch": 0.002129384394138779, + "grad_norm": 0.08140372484922409, + "kl": 0.12763554602861404, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 767 + }, + { + "clip_ratio": 0.000270773540250957, + "epoch": 0.002132160644978595, + "grad_norm": 0.09283492714166641, + "kl": 0.11555681005120277, + "learning_rate": 3e-06, + "loss": 0.0076, + "step": 768 + }, + { + "clip_ratio": 0.00017576066602487117, + "completion_length": 214.00000762939453, + "epoch": 0.002134936895818411, + "grad_norm": 0.1142767146229744, + "kl": 0.10155479237437248, + "learning_rate": 3e-06, + "loss": 0.0015, + "reward": 0.20625000447034836, + "reward_std": 0.20823679491877556, + "rewards/countdown_reward_func": 0.20625000447034836, + "step": 769, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0021377131466582266, + "grad_norm": 0.12313186377286911, + "kl": 0.11426293849945068, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 770 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002140489397498043, + "grad_norm": 0.10338300466537476, + "kl": 0.10406426340341568, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 771 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0021432656483378588, + "grad_norm": 0.12133070081472397, + "kl": 0.09835099801421165, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 772 + }, + { + "clip_ratio": 0.00040839538269210607, + "epoch": 0.0021460418991776746, + "grad_norm": 0.13222798705101013, + "kl": 0.10666431114077568, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 773 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0021488181500174904, + "grad_norm": 0.1232038140296936, + "kl": 0.09488913416862488, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 774 + }, + { + "clip_ratio": 9.164222865365446e-05, + "epoch": 0.0021515944008573063, + "grad_norm": 0.10721003264188766, + "kl": 0.08875302597880363, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 775 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002154370651697122, + "grad_norm": 0.12701751291751862, + "kl": 0.09637927263975143, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 776 + }, + { + "clip_ratio": 0.0003659786016214639, + "epoch": 0.002157146902536938, + "grad_norm": 0.10145313292741776, + "kl": 0.08755913749337196, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 777 + }, + { + "clip_ratio": 0.00018341893155593425, + "epoch": 0.002159923153376754, + "grad_norm": 0.11642103642225266, + "kl": 0.08162051066756248, + "learning_rate": 3e-06, + "loss": -0.0017, + "step": 778 + }, + { + "clip_ratio": 0.0006324425921775401, + "epoch": 0.0021626994042165696, + "grad_norm": 0.1388908177614212, + "kl": 0.08449938148260117, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 779 + }, + { + "clip_ratio": 0.00041412835707888007, + "epoch": 0.0021654756550563855, + "grad_norm": 0.12610669434070587, + "kl": 0.07518958300352097, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.58333587646484, + "epoch": 0.0021682519058962017, + "grad_norm": 0.11886297166347504, + "kl": 0.08037593588232994, + "learning_rate": 3e-06, + "loss": 0.0063, + "reward": 0.26250000298023224, + "reward_std": 0.27733949571847916, + "rewards/countdown_reward_func": 0.26249999552965164, + "step": 781, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00010702054714784026, + "epoch": 0.0021710281567360176, + "grad_norm": 0.14289286732673645, + "kl": 0.07106167078018188, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 782 + }, + { + "clip_ratio": 0.0001884007579064928, + "epoch": 0.0021738044075758334, + "grad_norm": 0.12329453229904175, + "kl": 0.07392378151416779, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 783 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.0021765806584156493, + "grad_norm": 0.11597783863544464, + "kl": 0.06427717953920364, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 784 + }, + { + "clip_ratio": 8.37801635498181e-05, + "epoch": 0.002179356909255465, + "grad_norm": 0.12029743194580078, + "kl": 0.06854793429374695, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 785 + }, + { + "clip_ratio": 0.0002638931109686382, + "epoch": 0.002182133160095281, + "grad_norm": 0.11125701665878296, + "kl": 0.06303357519209385, + "learning_rate": 3e-06, + "loss": 0.0061, + "step": 786 + }, + { + "clip_ratio": 0.0005720614135498181, + "epoch": 0.002184909410935097, + "grad_norm": 0.11483973264694214, + "kl": 0.06457911431789398, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 787 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0021876856617749126, + "grad_norm": 0.1489003449678421, + "kl": 0.057757457718253136, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 788 + }, + { + "clip_ratio": 0.00045818173384759575, + "epoch": 0.0021904619126147285, + "grad_norm": 0.1282450258731842, + "kl": 0.06039944291114807, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 789 + }, + { + "clip_ratio": 0.00048828125, + "epoch": 0.0021932381634545443, + "grad_norm": 0.12330767512321472, + "kl": 0.055448392406105995, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 790 + }, + { + "clip_ratio": 0.0007235735538415611, + "epoch": 0.00219601441429436, + "grad_norm": 0.12403135746717453, + "kl": 0.058060334995388985, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 791 + }, + { + "clip_ratio": 0.00037331361090764403, + "epoch": 0.0021987906651341764, + "grad_norm": 0.10532835870981216, + "kl": 0.05376404523849487, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 792 + }, + { + "clip_ratio": 0.00010984182881657034, + "completion_length": 222.31250762939453, + "epoch": 0.0022015669159739923, + "grad_norm": 0.04651214927434921, + "kl": 0.05401436612010002, + "learning_rate": 3e-06, + "loss": 0.003, + "reward": 0.15000001341104507, + "reward_std": 0.11558077484369278, + "rewards/countdown_reward_func": 0.15000000223517418, + "step": 793, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.000742982214433141, + "epoch": 0.002204343166813808, + "grad_norm": 0.07730413973331451, + "kl": 0.0580199658870697, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 794 + }, + { + "clip_ratio": 0.00010984182881657034, + "epoch": 0.002207119417653624, + "grad_norm": 0.051809556782245636, + "kl": 0.057794030755758286, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 795 + }, + { + "clip_ratio": 0.0007241156417876482, + "epoch": 0.0022098956684934398, + "grad_norm": 0.0728415995836258, + "kl": 0.05362066254019737, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 796 + }, + { + "clip_ratio": 0.0002787023695418611, + "epoch": 0.0022126719193332556, + "grad_norm": 0.06317047029733658, + "kl": 0.05362400598824024, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 797 + }, + { + "clip_ratio": 0.00017235546692973003, + "epoch": 0.0022154481701730715, + "grad_norm": 0.06358745694160461, + "kl": 0.05262857303023338, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 798 + }, + { + "clip_ratio": 9.097525617107749e-05, + "epoch": 0.0022182244210128873, + "grad_norm": 0.05143192410469055, + "kl": 0.049039315432310104, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 799 + }, + { + "clip_ratio": 0.00010984182881657034, + "epoch": 0.002221000671852703, + "grad_norm": 0.0856727808713913, + "kl": 0.052018070593476295, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 800 + }, + { + "clip_ratio": 0.00038732589746359736, + "epoch": 0.002223776922692519, + "grad_norm": 0.05116529390215874, + "kl": 0.05213000252842903, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 801 + }, + { + "clip_ratio": 0.0006221811127034016, + "epoch": 0.002226553173532335, + "grad_norm": 0.07559063285589218, + "kl": 0.04888819716870785, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 802 + }, + { + "clip_ratio": 0.0006752755725756288, + "epoch": 0.002229329424372151, + "grad_norm": 0.06661748886108398, + "kl": 0.04938088357448578, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 803 + }, + { + "clip_ratio": 0.0006201023061294109, + "epoch": 0.002232105675211967, + "grad_norm": 0.06247713044285774, + "kl": 0.04884421452879906, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 804 + }, + { + "clip_ratio": 0.00010399334132671356, + "completion_length": 228.52083587646484, + "epoch": 0.0022348819260517828, + "grad_norm": 0.10135756433010101, + "kl": 0.043619923293590546, + "learning_rate": 3e-06, + "loss": 0.0141, + "reward": 0.2875000014901161, + "reward_std": 0.28183095902204514, + "rewards/countdown_reward_func": 0.2875000014901161, + "step": 805, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003937007859349251, + "epoch": 0.0022376581768915986, + "grad_norm": 0.10308714210987091, + "kl": 0.0447169654071331, + "learning_rate": 3e-06, + "loss": 0.014, + "step": 806 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0022404344277314144, + "grad_norm": 0.12397963553667068, + "kl": 0.04130372405052185, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 807 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0022432106785712303, + "grad_norm": 0.0904117226600647, + "kl": 0.04326653108000755, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 808 + }, + { + "clip_ratio": 8.747375977691263e-05, + "epoch": 0.002245986929411046, + "grad_norm": 0.11636728793382645, + "kl": 0.04562844708561897, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 809 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002248763180250862, + "grad_norm": 0.10902263969182968, + "kl": 0.04578346759080887, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 810 + }, + { + "clip_ratio": 8.747375977691263e-05, + "epoch": 0.002251539431090678, + "grad_norm": 0.10831674933433533, + "kl": 0.04260227829217911, + "learning_rate": 3e-06, + "loss": 0.0132, + "step": 811 + }, + { + "clip_ratio": 0.0003889085492119193, + "epoch": 0.0022543156819304936, + "grad_norm": 0.11185412853956223, + "kl": 0.04456242546439171, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 812 + }, + { + "clip_ratio": 0.00047818864550208673, + "epoch": 0.0022570919327703095, + "grad_norm": 0.10440458357334137, + "kl": 0.04222998023033142, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 813 + }, + { + "clip_ratio": 0.00018712585733737797, + "epoch": 0.0022598681836101257, + "grad_norm": 0.0939682200551033, + "kl": 0.04461575858294964, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 814 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0022626444344499416, + "grad_norm": 0.09211880713701248, + "kl": 0.047047100961208344, + "learning_rate": 3e-06, + "loss": 0.0121, + "step": 815 + }, + { + "clip_ratio": 0.00036837265361100435, + "epoch": 0.0022654206852897574, + "grad_norm": 0.12892583012580872, + "kl": 0.04755326174199581, + "learning_rate": 3e-06, + "loss": 0.0134, + "step": 816 + }, + { + "clip_ratio": 0.00047789004747755826, + "completion_length": 220.1666717529297, + "epoch": 0.0022681969361295733, + "grad_norm": 0.18188029527664185, + "kl": 0.04710717685520649, + "learning_rate": 3e-06, + "loss": 0.0458, + "reward": 0.34166669845581055, + "reward_std": 0.27476726472377777, + "rewards/countdown_reward_func": 0.34166666865348816, + "step": 817, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00017529455362819135, + "epoch": 0.002270973186969389, + "grad_norm": 0.1454269289970398, + "kl": 0.05038350075483322, + "learning_rate": 3e-06, + "loss": 0.0453, + "step": 818 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002273749437809205, + "grad_norm": 0.13133502006530762, + "kl": 0.04751100763678551, + "learning_rate": 3e-06, + "loss": 0.045, + "step": 819 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002276525688649021, + "grad_norm": 0.1250462830066681, + "kl": 0.04578425548970699, + "learning_rate": 3e-06, + "loss": 0.0449, + "step": 820 + }, + { + "clip_ratio": 0.0001035625537042506, + "epoch": 0.0022793019394888366, + "grad_norm": 0.14770175516605377, + "kl": 0.05231819488108158, + "learning_rate": 3e-06, + "loss": 0.0442, + "step": 821 + }, + { + "clip_ratio": 0.0004608447488863021, + "epoch": 0.0022820781903286525, + "grad_norm": 0.14028239250183105, + "kl": 0.05149639584124088, + "learning_rate": 3e-06, + "loss": 0.0435, + "step": 822 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0022848544411684683, + "grad_norm": 0.17355775833129883, + "kl": 0.056207796558737755, + "learning_rate": 3e-06, + "loss": 0.0419, + "step": 823 + }, + { + "clip_ratio": 0.0001035625537042506, + "epoch": 0.002287630692008284, + "grad_norm": 0.14540952444076538, + "kl": 0.060877278447151184, + "learning_rate": 3e-06, + "loss": 0.042, + "step": 824 + }, + { + "clip_ratio": 0.00018522187019698322, + "epoch": 0.0022904069428481004, + "grad_norm": 0.1283130794763565, + "kl": 0.060043616220355034, + "learning_rate": 3e-06, + "loss": 0.042, + "step": 825 + }, + { + "clip_ratio": 9.104151831706986e-05, + "epoch": 0.0022931831936879163, + "grad_norm": 0.13025355339050293, + "kl": 0.06011705473065376, + "learning_rate": 3e-06, + "loss": 0.0415, + "step": 826 + }, + { + "clip_ratio": 0.0006889307842357084, + "epoch": 0.002295959444527732, + "grad_norm": 0.1317429095506668, + "kl": 0.07226631790399551, + "learning_rate": 3e-06, + "loss": 0.0392, + "step": 827 + }, + { + "clip_ratio": 0.0011769172851927578, + "epoch": 0.002298735695367548, + "grad_norm": 0.12668536603450775, + "kl": 0.07085844874382019, + "learning_rate": 3e-06, + "loss": 0.0392, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.25, + "epoch": 0.0023015119462073638, + "grad_norm": 0.0859948992729187, + "kl": 0.07980459555983543, + "learning_rate": 3e-06, + "loss": 0.0234, + "reward": 0.21250000596046448, + "reward_std": 0.2080453746020794, + "rewards/countdown_reward_func": 0.21250000596046448, + "step": 829, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 9.593246068106964e-05, + "epoch": 0.0023042881970471796, + "grad_norm": 0.08892592787742615, + "kl": 0.08402768895030022, + "learning_rate": 3e-06, + "loss": 0.0227, + "step": 830 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.0023070644478869955, + "grad_norm": 0.07939447462558746, + "kl": 0.08712485805153847, + "learning_rate": 3e-06, + "loss": 0.0224, + "step": 831 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.0023098406987268113, + "grad_norm": 0.09546811133623123, + "kl": 0.09581883251667023, + "learning_rate": 3e-06, + "loss": 0.0229, + "step": 832 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002312616949566627, + "grad_norm": 0.1163158193230629, + "kl": 0.10002047568559647, + "learning_rate": 3e-06, + "loss": 0.0228, + "step": 833 + }, + { + "clip_ratio": 0.00019186492136213928, + "epoch": 0.002315393200406443, + "grad_norm": 0.1022767499089241, + "kl": 0.10018961131572723, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 834 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.002318169451246259, + "grad_norm": 0.08190451562404633, + "kl": 0.11375463381409645, + "learning_rate": 3e-06, + "loss": 0.0222, + "step": 835 + }, + { + "clip_ratio": 0.0001830161054385826, + "epoch": 0.002320945702086075, + "grad_norm": 0.1015833392739296, + "kl": 0.11530355364084244, + "learning_rate": 3e-06, + "loss": 0.0219, + "step": 836 + }, + { + "clip_ratio": 0.00027506323385750875, + "epoch": 0.002323721952925891, + "grad_norm": 0.08156981319189072, + "kl": 0.11573269963264465, + "learning_rate": 3e-06, + "loss": 0.0224, + "step": 837 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.0023264982037657068, + "grad_norm": 0.08199756592512131, + "kl": 0.12695779651403427, + "learning_rate": 3e-06, + "loss": 0.0217, + "step": 838 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0023292744546055226, + "grad_norm": 0.1132536232471466, + "kl": 0.12991363927721977, + "learning_rate": 3e-06, + "loss": 0.0228, + "step": 839 + }, + { + "clip_ratio": 0.00036751566221937537, + "epoch": 0.0023320507054453384, + "grad_norm": 0.10609925538301468, + "kl": 0.1261560171842575, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.02083587646484, + "epoch": 0.0023348269562851543, + "grad_norm": 0.13725493848323822, + "kl": 0.12262994423508644, + "learning_rate": 3e-06, + "loss": 0.0071, + "reward": 0.24375002086162567, + "reward_std": 0.26503098011016846, + "rewards/countdown_reward_func": 0.24375002086162567, + "step": 841, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00233760320712497, + "grad_norm": 0.13868065178394318, + "kl": 0.13203585147857666, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 842 + }, + { + "clip_ratio": 9.03179170563817e-05, + "epoch": 0.002340379457964786, + "grad_norm": 0.13435383141040802, + "kl": 0.13662400841712952, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 843 + }, + { + "clip_ratio": 9.038322605192661e-05, + "epoch": 0.002343155708804602, + "grad_norm": 0.158269003033638, + "kl": 0.12493979930877686, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 844 + }, + { + "clip_ratio": 8.662508480483666e-05, + "epoch": 0.0023459319596444176, + "grad_norm": 0.13201403617858887, + "kl": 0.14266318082809448, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 845 + }, + { + "clip_ratio": 9.03179170563817e-05, + "epoch": 0.0023487082104842335, + "grad_norm": 0.11519859731197357, + "kl": 0.13292869180440903, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 846 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0023514844613240497, + "grad_norm": 0.1352914720773697, + "kl": 0.12666785717010498, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 847 + }, + { + "clip_ratio": 9.07111752894707e-05, + "epoch": 0.0023542607121638656, + "grad_norm": 0.13024933636188507, + "kl": 0.12938842177391052, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 848 + }, + { + "clip_ratio": 0.0003490790259093046, + "epoch": 0.0023570369630036814, + "grad_norm": 0.13579486310482025, + "kl": 0.13479531556367874, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 849 + }, + { + "clip_ratio": 0.0009073439359781332, + "epoch": 0.0023598132138434973, + "grad_norm": 0.15496942400932312, + "kl": 0.1218239888548851, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 850 + }, + { + "clip_ratio": 0.0004516682820394635, + "epoch": 0.002362589464683313, + "grad_norm": 0.13003304600715637, + "kl": 0.13300446420907974, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 851 + }, + { + "clip_ratio": 0.00022212152543943375, + "epoch": 0.002365365715523129, + "grad_norm": 0.11777352541685104, + "kl": 0.12342452257871628, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 852 + }, + { + "clip_ratio": 0.00018712575547397137, + "completion_length": 218.52083587646484, + "epoch": 0.0023681419663629448, + "grad_norm": 0.12623701989650726, + "kl": 0.12132728099822998, + "learning_rate": 3e-06, + "loss": -0.0039, + "reward": 0.16875000298023224, + "reward_std": 0.16144294291734695, + "rewards/countdown_reward_func": 0.16874999552965164, + "step": 853, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 9.238728671334684e-05, + "epoch": 0.0023709182172027606, + "grad_norm": 0.12842930853366852, + "kl": 0.11158213764429092, + "learning_rate": 3e-06, + "loss": -0.0044, + "step": 854 + }, + { + "clip_ratio": 0.0005675574648194015, + "epoch": 0.0023736944680425765, + "grad_norm": 0.16828866302967072, + "kl": 0.10891519114375114, + "learning_rate": 3e-06, + "loss": -0.0048, + "step": 855 + }, + { + "clip_ratio": 9.245562250725925e-05, + "epoch": 0.0023764707188823923, + "grad_norm": 0.15191693603992462, + "kl": 0.11035206541419029, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 856 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002379246969722208, + "grad_norm": 0.09676827490329742, + "kl": 0.10043661668896675, + "learning_rate": 3e-06, + "loss": -0.0061, + "step": 857 + }, + { + "clip_ratio": 0.00019535439787432551, + "epoch": 0.0023820232205620244, + "grad_norm": 0.13413938879966736, + "kl": 0.0950070358812809, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 858 + }, + { + "clip_ratio": 0.00039087486220523715, + "epoch": 0.0023847994714018403, + "grad_norm": 0.12939363718032837, + "kl": 0.09167426824569702, + "learning_rate": 3e-06, + "loss": -0.0072, + "step": 859 + }, + { + "clip_ratio": 0.0017128197359852493, + "epoch": 0.002387575722241656, + "grad_norm": 0.11775387078523636, + "kl": 0.08235512301325798, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 860 + }, + { + "clip_ratio": 0.0017219405272044241, + "epoch": 0.002390351973081472, + "grad_norm": 0.10675527900457382, + "kl": 0.07996315136551857, + "learning_rate": 3e-06, + "loss": -0.0083, + "step": 861 + }, + { + "clip_ratio": 0.0017481384566053748, + "epoch": 0.0023931282239212878, + "grad_norm": 0.14627479016780853, + "kl": 0.07891392335295677, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 862 + }, + { + "clip_ratio": 0.003889568499289453, + "epoch": 0.0023959044747611036, + "grad_norm": 0.09009343385696411, + "kl": 0.07284262031316757, + "learning_rate": 3e-06, + "loss": -0.0076, + "step": 863 + }, + { + "clip_ratio": 0.006949112517759204, + "epoch": 0.0023986807256009194, + "grad_norm": 0.11177929490804672, + "kl": 0.06860150396823883, + "learning_rate": 3e-06, + "loss": -0.0095, + "step": 864 + }, + { + "clip_ratio": 8.520791016053408e-05, + "completion_length": 221.2916717529297, + "epoch": 0.0024014569764407353, + "grad_norm": 0.0992504358291626, + "kl": 0.07187891751527786, + "learning_rate": 3e-06, + "loss": 0.024, + "reward": 0.24791669100522995, + "reward_std": 0.22883931919932365, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 865, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002404233227280551, + "grad_norm": 0.10073084384202957, + "kl": 0.06338691711425781, + "learning_rate": 3e-06, + "loss": 0.0226, + "step": 866 + }, + { + "clip_ratio": 9.811617201194167e-05, + "epoch": 0.002407009478120367, + "grad_norm": 0.10182993113994598, + "kl": 0.061187781393527985, + "learning_rate": 3e-06, + "loss": 0.0235, + "step": 867 + }, + { + "clip_ratio": 0.00030962208984419703, + "epoch": 0.002409785728960183, + "grad_norm": 0.1328115165233612, + "kl": 0.06472436338663101, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 868 + }, + { + "clip_ratio": 9.811617201194167e-05, + "epoch": 0.002412561979799999, + "grad_norm": 0.12169165909290314, + "kl": 0.06141612492501736, + "learning_rate": 3e-06, + "loss": 0.0244, + "step": 869 + }, + { + "clip_ratio": 0.00019770623475778848, + "epoch": 0.002415338230639815, + "grad_norm": 0.11152414232492447, + "kl": 0.06216576509177685, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 870 + }, + { + "clip_ratio": 8.520791016053408e-05, + "epoch": 0.0024181144814796308, + "grad_norm": 0.08779774606227875, + "kl": 0.06478393822908401, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 871 + }, + { + "clip_ratio": 0.00018243287922814488, + "epoch": 0.0024208907323194466, + "grad_norm": 0.10080928355455399, + "kl": 0.05965164303779602, + "learning_rate": 3e-06, + "loss": 0.0239, + "step": 872 + }, + { + "clip_ratio": 0.0002776125547825359, + "epoch": 0.0024236669831592624, + "grad_norm": 0.09574668854475021, + "kl": 0.05859038606286049, + "learning_rate": 3e-06, + "loss": 0.0238, + "step": 873 + }, + { + "clip_ratio": 0.00010575295891612768, + "epoch": 0.0024264432339990783, + "grad_norm": 0.1512366235256195, + "kl": 0.06465988978743553, + "learning_rate": 3e-06, + "loss": 0.0241, + "step": 874 + }, + { + "clip_ratio": 0.0002814402541844174, + "epoch": 0.002429219484838894, + "grad_norm": 0.11698296666145325, + "kl": 0.06264219433069229, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 875 + }, + { + "clip_ratio": 0.0004044586094096303, + "epoch": 0.00243199573567871, + "grad_norm": 0.1288166046142578, + "kl": 0.06408961862325668, + "learning_rate": 3e-06, + "loss": 0.0247, + "step": 876 + }, + { + "clip_ratio": 9.765625145519152e-05, + "completion_length": 228.87500762939453, + "epoch": 0.002434771986518526, + "grad_norm": 0.12193048000335693, + "kl": 0.06197246536612511, + "learning_rate": 3e-06, + "loss": -0.0093, + "reward": 0.3229166716337204, + "reward_std": 0.3930432200431824, + "rewards/countdown_reward_func": 0.3229166716337204, + "step": 877, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0024375482373583416, + "grad_norm": 0.10932080447673798, + "kl": 0.06703280657529831, + "learning_rate": 3e-06, + "loss": -0.0094, + "step": 878 + }, + { + "clip_ratio": 0.00034134124871343374, + "epoch": 0.0024403244881981575, + "grad_norm": 0.13418442010879517, + "kl": 0.06639807298779488, + "learning_rate": 3e-06, + "loss": -0.0095, + "step": 879 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0024431007390379737, + "grad_norm": 0.15062019228935242, + "kl": 0.06595133803784847, + "learning_rate": 3e-06, + "loss": -0.0089, + "step": 880 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0024458769898777896, + "grad_norm": 0.16575033962726593, + "kl": 0.0656869113445282, + "learning_rate": 3e-06, + "loss": -0.0098, + "step": 881 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.0024486532407176054, + "grad_norm": 0.1262388676404953, + "kl": 0.06567086651921272, + "learning_rate": 3e-06, + "loss": -0.0092, + "step": 882 + }, + { + "clip_ratio": 0.00017017313075484708, + "epoch": 0.0024514294915574213, + "grad_norm": 0.11113395541906357, + "kl": 0.06466436572372913, + "learning_rate": 3e-06, + "loss": -0.0093, + "step": 883 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002454205742397237, + "grad_norm": 0.10662338137626648, + "kl": 0.0688115581870079, + "learning_rate": 3e-06, + "loss": -0.0096, + "step": 884 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002456981993237053, + "grad_norm": 0.1345973163843155, + "kl": 0.06801817566156387, + "learning_rate": 3e-06, + "loss": -0.0096, + "step": 885 + }, + { + "clip_ratio": 9.720062371343374e-05, + "epoch": 0.0024597582440768688, + "grad_norm": 0.144433856010437, + "kl": 0.06568440422415733, + "learning_rate": 3e-06, + "loss": -0.0107, + "step": 886 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0024625344949166846, + "grad_norm": 0.16164414584636688, + "kl": 0.06407869979739189, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 887 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0024653107457565005, + "grad_norm": 0.11676425486803055, + "kl": 0.06534116342663765, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.02084350585938, + "epoch": 0.0024680869965963163, + "grad_norm": 0.1673632115125656, + "kl": 0.061935342848300934, + "learning_rate": 3e-06, + "loss": -0.0087, + "reward": 0.22291667014360428, + "reward_std": 0.26279305666685104, + "rewards/countdown_reward_func": 0.22291666269302368, + "step": 889, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00020538756507448852, + "epoch": 0.002470863247436132, + "grad_norm": 0.11077067255973816, + "kl": 0.06396340392529964, + "learning_rate": 3e-06, + "loss": -0.0089, + "step": 890 + }, + { + "clip_ratio": 0.00019290123600512743, + "epoch": 0.0024736394982759484, + "grad_norm": 0.15075094997882843, + "kl": 0.06460290402173996, + "learning_rate": 3e-06, + "loss": -0.0083, + "step": 891 + }, + { + "clip_ratio": 9.272996976505965e-05, + "epoch": 0.0024764157491157643, + "grad_norm": 0.10630524158477783, + "kl": 0.0635167695581913, + "learning_rate": 3e-06, + "loss": -0.0087, + "step": 892 + }, + { + "clip_ratio": 0.0002878382147173397, + "epoch": 0.00247919199995558, + "grad_norm": 0.11911377310752869, + "kl": 0.06044987216591835, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 893 + }, + { + "clip_ratio": 0.00030706560937687755, + "epoch": 0.002481968250795396, + "grad_norm": 0.11326951533555984, + "kl": 0.06032133474946022, + "learning_rate": 3e-06, + "loss": -0.0098, + "step": 894 + }, + { + "clip_ratio": 0.00010088780982187018, + "epoch": 0.0024847445016352118, + "grad_norm": 0.17850473523139954, + "kl": 0.05723920278251171, + "learning_rate": 3e-06, + "loss": -0.0116, + "step": 895 + }, + { + "clip_ratio": 0.00030706560937687755, + "epoch": 0.0024875207524750276, + "grad_norm": 0.11720696091651917, + "kl": 0.05883798375725746, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 896 + }, + { + "clip_ratio": 0.0002937890458269976, + "epoch": 0.0024902970033148434, + "grad_norm": 0.1409841924905777, + "kl": 0.0597931444644928, + "learning_rate": 3e-06, + "loss": -0.0107, + "step": 897 + }, + { + "clip_ratio": 0.00028926064987899736, + "epoch": 0.0024930732541546593, + "grad_norm": 0.1104314997792244, + "kl": 0.05723441019654274, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 898 + }, + { + "clip_ratio": 0.0003814154479186982, + "epoch": 0.002495849504994475, + "grad_norm": 0.1183939203619957, + "kl": 0.05589612200856209, + "learning_rate": 3e-06, + "loss": -0.0112, + "step": 899 + }, + { + "clip_ratio": 0.0010607101139612496, + "epoch": 0.002498625755834291, + "grad_norm": 0.12353586405515671, + "kl": 0.05687661096453667, + "learning_rate": 3e-06, + "loss": -0.0124, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.25, + "epoch": 0.002501402006674107, + "grad_norm": 0.09452740103006363, + "kl": 0.06010809168219566, + "learning_rate": 3e-06, + "loss": 0.0025, + "reward": 0.3020833358168602, + "reward_std": 0.2922677993774414, + "rewards/countdown_reward_func": 0.3020833358168602, + "step": 901, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 9.834775846684352e-05, + "epoch": 0.002504178257513923, + "grad_norm": 0.11706841737031937, + "kl": 0.06478729844093323, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 902 + }, + { + "clip_ratio": 8.680555765749887e-05, + "epoch": 0.002506954508353739, + "grad_norm": 0.09865949302911758, + "kl": 0.062279969453811646, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 903 + }, + { + "clip_ratio": 0.00018655490566743538, + "epoch": 0.0025097307591935548, + "grad_norm": 0.13041819632053375, + "kl": 0.05919046886265278, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 904 + }, + { + "clip_ratio": 9.834775846684352e-05, + "epoch": 0.0025125070100333706, + "grad_norm": 0.14748121798038483, + "kl": 0.059419430792331696, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 905 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0025152832608731864, + "grad_norm": 0.09640306979417801, + "kl": 0.05946239456534386, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 906 + }, + { + "clip_ratio": 0.00036380960227688774, + "epoch": 0.0025180595117130023, + "grad_norm": 0.08679016679525375, + "kl": 0.05758332274854183, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 907 + }, + { + "clip_ratio": 0.0004546546551864594, + "epoch": 0.002520835762552818, + "grad_norm": 0.10903292149305344, + "kl": 0.06176324933767319, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 908 + }, + { + "clip_ratio": 0.0001810974645195529, + "epoch": 0.002523612013392634, + "grad_norm": 0.10386759042739868, + "kl": 0.06048583798110485, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 909 + }, + { + "clip_ratio": 0.0005492813070304692, + "epoch": 0.00252638826423245, + "grad_norm": 0.10254993289709091, + "kl": 0.057429682463407516, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 910 + }, + { + "clip_ratio": 9.834775846684352e-05, + "epoch": 0.0025291645150722656, + "grad_norm": 0.14101967215538025, + "kl": 0.05812258459627628, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 911 + }, + { + "clip_ratio": 0.0004883037181571126, + "epoch": 0.0025319407659120815, + "grad_norm": 0.08682620525360107, + "kl": 0.059369875118136406, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 912 + }, + { + "clip_ratio": 0.0005338219925761223, + "completion_length": 228.77083587646484, + "epoch": 0.0025347170167518977, + "grad_norm": 0.11657971143722534, + "kl": 0.05467422492802143, + "learning_rate": 3e-06, + "loss": 0.0358, + "reward": 0.23125001043081284, + "reward_std": 0.2115694098174572, + "rewards/countdown_reward_func": 0.23125001043081284, + "step": 913, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0025374932675917136, + "grad_norm": 0.12091764807701111, + "kl": 0.05531150847673416, + "learning_rate": 3e-06, + "loss": 0.0353, + "step": 914 + }, + { + "clip_ratio": 0.0012001376890111715, + "epoch": 0.0025402695184315294, + "grad_norm": 0.13323377072811127, + "kl": 0.05716773122549057, + "learning_rate": 3e-06, + "loss": 0.0353, + "step": 915 + }, + { + "clip_ratio": 0.0001666462339926511, + "epoch": 0.0025430457692713453, + "grad_norm": 0.08699194341897964, + "kl": 0.058137066662311554, + "learning_rate": 3e-06, + "loss": 0.0353, + "step": 916 + }, + { + "clip_ratio": 0.0005200195519137196, + "epoch": 0.002545822020111161, + "grad_norm": 0.10792747139930725, + "kl": 0.06085832789540291, + "learning_rate": 3e-06, + "loss": 0.0362, + "step": 917 + }, + { + "clip_ratio": 8.60289073898457e-05, + "epoch": 0.002548598270950977, + "grad_norm": 0.08918231725692749, + "kl": 0.05700236186385155, + "learning_rate": 3e-06, + "loss": 0.0347, + "step": 918 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0025513745217907928, + "grad_norm": 0.11331060528755188, + "kl": 0.05938470549881458, + "learning_rate": 3e-06, + "loss": 0.035, + "step": 919 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0025541507726306086, + "grad_norm": 0.12028547376394272, + "kl": 0.060254018753767014, + "learning_rate": 3e-06, + "loss": 0.0339, + "step": 920 + }, + { + "clip_ratio": 0.0003332924679853022, + "epoch": 0.0025569270234704245, + "grad_norm": 0.12606388330459595, + "kl": 0.06682027131319046, + "learning_rate": 3e-06, + "loss": 0.0338, + "step": 921 + }, + { + "clip_ratio": 8.526603050995618e-05, + "epoch": 0.0025597032743102403, + "grad_norm": 0.08522091805934906, + "kl": 0.06515257433056831, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 922 + }, + { + "clip_ratio": 0.0005214190459810197, + "epoch": 0.0025624795251500566, + "grad_norm": 0.10933967679738998, + "kl": 0.06955453753471375, + "learning_rate": 3e-06, + "loss": 0.0342, + "step": 923 + }, + { + "clip_ratio": 8.60289073898457e-05, + "epoch": 0.0025652557759898724, + "grad_norm": 0.08516346663236618, + "kl": 0.06566111743450165, + "learning_rate": 3e-06, + "loss": 0.0332, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.70834350585938, + "epoch": 0.0025680320268296883, + "grad_norm": 0.051000598818063736, + "kl": 0.06130129098892212, + "learning_rate": 3e-06, + "loss": 0.008, + "reward": 0.19166667759418488, + "reward_std": 0.1091257855296135, + "rewards/countdown_reward_func": 0.19166666269302368, + "step": 925, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.002570808277669504, + "grad_norm": 0.06743042171001434, + "kl": 0.06677010655403137, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 926 + }, + { + "clip_ratio": 0.0002635217970237136, + "epoch": 0.00257358452850932, + "grad_norm": 0.05554405599832535, + "kl": 0.06589578464627266, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 927 + }, + { + "clip_ratio": 8.520791016053408e-05, + "epoch": 0.0025763607793491358, + "grad_norm": 0.06437760591506958, + "kl": 0.06517062336206436, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 928 + }, + { + "clip_ratio": 0.00024796833167783916, + "epoch": 0.0025791370301889516, + "grad_norm": 0.056508298963308334, + "kl": 0.0656033419072628, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 929 + }, + { + "clip_ratio": 0.0002620808663778007, + "epoch": 0.0025819132810287674, + "grad_norm": 0.05479772761464119, + "kl": 0.07185834646224976, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 930 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.0025846895318685833, + "grad_norm": 0.05194047465920448, + "kl": 0.06922575458884239, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 931 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002587465782708399, + "grad_norm": 0.06964042782783508, + "kl": 0.07410523667931557, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 932 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.002590242033548215, + "grad_norm": 0.04855266585946083, + "kl": 0.07128038257360458, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 933 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0025930182843880312, + "grad_norm": 0.07284444570541382, + "kl": 0.0707533061504364, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 934 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.002595794535227847, + "grad_norm": 0.06308530271053314, + "kl": 0.07013512030243874, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 935 + }, + { + "clip_ratio": 0.00040690103196538985, + "epoch": 0.002598570786067663, + "grad_norm": 0.05616437643766403, + "kl": 0.0736420564353466, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 936 + }, + { + "clip_ratio": 0.00017452477186452597, + "completion_length": 236.02084350585938, + "epoch": 0.0026013470369074788, + "grad_norm": 0.09799813479185104, + "kl": 0.08096648007631302, + "learning_rate": 3e-06, + "loss": -0.0021, + "reward": 0.24583333730697632, + "reward_std": 0.214500330388546, + "rewards/countdown_reward_func": 0.24583332985639572, + "step": 937, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00016611294995527714, + "epoch": 0.0026041232877472946, + "grad_norm": 0.19633734226226807, + "kl": 0.08069871366024017, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 938 + }, + { + "clip_ratio": 0.00033192152477568015, + "epoch": 0.0026068995385871104, + "grad_norm": 0.09509781002998352, + "kl": 0.07796993106603622, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 939 + }, + { + "clip_ratio": 9.865824540611356e-05, + "epoch": 0.0026096757894269263, + "grad_norm": 0.09252092242240906, + "kl": 0.08357620239257812, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 940 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.002612452040266742, + "grad_norm": 0.1239117830991745, + "kl": 0.0816221609711647, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 941 + }, + { + "clip_ratio": 0.00018341893155593425, + "epoch": 0.002615228291106558, + "grad_norm": 0.09028197079896927, + "kl": 0.0803113654255867, + "learning_rate": 3e-06, + "loss": -0.0015, + "step": 942 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.002618004541946374, + "grad_norm": 0.09007646143436432, + "kl": 0.07762451097369194, + "learning_rate": 3e-06, + "loss": -0.0025, + "step": 943 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0026207807927861896, + "grad_norm": 0.21453428268432617, + "kl": 0.07529747486114502, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 944 + }, + { + "clip_ratio": 0.0001691611105343327, + "epoch": 0.002623557043626006, + "grad_norm": 0.08801465481519699, + "kl": 0.07271482422947884, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 945 + }, + { + "clip_ratio": 0.00018643914518179372, + "epoch": 0.0026263332944658217, + "grad_norm": 0.09710396826267242, + "kl": 0.0768899917602539, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 946 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0026291095453056376, + "grad_norm": 0.1289989948272705, + "kl": 0.07622705027461052, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 947 + }, + { + "clip_ratio": 0.0005273459973977879, + "epoch": 0.0026318857961454534, + "grad_norm": 0.08834261447191238, + "kl": 0.07354174181818962, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 948 + }, + { + "clip_ratio": 8.890469325706363e-05, + "completion_length": 230.70834350585938, + "epoch": 0.0026346620469852693, + "grad_norm": 0.07417230308055878, + "kl": 0.06710755452513695, + "learning_rate": 3e-06, + "loss": -0.0056, + "reward": 0.16875001788139343, + "reward_std": 0.15433254092931747, + "rewards/countdown_reward_func": 0.16875001788139343, + "step": 949, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00036710136919282377, + "epoch": 0.002637438297825085, + "grad_norm": 0.10319241136312485, + "kl": 0.0669807717204094, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 950 + }, + { + "clip_ratio": 8.890469325706363e-05, + "epoch": 0.002640214548664901, + "grad_norm": 0.08117877691984177, + "kl": 0.06458613276481628, + "learning_rate": 3e-06, + "loss": -0.0062, + "step": 951 + }, + { + "clip_ratio": 0.0001982553512789309, + "epoch": 0.0026429907995047168, + "grad_norm": 0.08867120742797852, + "kl": 0.06354731135070324, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 952 + }, + { + "clip_ratio": 9.65996878221631e-05, + "epoch": 0.0026457670503445326, + "grad_norm": 0.07009001821279526, + "kl": 0.06276779621839523, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 953 + }, + { + "clip_ratio": 0.0001789265952538699, + "epoch": 0.0026485433011843485, + "grad_norm": 0.08066268265247345, + "kl": 0.061625886708498, + "learning_rate": 3e-06, + "loss": -0.0056, + "step": 954 + }, + { + "clip_ratio": 0.00026640027499524876, + "epoch": 0.0026513195520241643, + "grad_norm": 0.07269750535488129, + "kl": 0.058654628694057465, + "learning_rate": 3e-06, + "loss": -0.0063, + "step": 955 + }, + { + "clip_ratio": 0.0012421588180586696, + "epoch": 0.0026540958028639806, + "grad_norm": 0.08015119284391403, + "kl": 0.05745803192257881, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 956 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0026568720537037964, + "grad_norm": 0.07885020971298218, + "kl": 0.056764453649520874, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 957 + }, + { + "clip_ratio": 0.00027389177557779476, + "epoch": 0.0026596483045436123, + "grad_norm": 0.08463863283395767, + "kl": 0.056341877207159996, + "learning_rate": 3e-06, + "loss": -0.0064, + "step": 958 + }, + { + "clip_ratio": 0.001056087960023433, + "epoch": 0.002662424555383428, + "grad_norm": 0.07922295480966568, + "kl": 0.05454845167696476, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 959 + }, + { + "clip_ratio": 0.0007927687838673592, + "epoch": 0.002665200806223244, + "grad_norm": 0.0849594920873642, + "kl": 0.05426880158483982, + "learning_rate": 3e-06, + "loss": -0.006, + "step": 960 + }, + { + "clip_ratio": 0.00043250381713733077, + "completion_length": 235.6666717529297, + "epoch": 0.0026679770570630598, + "grad_norm": 0.09037365019321442, + "kl": 0.05271290987730026, + "learning_rate": 3e-06, + "loss": 0.0065, + "reward": 0.229166679084301, + "reward_std": 0.18291139230132103, + "rewards/countdown_reward_func": 0.229166679084301, + "step": 961, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0026707533079028756, + "grad_norm": 0.10632222890853882, + "kl": 0.05197305604815483, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 962 + }, + { + "clip_ratio": 0.0001750104856910184, + "epoch": 0.0026735295587426914, + "grad_norm": 0.06896176934242249, + "kl": 0.04834933951497078, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 963 + }, + { + "clip_ratio": 0.0002540506684454158, + "epoch": 0.0026763058095825073, + "grad_norm": 0.07026878744363785, + "kl": 0.051994968205690384, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 964 + }, + { + "clip_ratio": 8.821453957352787e-05, + "epoch": 0.002679082060422323, + "grad_norm": 0.06770803779363632, + "kl": 0.05004582740366459, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 965 + }, + { + "clip_ratio": 8.722958591533825e-05, + "epoch": 0.002681858311262139, + "grad_norm": 0.06422404199838638, + "kl": 0.04954391345381737, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 966 + }, + { + "clip_ratio": 8.821453957352787e-05, + "epoch": 0.0026846345621019552, + "grad_norm": 0.09136969596147537, + "kl": 0.04910329729318619, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 967 + }, + { + "clip_ratio": 0.00027080931613454595, + "epoch": 0.002687410812941771, + "grad_norm": 0.10625612735748291, + "kl": 0.047768834978342056, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 968 + }, + { + "clip_ratio": 0.0003528630913933739, + "epoch": 0.002690187063781587, + "grad_norm": 0.06184415891766548, + "kl": 0.04515623487532139, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 969 + }, + { + "clip_ratio": 0.00016860979667399079, + "epoch": 0.0026929633146214028, + "grad_norm": 0.06915763020515442, + "kl": 0.0480178352445364, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 970 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0026957395654612186, + "grad_norm": 0.06491398066282272, + "kl": 0.047293346375226974, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 971 + }, + { + "clip_ratio": 0.00016860979667399079, + "epoch": 0.0026985158163010344, + "grad_norm": 0.07768521457910538, + "kl": 0.04707910679280758, + "learning_rate": 3e-06, + "loss": 0.0061, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.56250762939453, + "epoch": 0.0027012920671408503, + "grad_norm": 0.10156456381082535, + "kl": 0.04707324132323265, + "learning_rate": 3e-06, + "loss": 0.0114, + "reward": 0.2666666805744171, + "reward_std": 0.2661244869232178, + "rewards/countdown_reward_func": 0.2666666731238365, + "step": 973, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00025393527175765485, + "epoch": 0.002704068317980666, + "grad_norm": 0.08211185038089752, + "kl": 0.044957129284739494, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 974 + }, + { + "clip_ratio": 9.391435014549643e-05, + "epoch": 0.002706844568820482, + "grad_norm": 0.080048106610775, + "kl": 0.047641387209296227, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 975 + }, + { + "clip_ratio": 0.00024569814559072256, + "epoch": 0.002709620819660298, + "grad_norm": 0.1037638708949089, + "kl": 0.04482552409172058, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 976 + }, + { + "clip_ratio": 0.00017228929937118664, + "epoch": 0.0027123970705001136, + "grad_norm": 0.08106601238250732, + "kl": 0.04870855435729027, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 977 + }, + { + "clip_ratio": 0.0002454323766869493, + "epoch": 0.00271517332133993, + "grad_norm": 0.09446461498737335, + "kl": 0.04764538258314133, + "learning_rate": 3e-06, + "loss": 0.0117, + "step": 978 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027179495721797457, + "grad_norm": 0.0833747461438179, + "kl": 0.04668613523244858, + "learning_rate": 3e-06, + "loss": 0.0109, + "step": 979 + }, + { + "clip_ratio": 9.09090886125341e-05, + "epoch": 0.0027207258230195616, + "grad_norm": 0.07825154811143875, + "kl": 0.045210424810647964, + "learning_rate": 3e-06, + "loss": 0.0113, + "step": 980 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0027235020738593774, + "grad_norm": 0.0840124562382698, + "kl": 0.04799755476415157, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 981 + }, + { + "clip_ratio": 0.0004264606104698032, + "epoch": 0.0027262783246991933, + "grad_norm": 0.11435186117887497, + "kl": 0.04499867558479309, + "learning_rate": 3e-06, + "loss": 0.0109, + "step": 982 + }, + { + "clip_ratio": 0.00037225715641397983, + "epoch": 0.002729054575539009, + "grad_norm": 0.08535240590572357, + "kl": 0.049062151461839676, + "learning_rate": 3e-06, + "loss": 0.011, + "step": 983 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.002731830826378825, + "grad_norm": 0.09356574714183807, + "kl": 0.04856777377426624, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.45833587646484, + "epoch": 0.0027346070772186408, + "grad_norm": 0.11254774034023285, + "kl": 0.048314955085515976, + "learning_rate": 3e-06, + "loss": 0.0166, + "reward": 0.22291667759418488, + "reward_std": 0.26681750267744064, + "rewards/countdown_reward_func": 0.22291667759418488, + "step": 985, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027373833280584566, + "grad_norm": 0.128098264336586, + "kl": 0.0460913497954607, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 986 + }, + { + "clip_ratio": 0.00010339123400626704, + "epoch": 0.0027401595788982725, + "grad_norm": 0.09424342215061188, + "kl": 0.0468437634408474, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 987 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027429358297380883, + "grad_norm": 0.11079585552215576, + "kl": 0.04501481167972088, + "learning_rate": 3e-06, + "loss": 0.0165, + "step": 988 + }, + { + "clip_ratio": 0.0026023527534562163, + "epoch": 0.0027457120805779046, + "grad_norm": 0.08553663641214371, + "kl": 0.04914248362183571, + "learning_rate": 3e-06, + "loss": 0.0171, + "step": 989 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027484883314177204, + "grad_norm": 0.0737449899315834, + "kl": 0.04859176091849804, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 990 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027512645822575362, + "grad_norm": 0.1108751967549324, + "kl": 0.05102384462952614, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 991 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.002754040833097352, + "grad_norm": 0.10651170462369919, + "kl": 0.04962940514087677, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 992 + }, + { + "clip_ratio": 0.00029216046823421493, + "epoch": 0.002756817083937168, + "grad_norm": 0.08815775066614151, + "kl": 0.050901319831609726, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 993 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0027595933347769838, + "grad_norm": 0.1047065481543541, + "kl": 0.04865125194191933, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 994 + }, + { + "clip_ratio": 0.0027102040985482745, + "epoch": 0.0027623695856167996, + "grad_norm": 0.08856725692749023, + "kl": 0.053303858265280724, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 995 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0027651458364566154, + "grad_norm": 0.07977961748838425, + "kl": 0.0532270185649395, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 996 + }, + { + "clip_ratio": 0.00010254306835122406, + "completion_length": 225.8541717529297, + "epoch": 0.0027679220872964313, + "grad_norm": 0.11937547475099564, + "kl": 0.0554725993424654, + "learning_rate": 3e-06, + "loss": 0.0133, + "reward": 0.30000001937150955, + "reward_std": 0.27453966438770294, + "rewards/countdown_reward_func": 0.30000001937150955, + "step": 997, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002770698338136247, + "grad_norm": 0.45567768812179565, + "kl": 0.05745168775320053, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 998 + }, + { + "clip_ratio": 0.0001995211496250704, + "epoch": 0.002773474588976063, + "grad_norm": 0.1074514389038086, + "kl": 0.060051169246435165, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 999 + }, + { + "epoch": 0.0027762508398158792, + "grad_norm": 0.12824182212352753, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 1000 + }, + { + "clip_ratio": 0.00012577813322423026, + "epoch": 0.002779027090655695, + "grad_norm": 0.09932482987642288, + "kl": 0.060510776937007904, + "learning_rate": 3e-06, + "loss": 0.0125, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002781803341495511, + "grad_norm": 0.09447323530912399, + "kl": 0.06021983176469803, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1002 + }, + { + "clip_ratio": 0.0002693641581572592, + "epoch": 0.0027845795923353268, + "grad_norm": 0.11810337752103806, + "kl": 0.06065908633172512, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1003 + }, + { + "clip_ratio": 9.97605748125352e-05, + "epoch": 0.0027873558431751426, + "grad_norm": 0.15187525749206543, + "kl": 0.06130600720643997, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 1004 + }, + { + "clip_ratio": 0.00019688567408593372, + "epoch": 0.0027901320940149584, + "grad_norm": 0.09934880584478378, + "kl": 0.06536610797047615, + "learning_rate": 3e-06, + "loss": 0.0109, + "step": 1005 + }, + { + "clip_ratio": 0.0002693641581572592, + "epoch": 0.0027929083448547743, + "grad_norm": 0.1290806382894516, + "kl": 0.06462856568396091, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1006 + }, + { + "clip_ratio": 0.00017373176524415612, + "epoch": 0.00279568459569459, + "grad_norm": 0.09871116280555725, + "kl": 0.06735832616686821, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 1007 + }, + { + "clip_ratio": 9.97605748125352e-05, + "epoch": 0.002798460846534406, + "grad_norm": 0.09537402540445328, + "kl": 0.06406117230653763, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1008 + }, + { + "clip_ratio": 0.00011488970631035045, + "completion_length": 231.37500762939453, + "epoch": 0.002801237097374222, + "grad_norm": 0.3545832335948944, + "kl": 0.05917428806424141, + "learning_rate": 3e-06, + "loss": 0.0063, + "reward": 0.208333358168602, + "reward_std": 0.18449045717716217, + "rewards/countdown_reward_func": 0.2083333507180214, + "step": 1009, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0004897922772215679, + "epoch": 0.0028040133482140376, + "grad_norm": 0.0879272073507309, + "kl": 0.06689955294132233, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 1010 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.002806789599053854, + "grad_norm": 0.06225398927927017, + "kl": 0.06652659550309181, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1011 + }, + { + "clip_ratio": 0.0002790517173707485, + "epoch": 0.0028095658498936697, + "grad_norm": 0.0772099494934082, + "kl": 0.07098489999771118, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1012 + }, + { + "clip_ratio": 0.00016578249051235616, + "epoch": 0.0028123421007334856, + "grad_norm": 0.09716866910457611, + "kl": 0.06839455664157867, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0028151183515733014, + "grad_norm": 0.08656153082847595, + "kl": 0.06978728249669075, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 1014 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0028178946024131173, + "grad_norm": 0.07657860964536667, + "kl": 0.0637618862092495, + "learning_rate": 3e-06, + "loss": 0.0061, + "step": 1015 + }, + { + "clip_ratio": 0.00025825316697591916, + "epoch": 0.002820670853252933, + "grad_norm": 0.09020351618528366, + "kl": 0.0716327540576458, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 1016 + }, + { + "clip_ratio": 0.0004928143753204495, + "epoch": 0.002823447104092749, + "grad_norm": 0.07356920838356018, + "kl": 0.06976194307208061, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1017 + }, + { + "clip_ratio": 0.00035517166543286294, + "epoch": 0.0028262233549325648, + "grad_norm": 0.08907874673604965, + "kl": 0.07387241721153259, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 1018 + }, + { + "clip_ratio": 0.00017687295621726662, + "epoch": 0.0028289996057723806, + "grad_norm": 0.08614590018987656, + "kl": 0.07145676389336586, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1019 + }, + { + "clip_ratio": 0.0001792114635463804, + "epoch": 0.0028317758566121965, + "grad_norm": 0.10110598802566528, + "kl": 0.07314455509185791, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 1020 + }, + { + "clip_ratio": 0.00019379844889044762, + "completion_length": 232.77084350585938, + "epoch": 0.0028345521074520123, + "grad_norm": 0.07683458924293518, + "kl": 0.07031066715717316, + "learning_rate": 3e-06, + "loss": 0.0068, + "reward": 0.2291666716337204, + "reward_std": 0.21667250245809555, + "rewards/countdown_reward_func": 0.2291666641831398, + "step": 1021, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0002766043471638113, + "epoch": 0.0028373283582918286, + "grad_norm": 0.08281517773866653, + "kl": 0.07224087789654732, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 1022 + }, + { + "clip_ratio": 0.0003614198212744668, + "epoch": 0.0028401046091316444, + "grad_norm": 0.13180024921894073, + "kl": 0.07020625099539757, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0028428808599714602, + "grad_norm": 0.09534606337547302, + "kl": 0.07347037643194199, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1024 + }, + { + "clip_ratio": 0.0003760801919270307, + "epoch": 0.002845657110811276, + "grad_norm": 0.11282859742641449, + "kl": 0.07506215572357178, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 1025 + }, + { + "clip_ratio": 9.124087227974087e-05, + "epoch": 0.002848433361651092, + "grad_norm": 0.08544313162565231, + "kl": 0.07066556811332703, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0028512096124909078, + "grad_norm": 0.07569853216409683, + "kl": 0.07081609964370728, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 1027 + }, + { + "clip_ratio": 0.00036229201941750944, + "epoch": 0.0028539858633307236, + "grad_norm": 0.09049994498491287, + "kl": 0.07069279998540878, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1028 + }, + { + "clip_ratio": 0.0009208531701005995, + "epoch": 0.0028567621141705394, + "grad_norm": 0.15014901757240295, + "kl": 0.06979522109031677, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1029 + }, + { + "clip_ratio": 0.00026231101946905255, + "epoch": 0.0028595383650103553, + "grad_norm": 0.10039456933736801, + "kl": 0.0724063590168953, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1030 + }, + { + "clip_ratio": 0.0007212661657831632, + "epoch": 0.002862314615850171, + "grad_norm": 0.10597745329141617, + "kl": 0.07448813319206238, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 1031 + }, + { + "clip_ratio": 0.00026949287712341174, + "epoch": 0.002865090866689987, + "grad_norm": 0.08808151632547379, + "kl": 0.06819097325205803, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.4791717529297, + "epoch": 0.0028678671175298032, + "grad_norm": 0.12043464183807373, + "kl": 0.06623350828886032, + "learning_rate": 3e-06, + "loss": -0.0042, + "reward": 0.35625001788139343, + "reward_std": 0.29714028537273407, + "rewards/countdown_reward_func": 0.35625001788139343, + "step": 1033, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.448223863728344e-05, + "epoch": 0.002870643368369619, + "grad_norm": 0.10195807367563248, + "kl": 0.07157684862613678, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002873419619209435, + "grad_norm": 0.09431300312280655, + "kl": 0.06604013964533806, + "learning_rate": 3e-06, + "loss": -0.0042, + "step": 1035 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0028761958700492508, + "grad_norm": 0.10502111911773682, + "kl": 0.06586654111742973, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0028789721208890666, + "grad_norm": 0.17106521129608154, + "kl": 0.06683254614472389, + "learning_rate": 3e-06, + "loss": -0.0046, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0028817483717288824, + "grad_norm": 0.1259961724281311, + "kl": 0.06468446552753448, + "learning_rate": 3e-06, + "loss": -0.0038, + "step": 1038 + }, + { + "clip_ratio": 9.448223863728344e-05, + "epoch": 0.0028845246225686983, + "grad_norm": 0.12149322032928467, + "kl": 0.06319839507341385, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 1039 + }, + { + "clip_ratio": 9.448223863728344e-05, + "epoch": 0.002887300873408514, + "grad_norm": 0.09481094032526016, + "kl": 0.0666964054107666, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 1040 + }, + { + "clip_ratio": 8.491847984259948e-05, + "epoch": 0.00289007712424833, + "grad_norm": 0.09099525958299637, + "kl": 0.061311766505241394, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 1041 + }, + { + "clip_ratio": 0.00018680129142012447, + "epoch": 0.002892853375088146, + "grad_norm": 0.11290211975574493, + "kl": 0.061817897483706474, + "learning_rate": 3e-06, + "loss": -0.0053, + "step": 1042 + }, + { + "clip_ratio": 0.00027388295711716637, + "epoch": 0.0028956296259279616, + "grad_norm": 0.13663621246814728, + "kl": 0.06613102555274963, + "learning_rate": 3e-06, + "loss": -0.0062, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "epoch": 0.002898405876767778, + "grad_norm": 0.10643167048692703, + "kl": 0.06023329682648182, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.56250762939453, + "epoch": 0.0029011821276075937, + "grad_norm": 0.07880929112434387, + "kl": 0.06694615818560123, + "learning_rate": 3e-06, + "loss": 0.0099, + "reward": 0.21250002086162567, + "reward_std": 0.16564146801829338, + "rewards/countdown_reward_func": 0.21250002086162567, + "step": 1045, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00028686892619589344, + "epoch": 0.0029039583784474096, + "grad_norm": 0.08016741275787354, + "kl": 0.06679723039269447, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1046 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0029067346292872254, + "grad_norm": 0.07099456340074539, + "kl": 0.06291750818490982, + "learning_rate": 3e-06, + "loss": 0.0097, + "step": 1047 + }, + { + "clip_ratio": 0.00010056315659312531, + "epoch": 0.0029095108801270413, + "grad_norm": 0.07919389754533768, + "kl": 0.06845093332231045, + "learning_rate": 3e-06, + "loss": 0.0109, + "step": 1048 + }, + { + "clip_ratio": 0.00010080645006382838, + "epoch": 0.002912287130966857, + "grad_norm": 0.07597494125366211, + "kl": 0.06439798139035702, + "learning_rate": 3e-06, + "loss": 0.0106, + "step": 1049 + }, + { + "clip_ratio": 0.00017322444182354957, + "epoch": 0.002915063381806673, + "grad_norm": 0.09772368520498276, + "kl": 0.0649341493844986, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1050 + }, + { + "clip_ratio": 0.001496979035437107, + "epoch": 0.0029178396326464888, + "grad_norm": 0.08392099291086197, + "kl": 0.06803803145885468, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 1051 + }, + { + "clip_ratio": 0.000669743909384124, + "epoch": 0.0029206158834863046, + "grad_norm": 0.08624348044395447, + "kl": 0.06706851720809937, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1052 + }, + { + "clip_ratio": 0.001187364658107981, + "epoch": 0.0029233921343261205, + "grad_norm": 0.07946142554283142, + "kl": 0.06465382128953934, + "learning_rate": 3e-06, + "loss": 0.0091, + "step": 1053 + }, + { + "clip_ratio": 0.0002712814530241303, + "epoch": 0.0029261683851659363, + "grad_norm": 0.1127048060297966, + "kl": 0.07115714997053146, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 1054 + }, + { + "clip_ratio": 0.00016687953029759228, + "epoch": 0.0029289446360057526, + "grad_norm": 0.07184568047523499, + "kl": 0.06631535664200783, + "learning_rate": 3e-06, + "loss": 0.0091, + "step": 1055 + }, + { + "clip_ratio": 0.00020161290012765676, + "epoch": 0.0029317208868455684, + "grad_norm": 0.107190802693367, + "kl": 0.06775633245706558, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 1056 + }, + { + "clip_ratio": 0.00026758435706142336, + "completion_length": 230.25000762939453, + "epoch": 0.0029344971376853842, + "grad_norm": 0.08019760996103287, + "kl": 0.07465282827615738, + "learning_rate": 3e-06, + "loss": 0.0048, + "reward": 0.22708334773778915, + "reward_std": 0.17232364416122437, + "rewards/countdown_reward_func": 0.22708334773778915, + "step": 1057, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.294625149574131e-05, + "epoch": 0.0029372733885252, + "grad_norm": 0.08832976967096329, + "kl": 0.07460768148303032, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1058 + }, + { + "clip_ratio": 0.00026728439843282104, + "epoch": 0.002940049639365016, + "grad_norm": 0.07996435463428497, + "kl": 0.07490583881735802, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0029428258902048318, + "grad_norm": 0.06566757708787918, + "kl": 0.07495003193616867, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1060 + }, + { + "clip_ratio": 0.0001822225167416036, + "epoch": 0.0029456021410446476, + "grad_norm": 0.07333634048700333, + "kl": 0.07876036688685417, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1061 + }, + { + "clip_ratio": 9.72762645687908e-05, + "epoch": 0.0029483783918844634, + "grad_norm": 0.08647799491882324, + "kl": 0.0771314725279808, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 1062 + }, + { + "clip_ratio": 8.928571332944557e-05, + "epoch": 0.0029511546427242793, + "grad_norm": 0.07851536571979523, + "kl": 0.07513091340661049, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1063 + }, + { + "clip_ratio": 0.0002651687682373449, + "epoch": 0.002953930893564095, + "grad_norm": 0.12982326745986938, + "kl": 0.07557458430528641, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1064 + }, + { + "clip_ratio": 0.00026728439843282104, + "epoch": 0.0029567071444039114, + "grad_norm": 0.07488778233528137, + "kl": 0.07366466149687767, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1065 + }, + { + "clip_ratio": 0.00026775128208100796, + "epoch": 0.0029594833952437272, + "grad_norm": 0.07061980664730072, + "kl": 0.0725603848695755, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1066 + }, + { + "clip_ratio": 0.00024727272102609277, + "epoch": 0.002962259646083543, + "grad_norm": 0.07645444571971893, + "kl": 0.07657236978411674, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1067 + }, + { + "clip_ratio": 0.0003571428533177823, + "epoch": 0.002965035896923359, + "grad_norm": 0.07127691805362701, + "kl": 0.07353055477142334, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.81250762939453, + "epoch": 0.0029678121477631748, + "grad_norm": 0.0665845200419426, + "kl": 0.06168382987380028, + "learning_rate": 3e-06, + "loss": 0.0032, + "reward": 0.17500000447034836, + "reward_std": 0.14995060861110687, + "rewards/countdown_reward_func": 0.17500000074505806, + "step": 1069, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00035589729668572545, + "epoch": 0.0029705883986029906, + "grad_norm": 0.08601254224777222, + "kl": 0.05869180150330067, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0029733646494428064, + "grad_norm": 0.08124065399169922, + "kl": 0.05769502557814121, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1071 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0029761409002826223, + "grad_norm": 0.062196120619773865, + "kl": 0.0583453718572855, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1072 + }, + { + "clip_ratio": 0.0002649337984621525, + "epoch": 0.002978917151122438, + "grad_norm": 0.05779939889907837, + "kl": 0.056453775614500046, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1073 + }, + { + "clip_ratio": 0.00030795554630458355, + "epoch": 0.002981693401962254, + "grad_norm": 0.05982575565576553, + "kl": 0.055925050750374794, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 1074 + }, + { + "clip_ratio": 0.00030165042699081823, + "epoch": 0.00298446965280207, + "grad_norm": 0.07068102061748505, + "kl": 0.057197773829102516, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1075 + }, + { + "clip_ratio": 0.0004265254028723575, + "epoch": 0.002987245903641886, + "grad_norm": 0.09287840127944946, + "kl": 0.05506822466850281, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1076 + }, + { + "clip_ratio": 0.0004502255469560623, + "epoch": 0.002990022154481702, + "grad_norm": 0.07072468101978302, + "kl": 0.053443435579538345, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1077 + }, + { + "clip_ratio": 0.00017390426364727318, + "epoch": 0.0029927984053215177, + "grad_norm": 0.07123330235481262, + "kl": 0.05358114279806614, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 1078 + }, + { + "clip_ratio": 0.0006227242993190885, + "epoch": 0.0029955746561613336, + "grad_norm": 0.063522107899189, + "kl": 0.052388763055205345, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1079 + }, + { + "clip_ratio": 0.0002694505383260548, + "epoch": 0.0029983509070011494, + "grad_norm": 0.058882538229227066, + "kl": 0.05159541964530945, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1080 + }, + { + "clip_ratio": 0.00018450184143148363, + "completion_length": 226.64584350585938, + "epoch": 0.0030011271578409653, + "grad_norm": 0.1080564484000206, + "kl": 0.05418804846704006, + "learning_rate": 3e-06, + "loss": -0.0019, + "reward": 0.28333334624767303, + "reward_std": 0.3219813033938408, + "rewards/countdown_reward_func": 0.28333333879709244, + "step": 1081, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0006435593895730563, + "epoch": 0.003003903408680781, + "grad_norm": 0.12120220065116882, + "kl": 0.05108523927628994, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1082 + }, + { + "clip_ratio": 0.0006201210926519707, + "epoch": 0.003006679659520597, + "grad_norm": 0.10109229385852814, + "kl": 0.052261438220739365, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 1083 + }, + { + "clip_ratio": 0.00017445541743654758, + "epoch": 0.0030094559103604128, + "grad_norm": 0.09882961213588715, + "kl": 0.05041038617491722, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1084 + }, + { + "clip_ratio": 0.00020145705639151856, + "epoch": 0.0030122321612002286, + "grad_norm": 0.10909520089626312, + "kl": 0.0491649005562067, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0030150084120400445, + "grad_norm": 0.1672067940235138, + "kl": 0.049838531762361526, + "learning_rate": 3e-06, + "loss": -0.0029, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0030177846628798607, + "grad_norm": 0.12344881147146225, + "kl": 0.05135510489344597, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 1087 + }, + { + "clip_ratio": 0.001279033807804808, + "epoch": 0.0030205609137196766, + "grad_norm": 0.11550983041524887, + "kl": 0.04587686434388161, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1088 + }, + { + "clip_ratio": 0.0004374117561383173, + "epoch": 0.0030233371645594924, + "grad_norm": 0.10007118433713913, + "kl": 0.048211125656962395, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1089 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0030261134153993082, + "grad_norm": 0.10254620760679245, + "kl": 0.04644571617245674, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1090 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.003028889666239124, + "grad_norm": 0.11051363497972488, + "kl": 0.044875843450427055, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 1091 + }, + { + "clip_ratio": 0.0014370106218848377, + "epoch": 0.00303166591707894, + "grad_norm": 0.15505895018577576, + "kl": 0.0460149310529232, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1092 + }, + { + "clip_ratio": 8.698677993379533e-05, + "completion_length": 237.5416717529297, + "epoch": 0.0030344421679187558, + "grad_norm": 0.10442124307155609, + "kl": 0.04337725602090359, + "learning_rate": 3e-06, + "loss": 0.0063, + "reward": 0.2854166701436043, + "reward_std": 0.31687821447849274, + "rewards/countdown_reward_func": 0.2854166701436043, + "step": 1093, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0001680653658695519, + "epoch": 0.0030372184187585716, + "grad_norm": 0.1659352034330368, + "kl": 0.04275266453623772, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1094 + }, + { + "clip_ratio": 0.00017313426360487938, + "epoch": 0.0030399946695983874, + "grad_norm": 0.13962288200855255, + "kl": 0.04703891836106777, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1095 + }, + { + "clip_ratio": 0.00017009561270242557, + "epoch": 0.0030427709204382033, + "grad_norm": 0.13137014210224152, + "kl": 0.044395437464118004, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1096 + }, + { + "clip_ratio": 0.0002661462058313191, + "epoch": 0.003045547171278019, + "grad_norm": 0.10747912526130676, + "kl": 0.04033567197620869, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1097 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0030483234221178354, + "grad_norm": 0.10946369171142578, + "kl": 0.042647797614336014, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1098 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.0030510996729576512, + "grad_norm": 0.10199002921581268, + "kl": 0.04471134953200817, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 1099 + }, + { + "clip_ratio": 0.0002724502555793151, + "epoch": 0.003053875923797467, + "grad_norm": 0.10533545166254044, + "kl": 0.044932011514902115, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1100 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003056652174637283, + "grad_norm": 0.14148595929145813, + "kl": 0.047867994755506516, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1101 + }, + { + "clip_ratio": 0.00046344404108822346, + "epoch": 0.0030594284254770988, + "grad_norm": 0.12489910423755646, + "kl": 0.047186482697725296, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1102 + }, + { + "clip_ratio": 0.0002588110146461986, + "epoch": 0.0030622046763169146, + "grad_norm": 0.13311858475208282, + "kl": 0.04366219788789749, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1103 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.0030649809271567304, + "grad_norm": 0.09973665326833725, + "kl": 0.045309677720069885, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 1104 + }, + { + "clip_ratio": 0.0007995735504664481, + "completion_length": 233.06250762939453, + "epoch": 0.0030677571779965463, + "grad_norm": 0.12320923805236816, + "kl": 0.04813423752784729, + "learning_rate": 3e-06, + "loss": 0.0091, + "reward": 0.24791669100522995, + "reward_std": 0.22883931919932365, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 1105, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.884150884114206e-05, + "epoch": 0.003070533428836362, + "grad_norm": 0.09392733871936798, + "kl": 0.04717006906867027, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1106 + }, + { + "clip_ratio": 0.00026870113651966676, + "epoch": 0.003073309679676178, + "grad_norm": 0.10014389455318451, + "kl": 0.05198243260383606, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1107 + }, + { + "clip_ratio": 8.934953802963719e-05, + "epoch": 0.003076085930515994, + "grad_norm": 0.06945673376321793, + "kl": 0.048388589173555374, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 1108 + }, + { + "clip_ratio": 8.350033749593422e-05, + "epoch": 0.00307886218135581, + "grad_norm": 0.07839835435152054, + "kl": 0.05126038379967213, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003081638432195626, + "grad_norm": 0.07928033918142319, + "kl": 0.05163071118295193, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 1110 + }, + { + "clip_ratio": 0.00010382059554103762, + "epoch": 0.0030844146830354417, + "grad_norm": 0.09869455546140671, + "kl": 0.05214657075703144, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 1111 + }, + { + "clip_ratio": 0.00017819104687077925, + "epoch": 0.0030871909338752576, + "grad_norm": 0.08661158382892609, + "kl": 0.05166606977581978, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1112 + }, + { + "clip_ratio": 0.00043943087075604126, + "epoch": 0.0030899671847150734, + "grad_norm": 0.08715176582336426, + "kl": 0.05564386770129204, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 1113 + }, + { + "clip_ratio": 0.0001715863836579956, + "epoch": 0.0030927434355548893, + "grad_norm": 0.06843938678503036, + "kl": 0.0525389164686203, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1114 + }, + { + "clip_ratio": 0.00018732093303697184, + "epoch": 0.003095519686394705, + "grad_norm": 0.08035042136907578, + "kl": 0.05591611564159393, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 1115 + }, + { + "clip_ratio": 8.934953802963719e-05, + "epoch": 0.003098295937234521, + "grad_norm": 0.0761677697300911, + "kl": 0.056424250826239586, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 1116 + }, + { + "clip_ratio": 0.00025893703423207626, + "completion_length": 221.83333587646484, + "epoch": 0.0031010721880743368, + "grad_norm": 0.08327441662549973, + "kl": 0.05536172166466713, + "learning_rate": 3e-06, + "loss": -0.0039, + "reward": 0.2645833492279053, + "reward_std": 0.21911749243736267, + "rewards/countdown_reward_func": 0.2645833343267441, + "step": 1117, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0031038484389141526, + "grad_norm": 0.100023552775383, + "kl": 0.05755512788891792, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1118 + }, + { + "clip_ratio": 0.00018813963106367737, + "epoch": 0.0031066246897539685, + "grad_norm": 0.139785036444664, + "kl": 0.05882943049073219, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1119 + }, + { + "clip_ratio": 0.0001152073746197857, + "epoch": 0.0031094009405937847, + "grad_norm": 0.09004988521337509, + "kl": 0.055969417095184326, + "learning_rate": 3e-06, + "loss": -0.0038, + "step": 1120 + }, + { + "clip_ratio": 9.191176650347188e-05, + "epoch": 0.0031121771914336006, + "grad_norm": 0.06995224207639694, + "kl": 0.05503300577402115, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 1121 + }, + { + "clip_ratio": 0.00032552084303461015, + "epoch": 0.0031149534422734164, + "grad_norm": 0.060606665909290314, + "kl": 0.05465872026979923, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 1122 + }, + { + "clip_ratio": 0.00017755682347342372, + "epoch": 0.0031177296931132322, + "grad_norm": 0.07822634279727936, + "kl": 0.056107934564352036, + "learning_rate": 3e-06, + "loss": -0.0044, + "step": 1123 + }, + { + "clip_ratio": 0.0001152073746197857, + "epoch": 0.003120505943953048, + "grad_norm": 0.08555705845355988, + "kl": 0.05655817873775959, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1124 + }, + { + "clip_ratio": 0.00019658758537843823, + "epoch": 0.003123282194792864, + "grad_norm": 0.13074277341365814, + "kl": 0.05860498920083046, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 1125 + }, + { + "clip_ratio": 0.00038114040944492444, + "epoch": 0.0031260584456326798, + "grad_norm": 0.0899885818362236, + "kl": 0.054890843108296394, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 1126 + }, + { + "clip_ratio": 0.0006433823728002608, + "epoch": 0.0031288346964724956, + "grad_norm": 0.06713841110467911, + "kl": 0.05289384722709656, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 1127 + }, + { + "clip_ratio": 0.00027796779613709077, + "epoch": 0.0031316109473123114, + "grad_norm": 0.06459382176399231, + "kl": 0.05130323953926563, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1128 + }, + { + "clip_ratio": 0.0010648140450939536, + "completion_length": 236.02083587646484, + "epoch": 0.0031343871981521273, + "grad_norm": 0.13045701384544373, + "kl": 0.052938977256417274, + "learning_rate": 3e-06, + "loss": 0.0017, + "reward": 0.2500000149011612, + "reward_std": 0.29990123212337494, + "rewards/countdown_reward_func": 0.2500000149011612, + "step": 1129, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00026675376284401864, + "epoch": 0.003137163448991943, + "grad_norm": 0.09997519105672836, + "kl": 0.05317997932434082, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0031399396998317594, + "grad_norm": 0.08752911537885666, + "kl": 0.052106352522969246, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1131 + }, + { + "clip_ratio": 0.00019378491560928524, + "epoch": 0.0031427159506715752, + "grad_norm": 0.11155866086483002, + "kl": 0.052127305418252945, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1132 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003145492201511391, + "grad_norm": 0.0927848219871521, + "kl": 0.053360870108008385, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1133 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.003148268452351207, + "grad_norm": 0.09402063488960266, + "kl": 0.053240761160850525, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1134 + }, + { + "clip_ratio": 0.0004069010537932627, + "epoch": 0.0031510447031910228, + "grad_norm": 0.12245853245258331, + "kl": 0.050540367141366005, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 1135 + }, + { + "clip_ratio": 0.0005173567624296993, + "epoch": 0.0031538209540308386, + "grad_norm": 0.10049139708280563, + "kl": 0.05162609927356243, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1136 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0031565972048706544, + "grad_norm": 0.10036790370941162, + "kl": 0.051185326650738716, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1137 + }, + { + "clip_ratio": 0.00037915847497060895, + "epoch": 0.0031593734557104703, + "grad_norm": 0.10827629268169403, + "kl": 0.05169490538537502, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1138 + }, + { + "clip_ratio": 0.0004089097637915984, + "epoch": 0.003162149706550286, + "grad_norm": 0.0900021344423294, + "kl": 0.052112411707639694, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1139 + }, + { + "clip_ratio": 0.0002506030141375959, + "epoch": 0.003164925957390102, + "grad_norm": 0.09494884312152863, + "kl": 0.05369776301085949, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1140 + }, + { + "clip_ratio": 8.316699677379802e-05, + "completion_length": 241.08334350585938, + "epoch": 0.003167702208229918, + "grad_norm": 0.14724688231945038, + "kl": 0.050434716045856476, + "learning_rate": 3e-06, + "loss": 0.0052, + "reward": 0.3750000149011612, + "reward_std": 0.40321892499923706, + "rewards/countdown_reward_func": 0.375, + "step": 1141, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.0008169302745955065, + "epoch": 0.003170478459069734, + "grad_norm": 0.10081308335065842, + "kl": 0.05200190842151642, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 1142 + }, + { + "clip_ratio": 0.0002557880652602762, + "epoch": 0.00317325470990955, + "grad_norm": 0.11964285373687744, + "kl": 0.05052047781646252, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1143 + }, + { + "clip_ratio": 0.0005074728833278641, + "epoch": 0.0031760309607493657, + "grad_norm": 0.20048139989376068, + "kl": 0.056764667853713036, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1144 + }, + { + "clip_ratio": 0.00027372263139113784, + "epoch": 0.0031788072115891816, + "grad_norm": 0.10537584125995636, + "kl": 0.04961631819605827, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1145 + }, + { + "clip_ratio": 0.00033895507658598945, + "epoch": 0.0031815834624289974, + "grad_norm": 0.11174456030130386, + "kl": 0.0519944503903389, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0031843597132688133, + "grad_norm": 0.11552406847476959, + "kl": 0.05385969392955303, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1147 + }, + { + "clip_ratio": 0.0011523118009790778, + "epoch": 0.003187135964108629, + "grad_norm": 0.10118366032838821, + "kl": 0.05519482307136059, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1148 + }, + { + "clip_ratio": 0.0002575748658273369, + "epoch": 0.003189912214948445, + "grad_norm": 0.12076838314533234, + "kl": 0.05428927019238472, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1149 + }, + { + "clip_ratio": 0.0005272936105029657, + "epoch": 0.0031926884657882608, + "grad_norm": 0.1505252718925476, + "kl": 0.06170363910496235, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 1150 + }, + { + "clip_ratio": 0.00017786595708457753, + "epoch": 0.0031954647166280766, + "grad_norm": 0.0998905599117279, + "kl": 0.054063014686107635, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1151 + }, + { + "clip_ratio": 0.0005830957088619471, + "epoch": 0.0031982409674678925, + "grad_norm": 0.11353659629821777, + "kl": 0.055830128490924835, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 1152 + }, + { + "clip_ratio": 8.26719551696442e-05, + "completion_length": 232.8541717529297, + "epoch": 0.0032010172183077087, + "grad_norm": 0.12211279571056366, + "kl": 0.07155881449580193, + "learning_rate": 3e-06, + "loss": 0.0302, + "reward": 0.3229166865348816, + "reward_std": 0.24453017860651016, + "rewards/countdown_reward_func": 0.3229166865348816, + "step": 1153, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00018164265202358365, + "epoch": 0.0032037934691475246, + "grad_norm": 0.12927870452404022, + "kl": 0.064654640853405, + "learning_rate": 3e-06, + "loss": 0.0298, + "step": 1154 + }, + { + "clip_ratio": 0.0001653439103392884, + "epoch": 0.0032065697199873404, + "grad_norm": 0.10473229736089706, + "kl": 0.07196308299899101, + "learning_rate": 3e-06, + "loss": 0.0299, + "step": 1155 + }, + { + "clip_ratio": 0.00024557957658544183, + "epoch": 0.0032093459708271562, + "grad_norm": 0.1307552605867386, + "kl": 0.06787708401679993, + "learning_rate": 3e-06, + "loss": 0.0292, + "step": 1156 + }, + { + "clip_ratio": 0.00028894259594380856, + "epoch": 0.003212122221666972, + "grad_norm": 0.1252485066652298, + "kl": 0.07557385787367821, + "learning_rate": 3e-06, + "loss": 0.03, + "step": 1157 + }, + { + "clip_ratio": 0.0008342704823007807, + "epoch": 0.003214898472506788, + "grad_norm": 0.11512560397386551, + "kl": 0.07205545529723167, + "learning_rate": 3e-06, + "loss": 0.0282, + "step": 1158 + }, + { + "clip_ratio": 8.406186680076644e-05, + "epoch": 0.0032176747233466038, + "grad_norm": 0.13281333446502686, + "kl": 0.07855755090713501, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 1159 + }, + { + "clip_ratio": 0.0001690331264398992, + "epoch": 0.0032204509741864196, + "grad_norm": 0.12156619131565094, + "kl": 0.07363288104534149, + "learning_rate": 3e-06, + "loss": 0.0271, + "step": 1160 + }, + { + "clip_ratio": 0.00041335978312417865, + "epoch": 0.0032232272250262354, + "grad_norm": 0.10087471455335617, + "kl": 0.08428899571299553, + "learning_rate": 3e-06, + "loss": 0.0275, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0032260034758660513, + "grad_norm": 0.14251810312271118, + "kl": 0.07908787578344345, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 1162 + }, + { + "clip_ratio": 0.0003094059356953949, + "epoch": 0.003228779726705867, + "grad_norm": 0.11256741732358932, + "kl": 0.08613381907343864, + "learning_rate": 3e-06, + "loss": 0.026, + "step": 1163 + }, + { + "clip_ratio": 0.00026637538394425064, + "epoch": 0.0032315559775456834, + "grad_norm": 0.1097400113940239, + "kl": 0.08562665432691574, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 1164 + }, + { + "clip_ratio": 0.0001998317675315775, + "completion_length": 213.89584350585938, + "epoch": 0.0032343322283854992, + "grad_norm": 0.087900809943676, + "kl": 0.0852268636226654, + "learning_rate": 3e-06, + "loss": 0.0202, + "reward": 0.4166666865348816, + "reward_std": 0.3476388454437256, + "rewards/countdown_reward_func": 0.4166666567325592, + "step": 1165, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003237108479225315, + "grad_norm": 0.1092759519815445, + "kl": 0.08935586363077164, + "learning_rate": 3e-06, + "loss": 0.02, + "step": 1166 + }, + { + "clip_ratio": 0.00037480452738236636, + "epoch": 0.003239884730065131, + "grad_norm": 0.1183471605181694, + "kl": 0.09479113668203354, + "learning_rate": 3e-06, + "loss": 0.0208, + "step": 1167 + }, + { + "clip_ratio": 0.0002687285596039146, + "epoch": 0.0032426609809049468, + "grad_norm": 0.09400281310081482, + "kl": 0.09442057460546494, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 1168 + }, + { + "clip_ratio": 0.00017308967653661966, + "epoch": 0.0032454372317447626, + "grad_norm": 0.14759588241577148, + "kl": 0.09654445201158524, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 1169 + }, + { + "clip_ratio": 0.00020166092144791037, + "epoch": 0.0032482134825845784, + "grad_norm": 0.10944969207048416, + "kl": 0.09353556111454964, + "learning_rate": 3e-06, + "loss": 0.0197, + "step": 1170 + }, + { + "clip_ratio": 0.00010879024921450764, + "epoch": 0.0032509897334243943, + "grad_norm": 0.10797934979200363, + "kl": 0.10146218538284302, + "learning_rate": 3e-06, + "loss": 0.0192, + "step": 1171 + }, + { + "clip_ratio": 0.0008672036346979439, + "epoch": 0.00325376598426421, + "grad_norm": 0.11213571578264236, + "kl": 0.1059374175965786, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 1172 + }, + { + "clip_ratio": 0.0018254909082315862, + "epoch": 0.003256542235104026, + "grad_norm": 0.1145535483956337, + "kl": 0.1167490966618061, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 1173 + }, + { + "clip_ratio": 0.0010834443964995444, + "epoch": 0.003259318485943842, + "grad_norm": 0.08255753666162491, + "kl": 0.11234157159924507, + "learning_rate": 3e-06, + "loss": 0.0177, + "step": 1174 + }, + { + "clip_ratio": 0.0012724358239211142, + "epoch": 0.003262094736783658, + "grad_norm": 0.10409360378980637, + "kl": 0.11117475107312202, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 1175 + }, + { + "clip_ratio": 0.0022241021506488323, + "epoch": 0.003264870987623474, + "grad_norm": 0.09691368043422699, + "kl": 0.10797763615846634, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 1176 + }, + { + "clip_ratio": 0.0002880159590858966, + "completion_length": 233.7291717529297, + "epoch": 0.0032676472384632897, + "grad_norm": 0.08466751873493195, + "kl": 0.12909483164548874, + "learning_rate": 3e-06, + "loss": 0.0086, + "reward": 0.2291666939854622, + "reward_std": 0.2128555178642273, + "rewards/countdown_reward_func": 0.2291666865348816, + "step": 1177, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0003561055054888129, + "epoch": 0.0032704234893031056, + "grad_norm": 0.11577208340167999, + "kl": 0.12361054494976997, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 1178 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0032731997401429214, + "grad_norm": 0.09205642342567444, + "kl": 0.13038898259401321, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 1179 + }, + { + "clip_ratio": 0.000333176227286458, + "epoch": 0.0032759759909827373, + "grad_norm": 0.10060107707977295, + "kl": 0.13378220051527023, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 1180 + }, + { + "clip_ratio": 0.00010113268945133314, + "epoch": 0.003278752241822553, + "grad_norm": 0.09801634401082993, + "kl": 0.1360284462571144, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 1181 + }, + { + "clip_ratio": 0.0002441406322759576, + "epoch": 0.003281528492662369, + "grad_norm": 0.08726264536380768, + "kl": 0.131295807659626, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 1182 + }, + { + "clip_ratio": 0.00017773196304915473, + "epoch": 0.0032843047435021848, + "grad_norm": 0.08691044896841049, + "kl": 0.13674616813659668, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 1183 + }, + { + "clip_ratio": 0.00018315018678549677, + "epoch": 0.0032870809943420006, + "grad_norm": 0.11604570597410202, + "kl": 0.1295301541686058, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1184 + }, + { + "clip_ratio": 0.0007019351178314537, + "epoch": 0.0032898572451818165, + "grad_norm": 0.09062675386667252, + "kl": 0.13159853219985962, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 1185 + }, + { + "clip_ratio": 0.0004526742995949462, + "epoch": 0.0032926334960216327, + "grad_norm": 0.10843745619058609, + "kl": 0.13263515383005142, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1186 + }, + { + "clip_ratio": 0.0006641855870839208, + "epoch": 0.0032954097468614486, + "grad_norm": 0.08221055567264557, + "kl": 0.13184264674782753, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 1187 + }, + { + "clip_ratio": 0.0004921089857816696, + "epoch": 0.0032981859977012644, + "grad_norm": 0.08339028805494308, + "kl": 0.1247389204800129, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 1188 + }, + { + "clip_ratio": 0.00020850708824582398, + "completion_length": 234.0416717529297, + "epoch": 0.0033009622485410802, + "grad_norm": 0.11418038606643677, + "kl": 0.10565116629004478, + "learning_rate": 3e-06, + "loss": -0.0015, + "reward": 0.3437500298023224, + "reward_std": 0.28535500913858414, + "rewards/countdown_reward_func": 0.3437500149011612, + "step": 1189, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003303738499380896, + "grad_norm": 0.12243412435054779, + "kl": 0.09964673221111298, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1190 + }, + { + "clip_ratio": 0.00028036253206664696, + "epoch": 0.003306514750220712, + "grad_norm": 0.17603905498981476, + "kl": 0.10419896617531776, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 1191 + }, + { + "clip_ratio": 0.0001840942568378523, + "epoch": 0.0033092910010605278, + "grad_norm": 0.11945787817239761, + "kl": 0.09894903376698494, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1192 + }, + { + "clip_ratio": 8.406186680076644e-05, + "epoch": 0.0033120672519003436, + "grad_norm": 0.14571459591388702, + "kl": 0.09924932196736336, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1193 + }, + { + "clip_ratio": 0.000245529918174725, + "epoch": 0.0033148435027401594, + "grad_norm": 0.1262568235397339, + "kl": 0.09315666556358337, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1194 + }, + { + "clip_ratio": 0.00029078290390316397, + "epoch": 0.0033176197535799753, + "grad_norm": 0.12053867429494858, + "kl": 0.09638616442680359, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 1195 + }, + { + "clip_ratio": 0.0008103728177957237, + "epoch": 0.0033203960044197916, + "grad_norm": 0.14339543879032135, + "kl": 0.089625783264637, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1196 + }, + { + "clip_ratio": 0.0003617427501012571, + "epoch": 0.0033231722552596074, + "grad_norm": 0.17292562127113342, + "kl": 0.09256411343812943, + "learning_rate": 3e-06, + "loss": -0.0038, + "step": 1197 + }, + { + "clip_ratio": 0.0008070902986219153, + "epoch": 0.0033259485060994232, + "grad_norm": 0.11857297271490097, + "kl": 0.08773430064320564, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1198 + }, + { + "clip_ratio": 0.00044789022649638355, + "epoch": 0.003328724756939239, + "grad_norm": 0.1319698989391327, + "kl": 0.08959746360778809, + "learning_rate": 3e-06, + "loss": -0.0056, + "step": 1199 + }, + { + "clip_ratio": 0.0012999755563214421, + "epoch": 0.003331501007779055, + "grad_norm": 0.1194002777338028, + "kl": 0.08413466811180115, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 1200 + }, + { + "clip_ratio": 0.00024771419703029096, + "completion_length": 225.64584350585938, + "epoch": 0.0033342772586188708, + "grad_norm": 0.07703583687543869, + "kl": 0.09083598852157593, + "learning_rate": 3e-06, + "loss": 0.0078, + "reward": 0.2291666939854622, + "reward_std": 0.18291139975190163, + "rewards/countdown_reward_func": 0.2291666865348816, + "step": 1201, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00020563387079164386, + "epoch": 0.0033370535094586866, + "grad_norm": 0.0965060293674469, + "kl": 0.09030065685510635, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1202 + }, + { + "clip_ratio": 0.00019009599782293662, + "epoch": 0.0033398297602985024, + "grad_norm": 0.07806777209043503, + "kl": 0.09229482710361481, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1203 + }, + { + "clip_ratio": 0.00033928067568922415, + "epoch": 0.0033426060111383183, + "grad_norm": 0.08718756586313248, + "kl": 0.08507254719734192, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003345382261978134, + "grad_norm": 0.08955029398202896, + "kl": 0.08796864375472069, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 1205 + }, + { + "clip_ratio": 8.316699677379802e-05, + "epoch": 0.00334815851281795, + "grad_norm": 0.11132470518350601, + "kl": 0.09032916277647018, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1206 + }, + { + "clip_ratio": 0.0002561136716394685, + "epoch": 0.0033509347636577662, + "grad_norm": 0.08454351872205734, + "kl": 0.08703029155731201, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 1207 + }, + { + "clip_ratio": 9.137426968663931e-05, + "epoch": 0.003353711014497582, + "grad_norm": 0.08685325086116791, + "kl": 0.08563976734876633, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1208 + }, + { + "clip_ratio": 0.0001069290010491386, + "epoch": 0.003356487265337398, + "grad_norm": 0.08751443773508072, + "kl": 0.0876334123313427, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1209 + }, + { + "clip_ratio": 0.00042423446575412527, + "epoch": 0.0033592635161772137, + "grad_norm": 0.07404050976037979, + "kl": 0.07978618890047073, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 1210 + }, + { + "clip_ratio": 0.0005211606621742249, + "epoch": 0.0033620397670170296, + "grad_norm": 0.1123705729842186, + "kl": 0.08384502306580544, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1211 + }, + { + "clip_ratio": 0.00043046276550740004, + "epoch": 0.0033648160178568454, + "grad_norm": 0.09377264976501465, + "kl": 0.08539973199367523, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1212 + }, + { + "clip_ratio": 0.00017872503667604178, + "completion_length": 238.9375, + "epoch": 0.0033675922686966613, + "grad_norm": 0.08966858685016632, + "kl": 0.08921418339014053, + "learning_rate": 3e-06, + "loss": 0.0032, + "reward": 0.26458335667848587, + "reward_std": 0.2634518966078758, + "rewards/countdown_reward_func": 0.26458335667848587, + "step": 1213, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003323842174722813, + "epoch": 0.003370368519536477, + "grad_norm": 0.11603780835866928, + "kl": 0.09003105014562607, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1214 + }, + { + "clip_ratio": 9.505703201284632e-05, + "epoch": 0.003373144770376293, + "grad_norm": 0.12451639026403427, + "kl": 0.08492600917816162, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1215 + }, + { + "clip_ratio": 8.234519191319123e-05, + "epoch": 0.0033759210212161088, + "grad_norm": 0.09685535728931427, + "kl": 0.08527249097824097, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1216 + }, + { + "clip_ratio": 0.0005166697083041072, + "epoch": 0.0033786972720559246, + "grad_norm": 0.1449529230594635, + "kl": 0.08535202592611313, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003381473522895741, + "grad_norm": 0.08282797783613205, + "kl": 0.0859798900783062, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1218 + }, + { + "clip_ratio": 0.00018502863531466573, + "epoch": 0.0033842497737355567, + "grad_norm": 0.09159551560878754, + "kl": 0.08557010814547539, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 1219 + }, + { + "clip_ratio": 0.0002536034444347024, + "epoch": 0.0033870260245753726, + "grad_norm": 0.10851403325796127, + "kl": 0.08524026349186897, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1220 + }, + { + "clip_ratio": 0.0005241364997345954, + "epoch": 0.0033898022754151884, + "grad_norm": 0.1287156045436859, + "kl": 0.08104551210999489, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 1221 + }, + { + "clip_ratio": 0.00017740222392603755, + "epoch": 0.0033925785262550042, + "grad_norm": 0.10037072002887726, + "kl": 0.08082518354058266, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 1222 + }, + { + "clip_ratio": 0.00017451102030463517, + "epoch": 0.00339535477709482, + "grad_norm": 0.13647525012493134, + "kl": 0.08178085088729858, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1223 + }, + { + "clip_ratio": 0.000411814013205003, + "epoch": 0.003398131027934636, + "grad_norm": 0.08645734190940857, + "kl": 0.08200101554393768, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.62500762939453, + "epoch": 0.0034009072787744518, + "grad_norm": 0.10560040175914764, + "kl": 0.07796743884682655, + "learning_rate": 3e-06, + "loss": 0.0136, + "reward": 0.26875001937150955, + "reward_std": 0.2272602580487728, + "rewards/countdown_reward_func": 0.26875001937150955, + "step": 1225, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0034036835296142676, + "grad_norm": 0.15348504483699799, + "kl": 0.07507862150669098, + "learning_rate": 3e-06, + "loss": 0.014, + "step": 1226 + }, + { + "clip_ratio": 0.00033518215059302747, + "epoch": 0.0034064597804540834, + "grad_norm": 0.08251035213470459, + "kl": 0.08217736333608627, + "learning_rate": 3e-06, + "loss": 0.015, + "step": 1227 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0034092360312938993, + "grad_norm": 0.11358001828193665, + "kl": 0.07504938542842865, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 1228 + }, + { + "clip_ratio": 0.0003684598486870527, + "epoch": 0.0034120122821337156, + "grad_norm": 0.1041935384273529, + "kl": 0.08010926097631454, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 1229 + }, + { + "clip_ratio": 0.0005087452591396868, + "epoch": 0.0034147885329735314, + "grad_norm": 0.09497429430484772, + "kl": 0.07363765686750412, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 1230 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0034175647838133472, + "grad_norm": 0.08915600925683975, + "kl": 0.07781890407204628, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1231 + }, + { + "clip_ratio": 0.0003355148946866393, + "epoch": 0.003420341034653163, + "grad_norm": 0.0972413569688797, + "kl": 0.07709966227412224, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1232 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.003423117285492979, + "grad_norm": 0.08852770924568176, + "kl": 0.08478394895792007, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 1233 + }, + { + "clip_ratio": 0.0008937545935623348, + "epoch": 0.0034258935363327947, + "grad_norm": 0.11425919830799103, + "kl": 0.07852787896990776, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1234 + }, + { + "clip_ratio": 0.000438432558439672, + "epoch": 0.0034286697871726106, + "grad_norm": 0.09492716938257217, + "kl": 0.08431283012032509, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 1235 + }, + { + "clip_ratio": 0.0004982753162039444, + "epoch": 0.0034314460380124264, + "grad_norm": 0.10918039828538895, + "kl": 0.07764238864183426, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 1236 + }, + { + "clip_ratio": 8.821453957352787e-05, + "completion_length": 239.8541717529297, + "epoch": 0.0034342222888522423, + "grad_norm": 0.1400684416294098, + "kl": 0.0789659395813942, + "learning_rate": 3e-06, + "loss": 0.0171, + "reward": 0.4375000149011612, + "reward_std": 0.398020476102829, + "rewards/countdown_reward_func": 0.4375000149011612, + "step": 1237, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 8.790435822447762e-05, + "epoch": 0.003436998539692058, + "grad_norm": 0.117515429854393, + "kl": 0.08153605833649635, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 1238 + }, + { + "clip_ratio": 0.0001731122611090541, + "epoch": 0.003439774790531874, + "grad_norm": 0.12731020152568817, + "kl": 0.08211733773350716, + "learning_rate": 3e-06, + "loss": 0.0186, + "step": 1239 + }, + { + "clip_ratio": 0.00017580871644895524, + "epoch": 0.0034425510413716902, + "grad_norm": 0.1385928988456726, + "kl": 0.07959434390068054, + "learning_rate": 3e-06, + "loss": 0.0168, + "step": 1240 + }, + { + "clip_ratio": 0.00017041582032106817, + "epoch": 0.003445327292211506, + "grad_norm": 0.12391113489866257, + "kl": 0.08285382017493248, + "learning_rate": 3e-06, + "loss": 0.0182, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003448103543051322, + "grad_norm": 0.10620970278978348, + "kl": 0.0919383093714714, + "learning_rate": 3e-06, + "loss": 0.0182, + "step": 1242 + }, + { + "clip_ratio": 8.821453957352787e-05, + "epoch": 0.0034508797938911377, + "grad_norm": 0.1453152298927307, + "kl": 0.08631188422441483, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 1243 + }, + { + "clip_ratio": 8.884150884114206e-05, + "epoch": 0.0034536560447309536, + "grad_norm": 0.11560267210006714, + "kl": 0.08962936699390411, + "learning_rate": 3e-06, + "loss": 0.0167, + "step": 1244 + }, + { + "clip_ratio": 0.000705406149791088, + "epoch": 0.0034564322955707694, + "grad_norm": 0.13734284043312073, + "kl": 0.09047690033912659, + "learning_rate": 3e-06, + "loss": 0.0165, + "step": 1245 + }, + { + "clip_ratio": 8.234519191319123e-05, + "epoch": 0.0034592085464105853, + "grad_norm": 0.11958130449056625, + "kl": 0.08676190301775932, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 1246 + }, + { + "clip_ratio": 0.00025863035989459604, + "epoch": 0.003461984797250401, + "grad_norm": 0.13450902700424194, + "kl": 0.09278702363371849, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 1247 + }, + { + "clip_ratio": 8.884150884114206e-05, + "epoch": 0.003464761048090217, + "grad_norm": 0.11801480501890182, + "kl": 0.1030571274459362, + "learning_rate": 3e-06, + "loss": 0.0163, + "step": 1248 + }, + { + "clip_ratio": 9.231905278284103e-05, + "completion_length": 225.1041717529297, + "epoch": 0.0034675372989300328, + "grad_norm": 0.13714726269245148, + "kl": 0.09993145614862442, + "learning_rate": 3e-06, + "loss": 0.0016, + "reward": 0.30000002682209015, + "reward_std": 0.30601368844509125, + "rewards/countdown_reward_func": 0.30000000447034836, + "step": 1249, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00034739852708298713, + "epoch": 0.0034703135497698486, + "grad_norm": 0.11648391932249069, + "kl": 0.10220260173082352, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1250 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.003473089800609665, + "grad_norm": 0.0945407897233963, + "kl": 0.11158350110054016, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 1251 + }, + { + "clip_ratio": 0.00010683760774554685, + "epoch": 0.0034758660514494807, + "grad_norm": 0.1068953201174736, + "kl": 0.1054844819009304, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1252 + }, + { + "clip_ratio": 0.00010416666918899864, + "epoch": 0.0034786423022892966, + "grad_norm": 0.1203155517578125, + "kl": 0.11958234012126923, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1253 + }, + { + "clip_ratio": 0.00018602220370667055, + "epoch": 0.0034814185531291124, + "grad_norm": 0.13715901970863342, + "kl": 0.11173819750547409, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 1254 + }, + { + "clip_ratio": 9.231905278284103e-05, + "epoch": 0.0034841948039689282, + "grad_norm": 0.14036564528942108, + "kl": 0.10752937197685242, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1255 + }, + { + "clip_ratio": 0.0004339037259342149, + "epoch": 0.003486971054808744, + "grad_norm": 0.1094101294875145, + "kl": 0.10794253274798393, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 1256 + }, + { + "clip_ratio": 0.0002743252844084054, + "epoch": 0.00348974730564856, + "grad_norm": 0.1030493676662445, + "kl": 0.11747561022639275, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0034925235564883758, + "grad_norm": 0.09440533816814423, + "kl": 0.11023146659135818, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1258 + }, + { + "clip_ratio": 0.000180825540155638, + "epoch": 0.0034952998073281916, + "grad_norm": 0.11672715842723846, + "kl": 0.1223120354115963, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1259 + }, + { + "clip_ratio": 0.00018602220370667055, + "epoch": 0.0034980760581680074, + "grad_norm": 0.14074575901031494, + "kl": 0.11307071894407272, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 1260 + }, + { + "clip_ratio": 0.00016666666488163173, + "completion_length": 237.3125, + "epoch": 0.0035008523090078233, + "grad_norm": 0.10606394708156586, + "kl": 0.10512935742735863, + "learning_rate": 3e-06, + "loss": -0.0002, + "reward": 0.23125001043081284, + "reward_std": 0.2202121764421463, + "rewards/countdown_reward_func": 0.23125001043081284, + "step": 1261, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 9.412650251761079e-05, + "epoch": 0.0035036285598476396, + "grad_norm": 0.11450087279081345, + "kl": 0.10360938310623169, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0035064048106874554, + "grad_norm": 0.09911279380321503, + "kl": 0.11000372096896172, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 1263 + }, + { + "clip_ratio": 0.00035296654095873237, + "epoch": 0.0035091810615272712, + "grad_norm": 0.09363103657960892, + "kl": 0.10553918406367302, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 1264 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003511957312367087, + "grad_norm": 0.1504822075366974, + "kl": 0.10735897347331047, + "learning_rate": 3e-06, + "loss": 0.0005, + "step": 1265 + }, + { + "clip_ratio": 0.00025724637089297175, + "epoch": 0.003514733563206903, + "grad_norm": 0.08245550096035004, + "kl": 0.10223409533500671, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1266 + }, + { + "clip_ratio": 0.00024871622008504346, + "epoch": 0.0035175098140467187, + "grad_norm": 0.09745687991380692, + "kl": 0.1023399755358696, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1267 + }, + { + "clip_ratio": 0.0002794273314066231, + "epoch": 0.0035202860648865346, + "grad_norm": 0.09371484816074371, + "kl": 0.09988303855061531, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1268 + }, + { + "clip_ratio": 8.896797226043418e-05, + "epoch": 0.0035230623157263504, + "grad_norm": 0.12038465589284897, + "kl": 0.10891411453485489, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1269 + }, + { + "clip_ratio": 0.0005968022960587405, + "epoch": 0.0035258385665661663, + "grad_norm": 0.10312320291996002, + "kl": 0.1005222424864769, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1270 + }, + { + "clip_ratio": 0.0006010157812852412, + "epoch": 0.003528614817405982, + "grad_norm": 0.16399547457695007, + "kl": 0.10354287177324295, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 1271 + }, + { + "clip_ratio": 0.00042980091529898345, + "epoch": 0.003531391068245798, + "grad_norm": 0.08301949501037598, + "kl": 0.09836991503834724, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1272 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 243.81250762939453, + "epoch": 0.0035341673190856142, + "grad_norm": 0.08151868730783463, + "kl": 0.09731181338429451, + "learning_rate": 3e-06, + "loss": 0.0097, + "reward": 0.19375000149011612, + "reward_std": 0.15347465127706528, + "rewards/countdown_reward_func": 0.19375000149011612, + "step": 1273, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.00353694356992543, + "grad_norm": 0.08378417789936066, + "kl": 0.09494262933731079, + "learning_rate": 3e-06, + "loss": 0.0091, + "step": 1274 + }, + { + "clip_ratio": 0.00034852556564146653, + "epoch": 0.003539719820765246, + "grad_norm": 0.06453592330217361, + "kl": 0.09252246841788292, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1275 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0035424960716050617, + "grad_norm": 0.059850361198186874, + "kl": 0.09488331153988838, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1276 + }, + { + "clip_ratio": 0.0005926662124693394, + "epoch": 0.0035452723224448776, + "grad_norm": 0.08670902252197266, + "kl": 0.09074336290359497, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 1277 + }, + { + "clip_ratio": 0.0004440398042788729, + "epoch": 0.0035480485732846934, + "grad_norm": 0.14446234703063965, + "kl": 0.09088549762964249, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0035508248241245093, + "grad_norm": 0.07895587384700775, + "kl": 0.09484604373574257, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 1279 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.003553601074964325, + "grad_norm": 0.07360237091779709, + "kl": 0.09300564974546432, + "learning_rate": 3e-06, + "loss": 0.0093, + "step": 1280 + }, + { + "clip_ratio": 0.00032552084303461015, + "epoch": 0.003556377325804141, + "grad_norm": 0.06718261539936066, + "kl": 0.09006435796618462, + "learning_rate": 3e-06, + "loss": 0.0091, + "step": 1281 + }, + { + "clip_ratio": 0.00025739153352333233, + "epoch": 0.0035591535766439568, + "grad_norm": 0.06159327179193497, + "kl": 0.09368356689810753, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 1282 + }, + { + "clip_ratio": 0.00044258125126361847, + "epoch": 0.0035619298274837726, + "grad_norm": 0.1282200962305069, + "kl": 0.08870528638362885, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 1283 + }, + { + "clip_ratio": 0.0005040145115344785, + "epoch": 0.003564706078323589, + "grad_norm": 0.13320301473140717, + "kl": 0.0886475220322609, + "learning_rate": 3e-06, + "loss": 0.0076, + "step": 1284 + }, + { + "clip_ratio": 0.0005532470531761646, + "completion_length": 223.64584350585938, + "epoch": 0.0035674823291634047, + "grad_norm": 0.11603424698114395, + "kl": 0.08919220045208931, + "learning_rate": 3e-06, + "loss": -0.0095, + "reward": 0.27916668355464935, + "reward_std": 0.3674810379743576, + "rewards/countdown_reward_func": 0.27916668355464935, + "step": 1285, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.0001726210757624358, + "epoch": 0.0035702585800032206, + "grad_norm": 0.11266878992319107, + "kl": 0.09426239505410194, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 1286 + }, + { + "clip_ratio": 0.00018248174455948174, + "epoch": 0.0035730348308430364, + "grad_norm": 0.1493123471736908, + "kl": 0.10469094663858414, + "learning_rate": 3e-06, + "loss": -0.0097, + "step": 1287 + }, + { + "clip_ratio": 9.272996976505965e-05, + "epoch": 0.0035758110816828522, + "grad_norm": 0.1302638202905655, + "kl": 0.09487400203943253, + "learning_rate": 3e-06, + "loss": -0.0106, + "step": 1288 + }, + { + "clip_ratio": 0.0001050420178216882, + "epoch": 0.003578587332522668, + "grad_norm": 0.11855723708868027, + "kl": 0.09268858656287193, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 1289 + }, + { + "clip_ratio": 9.124087227974087e-05, + "epoch": 0.003581363583362484, + "grad_norm": 0.0906294658780098, + "kl": 0.10145439580082893, + "learning_rate": 3e-06, + "loss": -0.009, + "step": 1290 + }, + { + "clip_ratio": 0.00035833738365909085, + "epoch": 0.0035841398342022998, + "grad_norm": 0.10448364913463593, + "kl": 0.0870097205042839, + "learning_rate": 3e-06, + "loss": -0.0114, + "step": 1291 + }, + { + "clip_ratio": 0.0001726210757624358, + "epoch": 0.0035869160850421156, + "grad_norm": 0.12192343175411224, + "kl": 0.09259336069226265, + "learning_rate": 3e-06, + "loss": -0.0107, + "step": 1292 + }, + { + "clip_ratio": 0.00018248174455948174, + "epoch": 0.0035896923358819314, + "grad_norm": 0.12048960477113724, + "kl": 0.10295253619551659, + "learning_rate": 3e-06, + "loss": -0.0115, + "step": 1293 + }, + { + "clip_ratio": 9.272996976505965e-05, + "epoch": 0.0035924685867217473, + "grad_norm": 0.12977741658687592, + "kl": 0.0927329771220684, + "learning_rate": 3e-06, + "loss": -0.0141, + "step": 1294 + }, + { + "clip_ratio": 0.0003834426242974587, + "epoch": 0.0035952448375615636, + "grad_norm": 0.1129441037774086, + "kl": 0.09166432544589043, + "learning_rate": 3e-06, + "loss": -0.0121, + "step": 1295 + }, + { + "clip_ratio": 0.0004474218876566738, + "epoch": 0.0035980210884013794, + "grad_norm": 0.10428842157125473, + "kl": 0.10321545228362083, + "learning_rate": 3e-06, + "loss": -0.0114, + "step": 1296 + }, + { + "clip_ratio": 0.00017595020472072065, + "completion_length": 240.0625, + "epoch": 0.0036007973392411952, + "grad_norm": 0.11313264816999435, + "kl": 0.08579898625612259, + "learning_rate": 3e-06, + "loss": 0.0105, + "reward": 0.18958333879709244, + "reward_std": 0.20279134064912796, + "rewards/countdown_reward_func": 0.18958333134651184, + "step": 1297, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00016954136663116515, + "epoch": 0.003603573590081011, + "grad_norm": 0.08754347264766693, + "kl": 0.07732414454221725, + "learning_rate": 3e-06, + "loss": 0.0108, + "step": 1298 + }, + { + "clip_ratio": 0.00021079258294776082, + "epoch": 0.003606349840920827, + "grad_norm": 0.09487965703010559, + "kl": 0.07977383211255074, + "learning_rate": 3e-06, + "loss": 0.0106, + "step": 1299 + }, + { + "clip_ratio": 8.468834857922047e-05, + "epoch": 0.0036091260917606427, + "grad_norm": 0.08399857580661774, + "kl": 0.08700349926948547, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1300 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0036119023426004586, + "grad_norm": 0.06855960935354233, + "kl": 0.07815397530794144, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 1301 + }, + { + "clip_ratio": 0.0002462810225551948, + "epoch": 0.0036146785934402744, + "grad_norm": 0.0689474567770958, + "kl": 0.07884856685996056, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 1302 + }, + { + "clip_ratio": 0.00018677650223253295, + "epoch": 0.0036174548442800903, + "grad_norm": 0.10885361582040787, + "kl": 0.08625783771276474, + "learning_rate": 3e-06, + "loss": 0.0108, + "step": 1303 + }, + { + "clip_ratio": 0.00017632621165830642, + "epoch": 0.003620231095119906, + "grad_norm": 0.07666079699993134, + "kl": 0.07669586688280106, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1304 + }, + { + "clip_ratio": 0.0002925453591160476, + "epoch": 0.003623007345959722, + "grad_norm": 0.08154989778995514, + "kl": 0.07938069105148315, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0036257835967995382, + "grad_norm": 0.0733746886253357, + "kl": 0.08602719753980637, + "learning_rate": 3e-06, + "loss": 0.0106, + "step": 1306 + }, + { + "clip_ratio": 0.0002507569151930511, + "epoch": 0.003628559847639354, + "grad_norm": 0.06607773900032043, + "kl": 0.07838872075080872, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1307 + }, + { + "clip_ratio": 0.0004108092689421028, + "epoch": 0.00363133609847917, + "grad_norm": 0.07465820014476776, + "kl": 0.07868605107069016, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 1308 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 249.0416717529297, + "epoch": 0.0036341123493189857, + "grad_norm": 0.10679054260253906, + "kl": 0.07724452763795853, + "learning_rate": 3e-06, + "loss": -0.0055, + "reward": 0.32500002533197403, + "reward_std": 0.27318819612264633, + "rewards/countdown_reward_func": 0.32499999552965164, + "step": 1309, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 9.645061800256371e-05, + "epoch": 0.0036368886001588016, + "grad_norm": 0.08431645482778549, + "kl": 0.07454115152359009, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 1310 + }, + { + "clip_ratio": 0.00041751094977371395, + "epoch": 0.0036396648509986174, + "grad_norm": 0.10150210559368134, + "kl": 0.07631824165582657, + "learning_rate": 3e-06, + "loss": -0.0058, + "step": 1311 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0036424411018384333, + "grad_norm": 0.08061017096042633, + "kl": 0.07825561240315437, + "learning_rate": 3e-06, + "loss": -0.0056, + "step": 1312 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003645217352678249, + "grad_norm": 0.1347588747739792, + "kl": 0.07661932334303856, + "learning_rate": 3e-06, + "loss": -0.0053, + "step": 1313 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.003647993603518065, + "grad_norm": 0.09667013585567474, + "kl": 0.07734496891498566, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0036507698543578808, + "grad_norm": 0.11161034554243088, + "kl": 0.07669401913881302, + "learning_rate": 3e-06, + "loss": -0.0062, + "step": 1315 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0036535461051976966, + "grad_norm": 0.08601905405521393, + "kl": 0.07428323850035667, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 1316 + }, + { + "clip_ratio": 0.00032552084303461015, + "epoch": 0.003656322356037513, + "grad_norm": 0.11961396038532257, + "kl": 0.07602384686470032, + "learning_rate": 3e-06, + "loss": -0.0068, + "step": 1317 + }, + { + "clip_ratio": 0.0009102527255890891, + "epoch": 0.0036590986068773287, + "grad_norm": 0.08503416925668716, + "kl": 0.0780373215675354, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 1318 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.0036618748577171446, + "grad_norm": 0.13413764536380768, + "kl": 0.07871726900339127, + "learning_rate": 3e-06, + "loss": -0.0072, + "step": 1319 + }, + { + "clip_ratio": 0.00017783083603717387, + "epoch": 0.0036646511085569604, + "grad_norm": 0.16159453988075256, + "kl": 0.07949946075677872, + "learning_rate": 3e-06, + "loss": -0.0074, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.3541717529297, + "epoch": 0.0036674273593967762, + "grad_norm": 0.12504050135612488, + "kl": 0.08029628917574883, + "learning_rate": 3e-06, + "loss": 0.0344, + "reward": 0.3583333492279053, + "reward_std": 0.3268071115016937, + "rewards/countdown_reward_func": 0.3583333492279053, + "step": 1321, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0005550031710299663, + "epoch": 0.003670203610236592, + "grad_norm": 0.14056633412837982, + "kl": 0.08753347396850586, + "learning_rate": 3e-06, + "loss": 0.0343, + "step": 1322 + }, + { + "clip_ratio": 0.0009008992274175398, + "epoch": 0.003672979861076408, + "grad_norm": 0.1351320594549179, + "kl": 0.0846715196967125, + "learning_rate": 3e-06, + "loss": 0.0351, + "step": 1323 + }, + { + "clip_ratio": 0.00020169082563370466, + "epoch": 0.0036757561119162238, + "grad_norm": 0.16348907351493835, + "kl": 0.07923517376184464, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 1324 + }, + { + "clip_ratio": 0.0011630582448560745, + "epoch": 0.0036785323627560396, + "grad_norm": 0.1238764300942421, + "kl": 0.07697004824876785, + "learning_rate": 3e-06, + "loss": 0.0332, + "step": 1325 + }, + { + "clip_ratio": 0.00018680757784750313, + "epoch": 0.0036813086135958554, + "grad_norm": 0.13604602217674255, + "kl": 0.08331404998898506, + "learning_rate": 3e-06, + "loss": 0.0335, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0036840848644356713, + "grad_norm": 0.12275619804859161, + "kl": 0.08377550542354584, + "learning_rate": 3e-06, + "loss": 0.0324, + "step": 1327 + }, + { + "clip_ratio": 0.0008797913324087858, + "epoch": 0.0036868611152754876, + "grad_norm": 0.13052096962928772, + "kl": 0.09020379930734634, + "learning_rate": 3e-06, + "loss": 0.032, + "step": 1328 + }, + { + "clip_ratio": 0.0005085435113869607, + "epoch": 0.0036896373661153034, + "grad_norm": 0.1253458559513092, + "kl": 0.0886102207005024, + "learning_rate": 3e-06, + "loss": 0.0316, + "step": 1329 + }, + { + "clip_ratio": 0.000303602428175509, + "epoch": 0.0036924136169551192, + "grad_norm": 0.16655124723911285, + "kl": 0.08304303884506226, + "learning_rate": 3e-06, + "loss": 0.0295, + "step": 1330 + }, + { + "clip_ratio": 0.0007008319735177793, + "epoch": 0.003695189867794935, + "grad_norm": 0.11321963369846344, + "kl": 0.08307855576276779, + "learning_rate": 3e-06, + "loss": 0.0305, + "step": 1331 + }, + { + "clip_ratio": 0.0001017087051877752, + "epoch": 0.003697966118634751, + "grad_norm": 0.12860439717769623, + "kl": 0.09165726974606514, + "learning_rate": 3e-06, + "loss": 0.0293, + "step": 1332 + }, + { + "clip_ratio": 0.00017229790682904422, + "completion_length": 233.58333587646484, + "epoch": 0.0037007423694745667, + "grad_norm": 0.12028127908706665, + "kl": 0.10655560344457626, + "learning_rate": 3e-06, + "loss": 0.0049, + "reward": 0.2666666731238365, + "reward_std": 0.2709502577781677, + "rewards/countdown_reward_func": 0.2666666731238365, + "step": 1333, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0037035186203143826, + "grad_norm": 0.09744270890951157, + "kl": 0.11018062755465508, + "learning_rate": 3e-06, + "loss": 0.0061, + "step": 1334 + }, + { + "clip_ratio": 8.579272252973169e-05, + "epoch": 0.0037062948711541984, + "grad_norm": 0.09601444751024246, + "kl": 0.10575228929519653, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0037090711219940143, + "grad_norm": 0.11352216452360153, + "kl": 0.10562450438737869, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1336 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00371184737283383, + "grad_norm": 0.0934673398733139, + "kl": 0.11037517711520195, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1337 + }, + { + "clip_ratio": 0.0005623652250505984, + "epoch": 0.0037146236236736464, + "grad_norm": 0.09364868700504303, + "kl": 0.10772473365068436, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1338 + }, + { + "clip_ratio": 9.377344395034015e-05, + "epoch": 0.0037173998745134622, + "grad_norm": 0.2115117907524109, + "kl": 0.12028898671269417, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1339 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003720176125353278, + "grad_norm": 0.10891567170619965, + "kl": 0.12114686891436577, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1340 + }, + { + "clip_ratio": 8.650519157527015e-05, + "epoch": 0.003722952376193094, + "grad_norm": 0.09488851577043533, + "kl": 0.11187062785029411, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1341 + }, + { + "clip_ratio": 0.000176635745447129, + "epoch": 0.0037257286270329097, + "grad_norm": 0.1026405617594719, + "kl": 0.10968057066202164, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1342 + }, + { + "clip_ratio": 0.0003423265879973769, + "epoch": 0.0037285048778727256, + "grad_norm": 0.10193421691656113, + "kl": 0.11749966815114021, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1343 + }, + { + "clip_ratio": 0.0011813296005129814, + "epoch": 0.0037312811287125414, + "grad_norm": 0.09785063564777374, + "kl": 0.10901345312595367, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1344 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 224.70834350585938, + "epoch": 0.0037340573795523573, + "grad_norm": 0.15201179683208466, + "kl": 0.11200324073433876, + "learning_rate": 3e-06, + "loss": 0.0035, + "reward": 0.33750002086162567, + "reward_std": 0.29471276700496674, + "rewards/countdown_reward_func": 0.33750002086162567, + "step": 1345, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003736833630392173, + "grad_norm": 0.12798374891281128, + "kl": 0.11272676661610603, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003739609881231989, + "grad_norm": 0.12021691352128983, + "kl": 0.11317825317382812, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1347 + }, + { + "clip_ratio": 9.448223863728344e-05, + "epoch": 0.0037423861320718048, + "grad_norm": 0.11852817982435226, + "kl": 0.11661011353135109, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 1348 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.003745162382911621, + "grad_norm": 0.15811295807361603, + "kl": 0.1139235608279705, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003747938633751437, + "grad_norm": 0.14151591062545776, + "kl": 0.10737059265375137, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1350 + }, + { + "clip_ratio": 0.0002975761817651801, + "epoch": 0.0037507148845912527, + "grad_norm": 0.11291375011205673, + "kl": 0.10967541486024857, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0037534911354310686, + "grad_norm": 0.1173899918794632, + "kl": 0.10674367100000381, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0037562673862708844, + "grad_norm": 0.13352568447589874, + "kl": 0.10739367455244064, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 1353 + }, + { + "clip_ratio": 8.229097875300795e-05, + "epoch": 0.0037590436371107002, + "grad_norm": 0.10833993554115295, + "kl": 0.11009395867586136, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1354 + }, + { + "clip_ratio": 0.0004950694419676438, + "epoch": 0.003761819887950516, + "grad_norm": 0.1272728592157364, + "kl": 0.106673963367939, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 1355 + }, + { + "clip_ratio": 0.0003583999059628695, + "epoch": 0.003764596138790332, + "grad_norm": 0.13600032031536102, + "kl": 0.10148491337895393, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 1356 + }, + { + "clip_ratio": 0.00035957014188170433, + "completion_length": 235.6875, + "epoch": 0.0037673723896301478, + "grad_norm": 0.1315588802099228, + "kl": 0.11271170154213905, + "learning_rate": 3e-06, + "loss": 0.0092, + "reward": 0.2854166775941849, + "reward_std": 0.23748210817575455, + "rewards/countdown_reward_func": 0.2854166775941849, + "step": 1357, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.37240440887399e-05, + "epoch": 0.0037701486404699636, + "grad_norm": 0.10195645689964294, + "kl": 0.11273347586393356, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 1358 + }, + { + "clip_ratio": 0.000245323171839118, + "epoch": 0.0037729248913097794, + "grad_norm": 0.10797201097011566, + "kl": 0.10941316187381744, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 1359 + }, + { + "clip_ratio": 0.00018649760022526607, + "epoch": 0.0037757011421495957, + "grad_norm": 0.092195063829422, + "kl": 0.10438445582985878, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 1360 + }, + { + "clip_ratio": 0.0003678085922729224, + "epoch": 0.0037784773929894115, + "grad_norm": 0.1453506350517273, + "kl": 0.10365356132388115, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 1361 + }, + { + "clip_ratio": 0.00017496491636848077, + "epoch": 0.0037812536438292274, + "grad_norm": 0.12429957091808319, + "kl": 0.10129589587450027, + "learning_rate": 3e-06, + "loss": 0.0095, + "step": 1362 + }, + { + "clip_ratio": 9.272996976505965e-05, + "epoch": 0.0037840298946690432, + "grad_norm": 0.2589426636695862, + "kl": 0.10026075318455696, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 1363 + }, + { + "clip_ratio": 0.00017496491636848077, + "epoch": 0.003786806145508859, + "grad_norm": 0.09787080436944962, + "kl": 0.09990701079368591, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1364 + }, + { + "clip_ratio": 0.0006807727040722966, + "epoch": 0.003789582396348675, + "grad_norm": 0.1061621904373169, + "kl": 0.09625846892595291, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1365 + }, + { + "clip_ratio": 0.0002560440043453127, + "epoch": 0.0037923586471884907, + "grad_norm": 0.09652417153120041, + "kl": 0.0922749936580658, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 1366 + }, + { + "clip_ratio": 0.0006315374630503356, + "epoch": 0.0037951348980283066, + "grad_norm": 0.1492643803358078, + "kl": 0.09287701919674873, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 1367 + }, + { + "clip_ratio": 0.0005454363708849996, + "epoch": 0.0037979111488681224, + "grad_norm": 0.12842383980751038, + "kl": 0.0905197411775589, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.39583587646484, + "epoch": 0.0038006873997079383, + "grad_norm": 0.11756516993045807, + "kl": 0.08745963126420975, + "learning_rate": 3e-06, + "loss": -0.01, + "reward": 0.3229166865348816, + "reward_std": 0.29132401943206787, + "rewards/countdown_reward_func": 0.3229166865348816, + "step": 1369, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00017399911303073168, + "epoch": 0.003803463650547754, + "grad_norm": 0.12292972207069397, + "kl": 0.09213906899094582, + "learning_rate": 3e-06, + "loss": -0.0102, + "step": 1370 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0038062399013875704, + "grad_norm": 0.10542229562997818, + "kl": 0.08154939487576485, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 1371 + }, + { + "clip_ratio": 0.0002737229297054, + "epoch": 0.003809016152227386, + "grad_norm": 0.13350439071655273, + "kl": 0.08433039486408234, + "learning_rate": 3e-06, + "loss": -0.0102, + "step": 1372 + }, + { + "clip_ratio": 0.000268786505330354, + "epoch": 0.003811792403067202, + "grad_norm": 0.11377749592065811, + "kl": 0.08954215422272682, + "learning_rate": 3e-06, + "loss": -0.0088, + "step": 1373 + }, + { + "clip_ratio": 0.00017508336168248206, + "epoch": 0.003814568653907018, + "grad_norm": 0.1519603431224823, + "kl": 0.07980604469776154, + "learning_rate": 3e-06, + "loss": -0.0121, + "step": 1374 + }, + { + "clip_ratio": 0.00018208303663413972, + "epoch": 0.0038173449047468337, + "grad_norm": 0.11631568521261215, + "kl": 0.07712772488594055, + "learning_rate": 3e-06, + "loss": -0.0112, + "step": 1375 + }, + { + "clip_ratio": 0.00018214939336758107, + "epoch": 0.0038201211555866496, + "grad_norm": 0.10698658227920532, + "kl": 0.08292505145072937, + "learning_rate": 3e-06, + "loss": -0.0116, + "step": 1376 + }, + { + "clip_ratio": 0.0005369195641833358, + "epoch": 0.0038228974064264654, + "grad_norm": 0.1011744886636734, + "kl": 0.07199421525001526, + "learning_rate": 3e-06, + "loss": -0.0121, + "step": 1377 + }, + { + "clip_ratio": 0.000721444986993447, + "epoch": 0.0038256736572662813, + "grad_norm": 0.11945533007383347, + "kl": 0.07399719953536987, + "learning_rate": 3e-06, + "loss": -0.0122, + "step": 1378 + }, + { + "clip_ratio": 0.0008790974534349516, + "epoch": 0.003828449908106097, + "grad_norm": 0.11251705139875412, + "kl": 0.07932459190487862, + "learning_rate": 3e-06, + "loss": -0.0119, + "step": 1379 + }, + { + "clip_ratio": 0.000354837131453678, + "epoch": 0.003831226158945913, + "grad_norm": 0.15868762135505676, + "kl": 0.07149028778076172, + "learning_rate": 3e-06, + "loss": -0.0146, + "step": 1380 + }, + { + "clip_ratio": 0.00017755682347342372, + "completion_length": 225.33333587646484, + "epoch": 0.0038340024097857288, + "grad_norm": 0.11194837838411331, + "kl": 0.07465841248631477, + "learning_rate": 3e-06, + "loss": -0.0171, + "reward": 0.28541669249534607, + "reward_std": 0.31687821447849274, + "rewards/countdown_reward_func": 0.2854166850447655, + "step": 1381, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0005127792246639729, + "epoch": 0.003836778660625545, + "grad_norm": 0.1955379843711853, + "kl": 0.07209612429141998, + "learning_rate": 3e-06, + "loss": -0.0169, + "step": 1382 + }, + { + "clip_ratio": 8.877841173671186e-05, + "epoch": 0.003839554911465361, + "grad_norm": 0.11438043415546417, + "kl": 0.06967338174581528, + "learning_rate": 3e-06, + "loss": -0.0187, + "step": 1383 + }, + { + "clip_ratio": 0.00017607717745704576, + "epoch": 0.0038423311623051767, + "grad_norm": 0.11129593104124069, + "kl": 0.07491909712553024, + "learning_rate": 3e-06, + "loss": -0.0179, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0038451074131449926, + "grad_norm": 0.0992979034781456, + "kl": 0.07310023903846741, + "learning_rate": 3e-06, + "loss": -0.0178, + "step": 1385 + }, + { + "clip_ratio": 0.0002843922993633896, + "epoch": 0.0038478836639848084, + "grad_norm": 0.12313024699687958, + "kl": 0.0734870582818985, + "learning_rate": 3e-06, + "loss": -0.0174, + "step": 1386 + }, + { + "clip_ratio": 0.0005009273590985686, + "epoch": 0.0038506599148246242, + "grad_norm": 0.12103430181741714, + "kl": 0.06940187513828278, + "learning_rate": 3e-06, + "loss": -0.0189, + "step": 1387 + }, + { + "clip_ratio": 0.0008541795250494033, + "epoch": 0.00385343616566444, + "grad_norm": 0.20041851699352264, + "kl": 0.06606732122600079, + "learning_rate": 3e-06, + "loss": -0.0202, + "step": 1388 + }, + { + "clip_ratio": 0.00030615586729254574, + "epoch": 0.003856212416504256, + "grad_norm": 0.09863422811031342, + "kl": 0.06485064327716827, + "learning_rate": 3e-06, + "loss": -0.0203, + "step": 1389 + }, + { + "clip_ratio": 0.0005214700649958104, + "epoch": 0.0038589886673440718, + "grad_norm": 0.11281978338956833, + "kl": 0.07150644063949585, + "learning_rate": 3e-06, + "loss": -0.021, + "step": 1390 + }, + { + "clip_ratio": 0.0021090117224957794, + "epoch": 0.0038617649181838876, + "grad_norm": 0.09994053095579147, + "kl": 0.07051489502191544, + "learning_rate": 3e-06, + "loss": -0.0197, + "step": 1391 + }, + { + "clip_ratio": 0.0022545086685568094, + "epoch": 0.0038645411690237034, + "grad_norm": 0.12088467925786972, + "kl": 0.07186911255121231, + "learning_rate": 3e-06, + "loss": -0.0191, + "step": 1392 + }, + { + "clip_ratio": 0.0001826150546548888, + "completion_length": 240.52083587646484, + "epoch": 0.0038673174198635197, + "grad_norm": 0.0785083919763565, + "kl": 0.06684068590402603, + "learning_rate": 3e-06, + "loss": 0.0088, + "reward": 0.21250000596046448, + "reward_std": 0.2080453708767891, + "rewards/countdown_reward_func": 0.21250000596046448, + "step": 1393, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.000263084439211525, + "epoch": 0.0038700936707033355, + "grad_norm": 0.07907012850046158, + "kl": 0.062989991158247, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 1394 + }, + { + "clip_ratio": 0.0002763880111160688, + "epoch": 0.0038728699215431514, + "grad_norm": 0.070924311876297, + "kl": 0.07118378207087517, + "learning_rate": 3e-06, + "loss": 0.0093, + "step": 1395 + }, + { + "clip_ratio": 0.0004090342263225466, + "epoch": 0.0038756461723829672, + "grad_norm": 0.06483456492424011, + "kl": 0.062327247112989426, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 1396 + }, + { + "clip_ratio": 0.0008118404075503349, + "epoch": 0.003878422423222783, + "grad_norm": 0.06826819479465485, + "kl": 0.06643109023571014, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1397 + }, + { + "clip_ratio": 0.00035210512578487396, + "epoch": 0.003881198674062599, + "grad_norm": 0.07230556011199951, + "kl": 0.0697788167744875, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1398 + }, + { + "clip_ratio": 0.0006263371469685808, + "epoch": 0.0038839749249024147, + "grad_norm": 0.08306799083948135, + "kl": 0.06783925369381905, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1399 + }, + { + "clip_ratio": 0.00025315712264273316, + "epoch": 0.0038867511757422306, + "grad_norm": 0.08525695651769638, + "kl": 0.06320414319634438, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1400 + }, + { + "clip_ratio": 0.00045900307304691523, + "epoch": 0.0038895274265820464, + "grad_norm": 0.09048361331224442, + "kl": 0.07014712691307068, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 1401 + }, + { + "clip_ratio": 0.0009449293720535934, + "epoch": 0.0038923036774218623, + "grad_norm": 0.06213811784982681, + "kl": 0.06285431608557701, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1402 + }, + { + "clip_ratio": 0.0002441406322759576, + "epoch": 0.003895079928261678, + "grad_norm": 0.061364803463220596, + "kl": 0.0679735615849495, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 1403 + }, + { + "clip_ratio": 0.0007923201483208686, + "epoch": 0.0038978561791014944, + "grad_norm": 0.07901846617460251, + "kl": 0.06937158480286598, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1404 + }, + { + "clip_ratio": 0.00017844396643340588, + "completion_length": 237.95833587646484, + "epoch": 0.00390063242994131, + "grad_norm": 0.09061950445175171, + "kl": 0.06311015225946903, + "learning_rate": 3e-06, + "loss": 0.0072, + "reward": 0.2083333507180214, + "reward_std": 0.21443458646535873, + "rewards/countdown_reward_func": 0.2083333432674408, + "step": 1405, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00017060219397535548, + "epoch": 0.003903408680781126, + "grad_norm": 0.06206053867936134, + "kl": 0.055555472150444984, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1406 + }, + { + "clip_ratio": 9.266123379347846e-05, + "epoch": 0.003906184931620942, + "grad_norm": 0.10189791768789291, + "kl": 0.05937515199184418, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1407 + }, + { + "clip_ratio": 0.0004294630925869569, + "epoch": 0.003908961182460757, + "grad_norm": 0.07696424424648285, + "kl": 0.060661833733320236, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003911737433300574, + "grad_norm": 0.0714978277683258, + "kl": 0.06452694535255432, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1409 + }, + { + "clip_ratio": 0.000443236087448895, + "epoch": 0.00391451368414039, + "grad_norm": 0.1540229171514511, + "kl": 0.06495703011751175, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1410 + }, + { + "clip_ratio": 9.266123379347846e-05, + "epoch": 0.003917289934980205, + "grad_norm": 0.09125962853431702, + "kl": 0.06371764466166496, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 1411 + }, + { + "clip_ratio": 0.0002598241771920584, + "epoch": 0.0039200661858200215, + "grad_norm": 0.0668187364935875, + "kl": 0.05546574853360653, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1412 + }, + { + "clip_ratio": 9.266123379347846e-05, + "epoch": 0.003922842436659837, + "grad_norm": 0.1012551337480545, + "kl": 0.05864310637116432, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1413 + }, + { + "clip_ratio": 9.266123379347846e-05, + "epoch": 0.003925618687499653, + "grad_norm": 0.07103273272514343, + "kl": 0.06015484221279621, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1414 + }, + { + "clip_ratio": 0.0005025817663408816, + "epoch": 0.003928394938339469, + "grad_norm": 0.06943207234144211, + "kl": 0.06314868479967117, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1415 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.003931171189179285, + "grad_norm": 0.10475575923919678, + "kl": 0.06454218924045563, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.2291717529297, + "epoch": 0.0039339474400191, + "grad_norm": 0.0840207114815712, + "kl": 0.06996531412005424, + "learning_rate": 3e-06, + "loss": -0.0023, + "reward": 0.24583332985639572, + "reward_std": 0.3068140149116516, + "rewards/countdown_reward_func": 0.24583332985639572, + "step": 1417, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.000174452448845841, + "epoch": 0.0039367236908589166, + "grad_norm": 0.09196259081363678, + "kl": 0.08129844069480896, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1418 + }, + { + "clip_ratio": 0.0006801302515668795, + "epoch": 0.003939499941698732, + "grad_norm": 0.08758815377950668, + "kl": 0.0710601955652237, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1419 + }, + { + "clip_ratio": 0.0006429070708691142, + "epoch": 0.003942276192538548, + "grad_norm": 0.07932908087968826, + "kl": 0.06780187785625458, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1420 + }, + { + "clip_ratio": 0.00027745863917516544, + "epoch": 0.0039450524433783645, + "grad_norm": 0.10550674796104431, + "kl": 0.0699809230864048, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00394782869421818, + "grad_norm": 0.1297944188117981, + "kl": 0.0661407969892025, + "learning_rate": 3e-06, + "loss": -0.0031, + "step": 1422 + }, + { + "clip_ratio": 0.00017994409427046776, + "epoch": 0.003950604945057996, + "grad_norm": 0.08687513321638107, + "kl": 0.06938713788986206, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1423 + }, + { + "clip_ratio": 8.185985643649474e-05, + "epoch": 0.003953381195897812, + "grad_norm": 0.09864569455385208, + "kl": 0.08142375946044922, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1424 + }, + { + "clip_ratio": 0.00047662161523476243, + "epoch": 0.003956157446737628, + "grad_norm": 0.10956618189811707, + "kl": 0.07189195230603218, + "learning_rate": 3e-06, + "loss": -0.0031, + "step": 1425 + }, + { + "clip_ratio": 0.000174703003722243, + "epoch": 0.003958933697577443, + "grad_norm": 0.08014890551567078, + "kl": 0.0699024386703968, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1426 + }, + { + "clip_ratio": 0.0002667709268280305, + "epoch": 0.0039617099484172595, + "grad_norm": 0.10431994497776031, + "kl": 0.07111196964979172, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1427 + }, + { + "clip_ratio": 8.73515018611215e-05, + "epoch": 0.003964486199257075, + "grad_norm": 0.13337118923664093, + "kl": 0.06857022643089294, + "learning_rate": 3e-06, + "loss": -0.0047, + "step": 1428 + }, + { + "clip_ratio": 0.00010064412344945595, + "completion_length": 244.58334350585938, + "epoch": 0.003967262450096891, + "grad_norm": 0.07497075945138931, + "kl": 0.058519456535577774, + "learning_rate": 3e-06, + "loss": -0.0042, + "reward": 0.22708335518836975, + "reward_std": 0.25706911087036133, + "rewards/countdown_reward_func": 0.22708334028720856, + "step": 1429, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003266488783992827, + "epoch": 0.003970038700936707, + "grad_norm": 0.0831906795501709, + "kl": 0.06262272223830223, + "learning_rate": 3e-06, + "loss": -0.0035, + "step": 1430 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.003972814951776523, + "grad_norm": 0.13976958394050598, + "kl": 0.05951959639787674, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1431 + }, + { + "clip_ratio": 0.0022837779542896897, + "epoch": 0.003975591202616339, + "grad_norm": 0.09820695966482162, + "kl": 0.059285424649715424, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "epoch": 0.003978367453456155, + "grad_norm": 0.08077849447727203, + "kl": 0.05986793525516987, + "learning_rate": 3e-06, + "loss": -0.0047, + "step": 1433 + }, + { + "clip_ratio": 0.00032552084303461015, + "epoch": 0.003981143704295971, + "grad_norm": 0.091177798807621, + "kl": 0.06356241181492805, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 1434 + }, + { + "clip_ratio": 0.0005404362455010414, + "epoch": 0.003983919955135786, + "grad_norm": 0.08302978426218033, + "kl": 0.06063215062022209, + "learning_rate": 3e-06, + "loss": -0.0047, + "step": 1435 + }, + { + "clip_ratio": 0.0004089395733899437, + "epoch": 0.0039866962059756025, + "grad_norm": 0.08915877342224121, + "kl": 0.0654546320438385, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 1436 + }, + { + "clip_ratio": 0.000328657595673576, + "epoch": 0.003989472456815418, + "grad_norm": 0.1467614471912384, + "kl": 0.062131211161613464, + "learning_rate": 3e-06, + "loss": -0.0064, + "step": 1437 + }, + { + "clip_ratio": 0.0012416826793923974, + "epoch": 0.003992248707655234, + "grad_norm": 0.11402739584445953, + "kl": 0.06069890968501568, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 1438 + }, + { + "clip_ratio": 0.00041950895683839917, + "epoch": 0.00399502495849505, + "grad_norm": 0.08940764516592026, + "kl": 0.06148812361061573, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 1439 + }, + { + "clip_ratio": 0.0012557760928757489, + "epoch": 0.003997801209334866, + "grad_norm": 0.08653771132230759, + "kl": 0.06552222743630409, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 1440 + }, + { + "clip_ratio": 0.00010434056457597762, + "completion_length": 232.9791717529297, + "epoch": 0.004000577460174681, + "grad_norm": 0.11438572406768799, + "kl": 0.07905929908156395, + "learning_rate": 3e-06, + "loss": 0.0201, + "reward": 0.4375, + "reward_std": 0.3626646399497986, + "rewards/countdown_reward_func": 0.4375, + "step": 1441, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.004003353711014498, + "grad_norm": 0.13407845795154572, + "kl": 0.07975272089242935, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 1442 + }, + { + "clip_ratio": 0.0002699867181945592, + "epoch": 0.004006129961854314, + "grad_norm": 0.18041589856147766, + "kl": 0.08294541388750076, + "learning_rate": 3e-06, + "loss": 0.0206, + "step": 1443 + }, + { + "clip_ratio": 0.00037823428283445537, + "epoch": 0.004008906212694129, + "grad_norm": 0.10812884569168091, + "kl": 0.07889501005411148, + "learning_rate": 3e-06, + "loss": 0.0205, + "step": 1444 + }, + { + "clip_ratio": 0.00018572076805867255, + "epoch": 0.0040116824635339455, + "grad_norm": 0.10284543037414551, + "kl": 0.07939217984676361, + "learning_rate": 3e-06, + "loss": 0.0199, + "step": 1445 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.004014458714373761, + "grad_norm": 0.1508864164352417, + "kl": 0.07976921647787094, + "learning_rate": 3e-06, + "loss": 0.02, + "step": 1446 + }, + { + "clip_ratio": 0.0001915093234856613, + "epoch": 0.004017234965213577, + "grad_norm": 0.11611857265233994, + "kl": 0.08126603439450264, + "learning_rate": 3e-06, + "loss": 0.0194, + "step": 1447 + }, + { + "clip_ratio": 0.00018572077533463016, + "epoch": 0.004020011216053393, + "grad_norm": 0.1259177178144455, + "kl": 0.08240155875682831, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 1448 + }, + { + "clip_ratio": 0.0006606255192309618, + "epoch": 0.004022787466893209, + "grad_norm": 0.15441936254501343, + "kl": 0.08467500656843185, + "learning_rate": 3e-06, + "loss": 0.0179, + "step": 1449 + }, + { + "clip_ratio": 0.00018383923452347517, + "epoch": 0.004025563717733024, + "grad_norm": 0.11314117908477783, + "kl": 0.0805022120475769, + "learning_rate": 3e-06, + "loss": 0.0187, + "step": 1450 + }, + { + "clip_ratio": 0.0006165863596834242, + "epoch": 0.0040283399685728406, + "grad_norm": 0.11447932571172714, + "kl": 0.08336110040545464, + "learning_rate": 3e-06, + "loss": 0.0186, + "step": 1451 + }, + { + "clip_ratio": 0.0005323204240994528, + "epoch": 0.004031116219412656, + "grad_norm": 0.15156546235084534, + "kl": 0.0839230939745903, + "learning_rate": 3e-06, + "loss": 0.0175, + "step": 1452 + }, + { + "clip_ratio": 8.278145833173767e-05, + "completion_length": 235.14583587646484, + "epoch": 0.004033892470252472, + "grad_norm": 0.1193864643573761, + "kl": 0.08672641590237617, + "learning_rate": 3e-06, + "loss": 0.0317, + "reward": 0.35625001788139343, + "reward_std": 0.3211328834295273, + "rewards/countdown_reward_func": 0.35625001788139343, + "step": 1453, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0040366687210922885, + "grad_norm": 0.12705117464065552, + "kl": 0.08588536828756332, + "learning_rate": 3e-06, + "loss": 0.0305, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004039444971932104, + "grad_norm": 0.10421835631132126, + "kl": 0.08824677020311356, + "learning_rate": 3e-06, + "loss": 0.0301, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00404222122277192, + "grad_norm": 0.12364216893911362, + "kl": 0.08488886058330536, + "learning_rate": 3e-06, + "loss": 0.0296, + "step": 1456 + }, + { + "clip_ratio": 9.170946577796713e-05, + "epoch": 0.004044997473611736, + "grad_norm": 0.11285992711782455, + "kl": 0.08128321543335915, + "learning_rate": 3e-06, + "loss": 0.0301, + "step": 1457 + }, + { + "clip_ratio": 9.137426968663931e-05, + "epoch": 0.004047773724451552, + "grad_norm": 0.09293954819440842, + "kl": 0.0865498036146164, + "learning_rate": 3e-06, + "loss": 0.0286, + "step": 1458 + }, + { + "clip_ratio": 0.00026412875740788877, + "epoch": 0.004050549975291367, + "grad_norm": 0.11421343684196472, + "kl": 0.09296088293194771, + "learning_rate": 3e-06, + "loss": 0.0293, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0040533262261311835, + "grad_norm": 0.11324277520179749, + "kl": 0.09277210384607315, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 1460 + }, + { + "clip_ratio": 0.0001641616690903902, + "epoch": 0.004056102476970999, + "grad_norm": 0.11188522726297379, + "kl": 0.09390882402658463, + "learning_rate": 3e-06, + "loss": 0.0275, + "step": 1461 + }, + { + "clip_ratio": 0.0001641616690903902, + "epoch": 0.004058878727810815, + "grad_norm": 0.11014936119318008, + "kl": 0.09254594147205353, + "learning_rate": 3e-06, + "loss": 0.0265, + "step": 1462 + }, + { + "clip_ratio": 0.0001744909241097048, + "epoch": 0.004061654978650631, + "grad_norm": 0.11003927886486053, + "kl": 0.08989466726779938, + "learning_rate": 3e-06, + "loss": 0.0281, + "step": 1463 + }, + { + "clip_ratio": 8.532423089491203e-05, + "epoch": 0.004064431229490447, + "grad_norm": 0.10190403461456299, + "kl": 0.09649864211678505, + "learning_rate": 3e-06, + "loss": 0.026, + "step": 1464 + }, + { + "clip_ratio": 0.00018395879305899143, + "completion_length": 239.625, + "epoch": 0.004067207480330263, + "grad_norm": 0.09303663671016693, + "kl": 0.09736296534538269, + "learning_rate": 3e-06, + "loss": 0.0134, + "reward": 0.30625003576278687, + "reward_std": 0.2696641683578491, + "rewards/countdown_reward_func": 0.30625003576278687, + "step": 1465, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 9.211496217176318e-05, + "epoch": 0.004069983731170079, + "grad_norm": 0.1130358949303627, + "kl": 0.08961380645632744, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1466 + }, + { + "clip_ratio": 0.00017335961456410587, + "epoch": 0.004072759982009895, + "grad_norm": 0.0847916454076767, + "kl": 0.10012925043702126, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00407553623284971, + "grad_norm": 0.09342540800571442, + "kl": 0.09443873539566994, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 1468 + }, + { + "clip_ratio": 0.000447636324679479, + "epoch": 0.0040783124836895265, + "grad_norm": 0.10130415111780167, + "kl": 0.09730666130781174, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 1469 + }, + { + "clip_ratio": 8.60289073898457e-05, + "epoch": 0.004081088734529342, + "grad_norm": 0.09560680389404297, + "kl": 0.10042464733123779, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 1470 + }, + { + "clip_ratio": 0.00025938851467799395, + "epoch": 0.004083864985369158, + "grad_norm": 0.1030803993344307, + "kl": 0.10246840491890907, + "learning_rate": 3e-06, + "loss": 0.0117, + "step": 1471 + }, + { + "clip_ratio": 0.00017924292478710413, + "epoch": 0.004086641236208974, + "grad_norm": 0.10079611092805862, + "kl": 0.09707974642515182, + "learning_rate": 3e-06, + "loss": 0.0117, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00408941748704879, + "grad_norm": 0.0831371396780014, + "kl": 0.10519356653094292, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004092193737888605, + "grad_norm": 0.1005701944231987, + "kl": 0.09982095658779144, + "learning_rate": 3e-06, + "loss": 0.0109, + "step": 1474 + }, + { + "clip_ratio": 0.0009021220030263066, + "epoch": 0.004094969988728422, + "grad_norm": 0.08770573884248734, + "kl": 0.10152734071016312, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1475 + }, + { + "clip_ratio": 0.0006180326745379716, + "epoch": 0.004097746239568238, + "grad_norm": 0.10259686410427094, + "kl": 0.10511034727096558, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 1476 + }, + { + "clip_ratio": 8.747375977691263e-05, + "completion_length": 225.6875, + "epoch": 0.004100522490408053, + "grad_norm": 0.11141236126422882, + "kl": 0.10264391079545021, + "learning_rate": 3e-06, + "loss": 0.0114, + "reward": 0.26875001937150955, + "reward_std": 0.2696641534566879, + "rewards/countdown_reward_func": 0.26875001937150955, + "step": 1477, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0041032987412478695, + "grad_norm": 0.10607676953077316, + "kl": 0.10159710422158241, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 1478 + }, + { + "clip_ratio": 0.00045908106403658167, + "epoch": 0.004106074992087685, + "grad_norm": 0.12285490334033966, + "kl": 0.09724065288901329, + "learning_rate": 3e-06, + "loss": 0.0121, + "step": 1479 + }, + { + "clip_ratio": 8.915834769140929e-05, + "epoch": 0.004108851242927501, + "grad_norm": 0.10112053155899048, + "kl": 0.09738883748650551, + "learning_rate": 3e-06, + "loss": 0.0111, + "step": 1480 + }, + { + "clip_ratio": 0.000543587921129074, + "epoch": 0.004111627493767317, + "grad_norm": 0.11109442263841629, + "kl": 0.10089835524559021, + "learning_rate": 3e-06, + "loss": 0.0117, + "step": 1481 + }, + { + "clip_ratio": 8.7596352386754e-05, + "epoch": 0.004114403744607133, + "grad_norm": 0.09819518029689789, + "kl": 0.10074819624423981, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 1482 + }, + { + "clip_ratio": 0.00010557432688074186, + "epoch": 0.004117179995446948, + "grad_norm": 0.10844285786151886, + "kl": 0.1009376123547554, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0041199562462867646, + "grad_norm": 0.1014895886182785, + "kl": 0.1020418331027031, + "learning_rate": 3e-06, + "loss": 0.0103, + "step": 1484 + }, + { + "clip_ratio": 0.0003863678648485802, + "epoch": 0.00412273249712658, + "grad_norm": 0.11585193872451782, + "kl": 0.09769553691148758, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1485 + }, + { + "clip_ratio": 0.0004875522281508893, + "epoch": 0.004125508747966396, + "grad_norm": 0.09965824335813522, + "kl": 0.09749152138829231, + "learning_rate": 3e-06, + "loss": 0.0099, + "step": 1486 + }, + { + "clip_ratio": 0.0005232414114288986, + "epoch": 0.0041282849988062125, + "grad_norm": 0.11992790549993515, + "kl": 0.09825573861598969, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 1487 + }, + { + "clip_ratio": 0.00016897656314540654, + "epoch": 0.004131061249646028, + "grad_norm": 0.09515371173620224, + "kl": 0.09882111474871635, + "learning_rate": 3e-06, + "loss": 0.0099, + "step": 1488 + }, + { + "clip_ratio": 9.170946577796713e-05, + "completion_length": 212.7916717529297, + "epoch": 0.004133837500485844, + "grad_norm": 0.06438077986240387, + "kl": 0.1108875423669815, + "learning_rate": 3e-06, + "loss": 0.005, + "reward": 0.16874999925494194, + "reward_std": 0.11910479329526424, + "rewards/countdown_reward_func": 0.16874999925494194, + "step": 1489, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0001817244410631247, + "epoch": 0.00413661375132566, + "grad_norm": 0.07629207521677017, + "kl": 0.1037943847477436, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 1490 + }, + { + "clip_ratio": 9.170946577796713e-05, + "epoch": 0.004139390002165476, + "grad_norm": 0.07570668309926987, + "kl": 0.09897951036691666, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004142166253005291, + "grad_norm": 0.06962047517299652, + "kl": 0.1040000282227993, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0041449425038451075, + "grad_norm": 0.07831595093011856, + "kl": 0.09726639091968536, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1493 + }, + { + "clip_ratio": 0.0001838923490140587, + "epoch": 0.004147718754684923, + "grad_norm": 0.06859088689088821, + "kl": 0.10122519358992577, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1494 + }, + { + "clip_ratio": 0.0007797929574735463, + "epoch": 0.004150495005524739, + "grad_norm": 0.07352989166975021, + "kl": 0.10665607079863548, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1495 + }, + { + "clip_ratio": 8.954155055107549e-05, + "epoch": 0.004153271256364555, + "grad_norm": 0.0793951153755188, + "kl": 0.09884294867515564, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004156047507204371, + "grad_norm": 0.07391367107629776, + "kl": 0.09311042353510857, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1497 + }, + { + "clip_ratio": 0.0002756018075160682, + "epoch": 0.004158823758044187, + "grad_norm": 0.06575078517198563, + "kl": 0.09845591336488724, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 1498 + }, + { + "clip_ratio": 0.00036945813917554915, + "epoch": 0.004161600008884003, + "grad_norm": 0.08721313625574112, + "kl": 0.09197269007563591, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 1499 + }, + { + "clip_ratio": 0.0002712659916142002, + "epoch": 0.004164376259723819, + "grad_norm": 0.07341030240058899, + "kl": 0.09440727904438972, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1500 + }, + { + "clip_ratio": 0.00010064412344945595, + "completion_length": 220.5416717529297, + "epoch": 0.004167152510563634, + "grad_norm": 0.11328040063381195, + "kl": 0.09348515421152115, + "learning_rate": 3e-06, + "loss": 0.0053, + "reward": 0.35625001788139343, + "reward_std": 0.31346985697746277, + "rewards/countdown_reward_func": 0.35625001788139343, + "step": 1501, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0041699287614034505, + "grad_norm": 0.11099179089069366, + "kl": 0.09527323395013809, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 1502 + }, + { + "clip_ratio": 0.00023474179033655673, + "epoch": 0.004172705012243266, + "grad_norm": 0.10442166030406952, + "kl": 0.08588381856679916, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1503 + }, + { + "clip_ratio": 8.992805669549853e-05, + "epoch": 0.004175481263083082, + "grad_norm": 0.13162118196487427, + "kl": 0.08620287477970123, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 1504 + }, + { + "clip_ratio": 0.000326107838191092, + "epoch": 0.004178257513922898, + "grad_norm": 0.11255912482738495, + "kl": 0.08432386443018913, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 1505 + }, + { + "clip_ratio": 0.0004599874228006229, + "epoch": 0.004181033764762714, + "grad_norm": 0.11156965047121048, + "kl": 0.08439022302627563, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1506 + }, + { + "clip_ratio": 9.750390017870814e-05, + "epoch": 0.004183810015602529, + "grad_norm": 0.11413870751857758, + "kl": 0.08592484891414642, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1507 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004186586266442346, + "grad_norm": 0.12828348577022552, + "kl": 0.08880098164081573, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 1508 + }, + { + "clip_ratio": 0.0002807183118420653, + "epoch": 0.004189362517282162, + "grad_norm": 0.10022372007369995, + "kl": 0.07961970940232277, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 1509 + }, + { + "clip_ratio": 0.0004508365091169253, + "epoch": 0.004192138768121977, + "grad_norm": 0.1420833021402359, + "kl": 0.08037307858467102, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1510 + }, + { + "clip_ratio": 0.0011079075193265453, + "epoch": 0.0041949150189617935, + "grad_norm": 0.11328054964542389, + "kl": 0.07894135639071465, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1511 + }, + { + "clip_ratio": 0.0002834866172634065, + "epoch": 0.004197691269801609, + "grad_norm": 0.12972882390022278, + "kl": 0.07961436733603477, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 1512 + }, + { + "clip_ratio": 0.0002615062694530934, + "completion_length": 231.58333587646484, + "epoch": 0.004200467520641425, + "grad_norm": 0.09790017455816269, + "kl": 0.08236554637551308, + "learning_rate": 3e-06, + "loss": 0.0149, + "reward": 0.229166679084301, + "reward_std": 0.19716466218233109, + "rewards/countdown_reward_func": 0.229166679084301, + "step": 1513, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00018543919577496126, + "epoch": 0.004203243771481241, + "grad_norm": 0.07614653557538986, + "kl": 0.09001684933900833, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1514 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.004206020022321057, + "grad_norm": 0.08829134702682495, + "kl": 0.0819174312055111, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 1515 + }, + { + "clip_ratio": 0.0001734273391775787, + "epoch": 0.004208796273160872, + "grad_norm": 0.05850941315293312, + "kl": 0.0837816521525383, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1516 + }, + { + "clip_ratio": 0.00029112402262398973, + "epoch": 0.0042115725240006886, + "grad_norm": 0.07008534669876099, + "kl": 0.08121475949883461, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 1517 + }, + { + "clip_ratio": 0.0001695139508228749, + "epoch": 0.004214348774840504, + "grad_norm": 0.07618328183889389, + "kl": 0.08276839926838875, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1518 + }, + { + "clip_ratio": 0.00034776486427290365, + "epoch": 0.00421712502568032, + "grad_norm": 0.10261756926774979, + "kl": 0.082290880382061, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1519 + }, + { + "clip_ratio": 0.00016469038382638246, + "epoch": 0.0042199012765201365, + "grad_norm": 0.06574001908302307, + "kl": 0.09078622981905937, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004222677527359952, + "grad_norm": 0.08486269414424896, + "kl": 0.08278439193964005, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 1521 + }, + { + "clip_ratio": 0.0001796506403479725, + "epoch": 0.004225453778199768, + "grad_norm": 0.05522237718105316, + "kl": 0.08544733002781868, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1522 + }, + { + "clip_ratio": 0.00026453185273567215, + "epoch": 0.004228230029039584, + "grad_norm": 0.08137688040733337, + "kl": 0.08243484795093536, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 1523 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0042310062798794, + "grad_norm": 0.06071337312459946, + "kl": 0.08487970381975174, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 1524 + }, + { + "clip_ratio": 0.00026819700724445283, + "completion_length": 232.87500762939453, + "epoch": 0.004233782530719215, + "grad_norm": 0.066123828291893, + "kl": 0.08086292445659637, + "learning_rate": 3e-06, + "loss": -0.0006, + "reward": 0.17083334922790527, + "reward_std": 0.11393594369292259, + "rewards/countdown_reward_func": 0.17083334922790527, + "step": 1525, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00016953254089457914, + "epoch": 0.0042365587815590315, + "grad_norm": 0.19338181614875793, + "kl": 0.08114173635840416, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1526 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004239335032398847, + "grad_norm": 0.08377951383590698, + "kl": 0.07633576914668083, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1527 + }, + { + "clip_ratio": 0.0001913186424644664, + "epoch": 0.004242111283238663, + "grad_norm": 0.06688714772462845, + "kl": 0.0815008170902729, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1528 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004244887534078479, + "grad_norm": 0.05774759501218796, + "kl": 0.07802558317780495, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1529 + }, + { + "clip_ratio": 0.0004640174884116277, + "epoch": 0.004247663784918295, + "grad_norm": 0.09573258459568024, + "kl": 0.07392871007323265, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1530 + }, + { + "clip_ratio": 0.00025327454204671085, + "epoch": 0.004250440035758111, + "grad_norm": 0.07546308636665344, + "kl": 0.07837391272187233, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 1531 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004253216286597927, + "grad_norm": 0.21055151522159576, + "kl": 0.07768003270030022, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1532 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004255992537437743, + "grad_norm": 0.0746055543422699, + "kl": 0.07167375087738037, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1533 + }, + { + "clip_ratio": 0.0003103909839410335, + "epoch": 0.004258768788277558, + "grad_norm": 0.07092703133821487, + "kl": 0.076376061886549, + "learning_rate": 3e-06, + "loss": -0.0013, + "step": 1534 + }, + { + "clip_ratio": 0.0007744758331682533, + "epoch": 0.0042615450391173745, + "grad_norm": 0.058808572590351105, + "kl": 0.07276593148708344, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 1535 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00426432128995719, + "grad_norm": 0.07845474034547806, + "kl": 0.06798744946718216, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1536 + }, + { + "clip_ratio": 0.0003383140719961375, + "completion_length": 234.5, + "epoch": 0.004267097540797006, + "grad_norm": 0.11504442989826202, + "kl": 0.06899916008114815, + "learning_rate": 3e-06, + "loss": 0.0036, + "reward": 0.2854166775941849, + "reward_std": 0.2782912850379944, + "rewards/countdown_reward_func": 0.2854166552424431, + "step": 1537, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00025050100521184504, + "epoch": 0.004269873791636822, + "grad_norm": 0.08852005749940872, + "kl": 0.06766302511096, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004272650042476638, + "grad_norm": 0.08253919333219528, + "kl": 0.06684885919094086, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1539 + }, + { + "clip_ratio": 0.0003363504074513912, + "epoch": 0.004275426293316453, + "grad_norm": 0.09681417047977448, + "kl": 0.062055185437202454, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1540 + }, + { + "clip_ratio": 0.0004321035448811017, + "epoch": 0.00427820254415627, + "grad_norm": 0.09291265904903412, + "kl": 0.06612430512905121, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 1541 + }, + { + "clip_ratio": 8.520791016053408e-05, + "epoch": 0.004280978794996086, + "grad_norm": 0.09506966918706894, + "kl": 0.06933099403977394, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004283755045835901, + "grad_norm": 0.11389172077178955, + "kl": 0.0656692385673523, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1543 + }, + { + "clip_ratio": 0.00033449956390541047, + "epoch": 0.0042865312966757175, + "grad_norm": 0.08597692847251892, + "kl": 0.0642948318272829, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1544 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004289307547515533, + "grad_norm": 0.09447018057107925, + "kl": 0.06415480002760887, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 1545 + }, + { + "clip_ratio": 0.0008052970515564084, + "epoch": 0.004292083798355349, + "grad_norm": 0.08938866853713989, + "kl": 0.06041167676448822, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1546 + }, + { + "clip_ratio": 0.00062086500111036, + "epoch": 0.004294860049195165, + "grad_norm": 0.5281029939651489, + "kl": 0.06464917957782745, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1547 + }, + { + "clip_ratio": 0.0005138094347785227, + "epoch": 0.004297636300034981, + "grad_norm": 0.08887960761785507, + "kl": 0.06742081791162491, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1548 + }, + { + "clip_ratio": 0.0002517917600926012, + "completion_length": 237.14584350585938, + "epoch": 0.004300412550874796, + "grad_norm": 0.10000678896903992, + "kl": 0.05254969373345375, + "learning_rate": 3e-06, + "loss": 0.0067, + "reward": 0.2666666954755783, + "reward_std": 0.2623074799776077, + "rewards/countdown_reward_func": 0.2666666880249977, + "step": 1549, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003343000134918839, + "epoch": 0.0043031888017146126, + "grad_norm": 0.09186454117298126, + "kl": 0.056838370859622955, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1550 + }, + { + "clip_ratio": 0.0007483240551664494, + "epoch": 0.004305965052554429, + "grad_norm": 0.09988920390605927, + "kl": 0.05866162106394768, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 1551 + }, + { + "clip_ratio": 0.00024526867491658777, + "epoch": 0.004308741303394244, + "grad_norm": 0.07087462395429611, + "kl": 0.055921848863363266, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1552 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0043115175542340605, + "grad_norm": 0.07110070437192917, + "kl": 0.05466669611632824, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1553 + }, + { + "clip_ratio": 0.00032777692831587046, + "epoch": 0.004314293805073876, + "grad_norm": 0.09251809120178223, + "kl": 0.05664430558681488, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1554 + }, + { + "clip_ratio": 0.00032552083575865254, + "epoch": 0.004317070055913692, + "grad_norm": 0.09662747383117676, + "kl": 0.05329587869346142, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 1555 + }, + { + "clip_ratio": 0.00032552083575865254, + "epoch": 0.004319846306753508, + "grad_norm": 0.10306031256914139, + "kl": 0.056288715451955795, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1556 + }, + { + "clip_ratio": 0.0003680418885778636, + "epoch": 0.004322622557593324, + "grad_norm": 0.08361591398715973, + "kl": 0.06024627387523651, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1557 + }, + { + "clip_ratio": 0.00032552083575865254, + "epoch": 0.004325398808433139, + "grad_norm": 0.06961996108293533, + "kl": 0.05571894347667694, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 1558 + }, + { + "clip_ratio": 9.051412052940577e-05, + "epoch": 0.0043281750592729555, + "grad_norm": 0.20629696547985077, + "kl": 0.05542530678212643, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 1559 + }, + { + "clip_ratio": 0.0005872198089491576, + "epoch": 0.004330951310112771, + "grad_norm": 0.09624534100294113, + "kl": 0.05719461478292942, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.52084350585938, + "epoch": 0.004333727560952587, + "grad_norm": 0.07015831768512726, + "kl": 0.06173841841518879, + "learning_rate": 3e-06, + "loss": -0.0015, + "reward": 0.2666666954755783, + "reward_std": 0.22149831801652908, + "rewards/countdown_reward_func": 0.2666666805744171, + "step": 1561, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00018502863531466573, + "epoch": 0.0043365038117924035, + "grad_norm": 0.07204274088144302, + "kl": 0.0598264392465353, + "learning_rate": 3e-06, + "loss": -0.0018, + "step": 1562 + }, + { + "clip_ratio": 0.00016556291666347533, + "epoch": 0.004339280062632219, + "grad_norm": 0.08235902339220047, + "kl": 0.06111270561814308, + "learning_rate": 3e-06, + "loss": -0.0016, + "step": 1563 + }, + { + "clip_ratio": 0.000179314476554282, + "epoch": 0.004342056313472035, + "grad_norm": 0.08542285859584808, + "kl": 0.06232334300875664, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 1564 + }, + { + "clip_ratio": 0.0001975396226043813, + "epoch": 0.004344832564311851, + "grad_norm": 0.08785772323608398, + "kl": 0.06148502230644226, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1565 + }, + { + "clip_ratio": 0.0001065643664333038, + "epoch": 0.004347608815151667, + "grad_norm": 0.06917145103216171, + "kl": 0.06312652491033077, + "learning_rate": 3e-06, + "loss": -0.0017, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004350385065991482, + "grad_norm": 0.07437288761138916, + "kl": 0.0617559514939785, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1567 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0043531613168312985, + "grad_norm": 0.07470710575580597, + "kl": 0.0611141100525856, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1568 + }, + { + "clip_ratio": 0.00016971942386589944, + "epoch": 0.004355937567671114, + "grad_norm": 0.10094926506280899, + "kl": 0.06114407069981098, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00435871381851093, + "grad_norm": 0.08059901744127274, + "kl": 0.06223343499004841, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1570 + }, + { + "clip_ratio": 0.0002885148787754588, + "epoch": 0.004361490069350746, + "grad_norm": 0.07476944476366043, + "kl": 0.06176266819238663, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1571 + }, + { + "clip_ratio": 0.0001713502424536273, + "epoch": 0.004364266320190562, + "grad_norm": 0.07194018363952637, + "kl": 0.06364806741476059, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.9791717529297, + "epoch": 0.004367042571030378, + "grad_norm": 0.06975971907377243, + "kl": 0.07021701335906982, + "learning_rate": 3e-06, + "loss": 0.0159, + "reward": 0.22500000894069672, + "reward_std": 0.18065783381462097, + "rewards/countdown_reward_func": 0.22500000149011612, + "step": 1573, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004369818821870194, + "grad_norm": 0.06149615719914436, + "kl": 0.07013114541769028, + "learning_rate": 3e-06, + "loss": 0.0162, + "step": 1574 + }, + { + "clip_ratio": 0.0001724217290757224, + "epoch": 0.00437259507271001, + "grad_norm": 0.06460988521575928, + "kl": 0.07087487727403641, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1575 + }, + { + "clip_ratio": 0.0001953798928298056, + "epoch": 0.004375371323549825, + "grad_norm": 0.08817639201879501, + "kl": 0.07189563289284706, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1576 + }, + { + "clip_ratio": 0.00010486577230039984, + "epoch": 0.0043781475743896415, + "grad_norm": 0.0764622911810875, + "kl": 0.06396012753248215, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1577 + }, + { + "clip_ratio": 0.0002441406322759576, + "epoch": 0.004380923825229457, + "grad_norm": 0.06788492202758789, + "kl": 0.06740446761250496, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 1578 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.004383700076069273, + "grad_norm": 0.06214107945561409, + "kl": 0.07128945738077164, + "learning_rate": 3e-06, + "loss": 0.0165, + "step": 1579 + }, + { + "clip_ratio": 0.00024570666573708877, + "epoch": 0.004386476326909089, + "grad_norm": 0.06276597827672958, + "kl": 0.0706591084599495, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 1580 + }, + { + "clip_ratio": 0.00033518214331706986, + "epoch": 0.004389252577748905, + "grad_norm": 0.06644298136234283, + "kl": 0.07160716131329536, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1581 + }, + { + "clip_ratio": 0.00033465474552940577, + "epoch": 0.00439202882858872, + "grad_norm": 0.09485163539648056, + "kl": 0.07267293706536293, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1582 + }, + { + "clip_ratio": 9.104151831706986e-05, + "epoch": 0.0043948050794285366, + "grad_norm": 0.07193972915410995, + "kl": 0.06612110696732998, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 1583 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004397581330268353, + "grad_norm": 0.06304839253425598, + "kl": 0.06994717195630074, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1584 + }, + { + "clip_ratio": 0.00011627907224465162, + "completion_length": 229.9166717529297, + "epoch": 0.004400357581108168, + "grad_norm": 0.16353268921375275, + "kl": 0.07109616324305534, + "learning_rate": 3e-06, + "loss": 0.012, + "reward": 0.3437500298023224, + "reward_std": 0.33480696380138397, + "rewards/countdown_reward_func": 0.3437500298023224, + "step": 1585, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0044031338319479845, + "grad_norm": 0.09060167521238327, + "kl": 0.06814149767160416, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 1586 + }, + { + "clip_ratio": 0.00019944607629440725, + "epoch": 0.0044059100827878, + "grad_norm": 0.11436658352613449, + "kl": 0.06723786145448685, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 1587 + }, + { + "clip_ratio": 0.00029042133246548474, + "epoch": 0.004408686333627616, + "grad_norm": 0.09583216160535812, + "kl": 0.07507549971342087, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004411462584467432, + "grad_norm": 0.08748263865709305, + "kl": 0.07277511805295944, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1589 + }, + { + "clip_ratio": 0.0005822835373692214, + "epoch": 0.004414238835307248, + "grad_norm": 0.10275944322347641, + "kl": 0.06728723272681236, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1590 + }, + { + "clip_ratio": 0.00011627907224465162, + "epoch": 0.004417015086147063, + "grad_norm": 0.15305157005786896, + "kl": 0.0735839381814003, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 1591 + }, + { + "clip_ratio": 0.00043021741294069216, + "epoch": 0.0044197913369868795, + "grad_norm": 0.1069241613149643, + "kl": 0.07472379878163338, + "learning_rate": 3e-06, + "loss": 0.0111, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004422567587826695, + "grad_norm": 0.09179497510194778, + "kl": 0.07055623456835747, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 1593 + }, + { + "clip_ratio": 0.00019944607629440725, + "epoch": 0.004425343838666511, + "grad_norm": 0.09448972344398499, + "kl": 0.08183940127491951, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1594 + }, + { + "clip_ratio": 8.316699677379802e-05, + "epoch": 0.0044281200895063275, + "grad_norm": 0.12228698283433914, + "kl": 0.07783747464418411, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1595 + }, + { + "clip_ratio": 0.0004985033592674881, + "epoch": 0.004430896340346143, + "grad_norm": 0.13479016721248627, + "kl": 0.07179780304431915, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 1596 + }, + { + "clip_ratio": 0.0002777284535113722, + "completion_length": 230.43750762939453, + "epoch": 0.004433672591185959, + "grad_norm": 0.11454746127128601, + "kl": 0.06979512795805931, + "learning_rate": 3e-06, + "loss": 0.0316, + "reward": 0.30416668951511383, + "reward_std": 0.2991009056568146, + "rewards/countdown_reward_func": 0.30416667461395264, + "step": 1597, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00035156837839167565, + "epoch": 0.004436448842025775, + "grad_norm": 0.14501981437206268, + "kl": 0.07220923155546188, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 1598 + }, + { + "clip_ratio": 9.484066686127335e-05, + "epoch": 0.004439225092865591, + "grad_norm": 0.09493423253297806, + "kl": 0.07610296085476875, + "learning_rate": 3e-06, + "loss": 0.0313, + "step": 1599 + }, + { + "clip_ratio": 0.0005380346046877094, + "epoch": 0.004442001343705406, + "grad_norm": 0.10318285971879959, + "kl": 0.07600558176636696, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0044447775945452225, + "grad_norm": 0.1465279459953308, + "kl": 0.07435224205255508, + "learning_rate": 3e-06, + "loss": 0.0301, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004447553845385038, + "grad_norm": 0.132258802652359, + "kl": 0.07526106759905815, + "learning_rate": 3e-06, + "loss": 0.0296, + "step": 1602 + }, + { + "clip_ratio": 0.0003190559073118493, + "epoch": 0.004450330096224854, + "grad_norm": 0.11824331432580948, + "kl": 0.07644139230251312, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 1603 + }, + { + "clip_ratio": 0.00036518805427476764, + "epoch": 0.00445310634706467, + "grad_norm": 0.12642835080623627, + "kl": 0.08045900240540504, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 1604 + }, + { + "clip_ratio": 9.484066686127335e-05, + "epoch": 0.004455882597904486, + "grad_norm": 0.08899746090173721, + "kl": 0.08476192504167557, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 1605 + }, + { + "clip_ratio": 0.00035244174068793654, + "epoch": 0.004458658848744302, + "grad_norm": 0.09743326157331467, + "kl": 0.08727932721376419, + "learning_rate": 3e-06, + "loss": 0.0277, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004461435099584118, + "grad_norm": 0.13737818598747253, + "kl": 0.08706852421164513, + "learning_rate": 3e-06, + "loss": 0.0263, + "step": 1607 + }, + { + "clip_ratio": 0.0005535711825359613, + "epoch": 0.004464211350423934, + "grad_norm": 0.12821559607982635, + "kl": 0.08946230262517929, + "learning_rate": 3e-06, + "loss": 0.0255, + "step": 1608 + }, + { + "clip_ratio": 0.0003540786055964418, + "completion_length": 227.70833587646484, + "epoch": 0.004466987601263749, + "grad_norm": 0.11783099174499512, + "kl": 0.10899167135357857, + "learning_rate": 3e-06, + "loss": -0.0008, + "reward": 0.30416668951511383, + "reward_std": 0.3719724863767624, + "rewards/countdown_reward_func": 0.30416668951511383, + "step": 1609, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.704969124868512e-05, + "epoch": 0.0044697638521035655, + "grad_norm": 0.10847090929746628, + "kl": 0.10389678180217743, + "learning_rate": 3e-06, + "loss": -0.0007, + "step": 1610 + }, + { + "clip_ratio": 0.0006049876101315022, + "epoch": 0.004472540102943381, + "grad_norm": 0.11665836721658707, + "kl": 0.10452697053551674, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 1611 + }, + { + "clip_ratio": 0.000750809209421277, + "epoch": 0.004475316353783197, + "grad_norm": 0.1398657262325287, + "kl": 0.12372316420078278, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004478092604623013, + "grad_norm": 0.09913130849599838, + "kl": 0.11288458853960037, + "learning_rate": 3e-06, + "loss": -0.0017, + "step": 1613 + }, + { + "clip_ratio": 0.00017424796533305198, + "epoch": 0.004480868855462829, + "grad_norm": 0.1439204066991806, + "kl": 0.1121748648583889, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 1614 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.004483645106302644, + "grad_norm": 0.11864045262336731, + "kl": 0.12259047850966454, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0044864213571424606, + "grad_norm": 0.10626986622810364, + "kl": 0.11897166818380356, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1616 + }, + { + "clip_ratio": 0.0007439094333676621, + "epoch": 0.004489197607982277, + "grad_norm": 0.10574115067720413, + "kl": 0.1181485652923584, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 1617 + }, + { + "clip_ratio": 0.000587511618505232, + "epoch": 0.004491973858822092, + "grad_norm": 0.12936261296272278, + "kl": 0.13641805946826935, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 1618 + }, + { + "clip_ratio": 0.0001871362328529358, + "epoch": 0.0044947501096619085, + "grad_norm": 0.09758912771940231, + "kl": 0.12104825302958488, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1619 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.004497526360501724, + "grad_norm": 0.10167786478996277, + "kl": 0.11872199177742004, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 1620 + }, + { + "clip_ratio": 0.0004541333037195727, + "completion_length": 230.33333587646484, + "epoch": 0.00450030261134154, + "grad_norm": 0.09969217330217361, + "kl": 0.13528293371200562, + "learning_rate": 3e-06, + "loss": 0.0064, + "reward": 0.2875000238418579, + "reward_std": 0.2661401033401489, + "rewards/countdown_reward_func": 0.2875000163912773, + "step": 1621, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004503078862181356, + "grad_norm": 0.11825183779001236, + "kl": 0.11860142275691032, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004505855113021172, + "grad_norm": 0.09762841463088989, + "kl": 0.1295238509774208, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1623 + }, + { + "clip_ratio": 0.00018274853937327862, + "epoch": 0.004508631363860987, + "grad_norm": 0.1245664656162262, + "kl": 0.12989997118711472, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1624 + }, + { + "clip_ratio": 0.0004756918060593307, + "epoch": 0.0045114076147008035, + "grad_norm": 0.09959601610898972, + "kl": 0.13353952765464783, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 1625 + }, + { + "clip_ratio": 0.0004383840714581311, + "epoch": 0.004514183865540619, + "grad_norm": 0.10265988856554031, + "kl": 0.1284654214978218, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 1626 + }, + { + "clip_ratio": 0.0007191235781647265, + "epoch": 0.004516960116380435, + "grad_norm": 0.09630168974399567, + "kl": 0.13352051377296448, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1627 + }, + { + "clip_ratio": 0.0001932617014972493, + "epoch": 0.0045197363672202515, + "grad_norm": 0.1130925789475441, + "kl": 0.11547379568219185, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1628 + }, + { + "clip_ratio": 0.00018102824105881155, + "epoch": 0.004522512618060067, + "grad_norm": 0.10465371608734131, + "kl": 0.12602712959051132, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1629 + }, + { + "clip_ratio": 0.0002716227754717693, + "epoch": 0.004525288868899883, + "grad_norm": 0.11785195767879486, + "kl": 0.12726370617747307, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 1630 + }, + { + "clip_ratio": 0.00018102824105881155, + "epoch": 0.004528065119739699, + "grad_norm": 0.1045258566737175, + "kl": 0.12971790879964828, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1631 + }, + { + "clip_ratio": 0.0006421779398806393, + "epoch": 0.004530841370579515, + "grad_norm": 0.10235866159200668, + "kl": 0.12507328763604164, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 1632 + }, + { + "clip_ratio": 9.61538462433964e-05, + "completion_length": 237.31250762939453, + "epoch": 0.00453361762141933, + "grad_norm": 0.10965386778116226, + "kl": 0.11101473122835159, + "learning_rate": 3e-06, + "loss": 0.0247, + "reward": 0.34166668355464935, + "reward_std": 0.3452594429254532, + "rewards/countdown_reward_func": 0.34166668355464935, + "step": 1633, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0002707348467083648, + "epoch": 0.0045363938722591465, + "grad_norm": 0.10254738479852676, + "kl": 0.11892389506101608, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 1634 + }, + { + "clip_ratio": 0.00018317173817194998, + "epoch": 0.004539170123098962, + "grad_norm": 0.1211012601852417, + "kl": 0.11001782864332199, + "learning_rate": 3e-06, + "loss": 0.0254, + "step": 1635 + }, + { + "clip_ratio": 0.0007125435367925093, + "epoch": 0.004541946373938778, + "grad_norm": 0.10486630350351334, + "kl": 0.11037512868642807, + "learning_rate": 3e-06, + "loss": 0.025, + "step": 1636 + }, + { + "clip_ratio": 0.0001923076924867928, + "epoch": 0.004544722624778594, + "grad_norm": 0.11679045855998993, + "kl": 0.11175461858510971, + "learning_rate": 3e-06, + "loss": 0.025, + "step": 1637 + }, + { + "clip_ratio": 0.00017780938651412725, + "epoch": 0.00454749887561841, + "grad_norm": 0.11241600662469864, + "kl": 0.11684262380003929, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 1638 + }, + { + "clip_ratio": 8.383634849451482e-05, + "epoch": 0.004550275126458226, + "grad_norm": 0.12549988925457, + "kl": 0.1113254725933075, + "learning_rate": 3e-06, + "loss": 0.0236, + "step": 1639 + }, + { + "clip_ratio": 0.0003387995529919863, + "epoch": 0.004553051377298042, + "grad_norm": 0.09110260754823685, + "kl": 0.11747504025697708, + "learning_rate": 3e-06, + "loss": 0.0238, + "step": 1640 + }, + { + "clip_ratio": 0.00026678377616917714, + "epoch": 0.004555827628137858, + "grad_norm": 0.12658299505710602, + "kl": 0.11034082993865013, + "learning_rate": 3e-06, + "loss": 0.0242, + "step": 1641 + }, + { + "clip_ratio": 0.0003627651822171174, + "epoch": 0.004558603878977673, + "grad_norm": 0.1093655377626419, + "kl": 0.11209141090512276, + "learning_rate": 3e-06, + "loss": 0.0244, + "step": 1642 + }, + { + "clip_ratio": 9.61538462433964e-05, + "epoch": 0.0045613801298174895, + "grad_norm": 0.11235036700963974, + "kl": 0.11257147789001465, + "learning_rate": 3e-06, + "loss": 0.0237, + "step": 1643 + }, + { + "clip_ratio": 8.890469325706363e-05, + "epoch": 0.004564156380657305, + "grad_norm": 0.11150780320167542, + "kl": 0.11817445978522301, + "learning_rate": 3e-06, + "loss": 0.024, + "step": 1644 + }, + { + "clip_ratio": 0.00017182034207507968, + "completion_length": 218.52083587646484, + "epoch": 0.004566932631497121, + "grad_norm": 0.11840204149484634, + "kl": 0.13194195181131363, + "learning_rate": 3e-06, + "loss": 0.0075, + "reward": 0.3791666626930237, + "reward_std": 0.36280614137649536, + "rewards/countdown_reward_func": 0.3791666626930237, + "step": 1645, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0001797049117158167, + "epoch": 0.004569708882336937, + "grad_norm": 0.10418499261140823, + "kl": 0.13568759709596634, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1646 + }, + { + "clip_ratio": 0.00017531556659378111, + "epoch": 0.004572485133176753, + "grad_norm": 0.13569895923137665, + "kl": 0.1283918097615242, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 1647 + }, + { + "clip_ratio": 0.00020443140238057822, + "epoch": 0.004575261384016568, + "grad_norm": 0.13669107854366302, + "kl": 0.13637378439307213, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0045780376348563846, + "grad_norm": 0.10260222107172012, + "kl": 0.1373545378446579, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 1649 + }, + { + "clip_ratio": 0.0004510040016612038, + "epoch": 0.004580813885696201, + "grad_norm": 0.1269139051437378, + "kl": 0.14053868502378464, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1650 + }, + { + "clip_ratio": 0.0006953472329769284, + "epoch": 0.004583590136536016, + "grad_norm": 0.11335631459951401, + "kl": 0.12832976132631302, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1651 + }, + { + "clip_ratio": 0.0004522776580415666, + "epoch": 0.0045863663873758325, + "grad_norm": 0.12118861079216003, + "kl": 0.13261255621910095, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004589142638215648, + "grad_norm": 0.130536749958992, + "kl": 0.12578046321868896, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1653 + }, + { + "clip_ratio": 0.00020579837291734293, + "epoch": 0.004591918889055464, + "grad_norm": 0.13175055384635925, + "kl": 0.13097375631332397, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1654 + }, + { + "clip_ratio": 0.00038550328463315964, + "epoch": 0.00459469513989528, + "grad_norm": 0.0984969213604927, + "kl": 0.13125114515423775, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 1655 + }, + { + "clip_ratio": 0.0004875610757153481, + "epoch": 0.004597471390735096, + "grad_norm": 0.11173491179943085, + "kl": 0.13282914832234383, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1656 + }, + { + "clip_ratio": 0.0001720578147796914, + "completion_length": 235.7916717529297, + "epoch": 0.004600247641574911, + "grad_norm": 0.13065429031848907, + "kl": 0.10866416990756989, + "learning_rate": 3e-06, + "loss": 0.015, + "reward": 0.30625002831220627, + "reward_std": 0.31206804513931274, + "rewards/countdown_reward_func": 0.3062500134110451, + "step": 1657, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.650519157527015e-05, + "epoch": 0.0046030238924147275, + "grad_norm": 0.09855504333972931, + "kl": 0.10450971871614456, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 1658 + }, + { + "clip_ratio": 0.0006019261782057583, + "epoch": 0.004605800143254543, + "grad_norm": 0.08144383132457733, + "kl": 0.11103589460253716, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1659 + }, + { + "clip_ratio": 0.00030096308910287917, + "epoch": 0.004608576394094359, + "grad_norm": 0.09432277828454971, + "kl": 0.10549774393439293, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 1660 + }, + { + "clip_ratio": 0.0004300602595321834, + "epoch": 0.0046113526449341755, + "grad_norm": 0.09225940704345703, + "kl": 0.108955267816782, + "learning_rate": 3e-06, + "loss": 0.0163, + "step": 1661 + }, + { + "clip_ratio": 0.00045322794176172465, + "epoch": 0.004614128895773991, + "grad_norm": 0.12361875921487808, + "kl": 0.1058424860239029, + "learning_rate": 3e-06, + "loss": 0.0149, + "step": 1662 + }, + { + "clip_ratio": 8.60289073898457e-05, + "epoch": 0.004616905146613807, + "grad_norm": 0.11309605091810226, + "kl": 0.10573908686637878, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 1663 + }, + { + "clip_ratio": 0.00016788540233392268, + "epoch": 0.004619681397453623, + "grad_norm": 0.09809679538011551, + "kl": 0.10098403319716454, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 1664 + }, + { + "clip_ratio": 9.09090886125341e-05, + "epoch": 0.004622457648293439, + "grad_norm": 0.07421303540468216, + "kl": 0.10688856989145279, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 1665 + }, + { + "clip_ratio": 0.00010032102727564052, + "epoch": 0.004625233899133254, + "grad_norm": 0.09634650498628616, + "kl": 0.10281075537204742, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 1666 + }, + { + "clip_ratio": 0.0004270849240128882, + "epoch": 0.0046280101499730705, + "grad_norm": 0.10299887508153915, + "kl": 0.10465852543711662, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 1667 + }, + { + "clip_ratio": 0.0006324783898890018, + "epoch": 0.004630786400812886, + "grad_norm": 0.11212204396724701, + "kl": 0.10365309193730354, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.75, + "epoch": 0.004633562651652702, + "grad_norm": 0.06845124810934067, + "kl": 0.09933054447174072, + "learning_rate": 3e-06, + "loss": -0.008, + "reward": 0.15416667610406876, + "reward_std": 0.1091257855296135, + "rewards/countdown_reward_func": 0.15416667610406876, + "step": 1669, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 8.45165632199496e-05, + "epoch": 0.004636338902492518, + "grad_norm": 0.0753096267580986, + "kl": 0.10704569146037102, + "learning_rate": 3e-06, + "loss": -0.0078, + "step": 1670 + }, + { + "clip_ratio": 0.00035820446646539494, + "epoch": 0.004639115153332334, + "grad_norm": 0.056882914155721664, + "kl": 0.10202248394489288, + "learning_rate": 3e-06, + "loss": -0.008, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00464189140417215, + "grad_norm": 0.0649522915482521, + "kl": 0.0977204330265522, + "learning_rate": 3e-06, + "loss": -0.008, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004644667655011966, + "grad_norm": 0.06856367737054825, + "kl": 0.09061568230390549, + "learning_rate": 3e-06, + "loss": -0.0085, + "step": 1673 + }, + { + "clip_ratio": 0.0005575706818490289, + "epoch": 0.004647443905851782, + "grad_norm": 0.09739033877849579, + "kl": 0.09403853490948677, + "learning_rate": 3e-06, + "loss": -0.0085, + "step": 1674 + }, + { + "clip_ratio": 9.412650251761079e-05, + "epoch": 0.004650220156691597, + "grad_norm": 0.06736251711845398, + "kl": 0.09165050461888313, + "learning_rate": 3e-06, + "loss": -0.0087, + "step": 1675 + }, + { + "clip_ratio": 0.00017174614913528785, + "epoch": 0.0046529964075314135, + "grad_norm": 0.0773087590932846, + "kl": 0.09735730290412903, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 1676 + }, + { + "clip_ratio": 0.001088498393073678, + "epoch": 0.004655772658371229, + "grad_norm": 0.05607219412922859, + "kl": 0.0904586911201477, + "learning_rate": 3e-06, + "loss": -0.009, + "step": 1677 + }, + { + "clip_ratio": 0.0009852994699031115, + "epoch": 0.004658548909211045, + "grad_norm": 0.07360868901014328, + "kl": 0.08624210581183434, + "learning_rate": 3e-06, + "loss": -0.0091, + "step": 1678 + }, + { + "clip_ratio": 0.0013295641401782632, + "epoch": 0.004661325160050861, + "grad_norm": 0.06193244829773903, + "kl": 0.07950897514820099, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 1679 + }, + { + "clip_ratio": 0.001959454733878374, + "epoch": 0.004664101410890677, + "grad_norm": 0.06660173088312149, + "kl": 0.0817050151526928, + "learning_rate": 3e-06, + "loss": -0.0093, + "step": 1680 + }, + { + "clip_ratio": 0.00019872814300470054, + "completion_length": 224.27083587646484, + "epoch": 0.004666877661730492, + "grad_norm": 0.11621461063623428, + "kl": 0.09354111552238464, + "learning_rate": 3e-06, + "loss": -0.0011, + "reward": 0.3020833432674408, + "reward_std": 0.2798703759908676, + "rewards/countdown_reward_func": 0.3020833358168602, + "step": 1681, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 8.644536865176633e-05, + "epoch": 0.0046696539125703086, + "grad_norm": 0.09669879823923111, + "kl": 0.07669191434979439, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 1682 + }, + { + "clip_ratio": 0.0003749770621652715, + "epoch": 0.004672430163410125, + "grad_norm": 0.10819410532712936, + "kl": 0.08729588240385056, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1683 + }, + { + "clip_ratio": 8.716875890968367e-05, + "epoch": 0.00467520641424994, + "grad_norm": 0.08888690173625946, + "kl": 0.07762591540813446, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1684 + }, + { + "clip_ratio": 8.704735228093341e-05, + "epoch": 0.0046779826650897565, + "grad_norm": 0.08127215504646301, + "kl": 0.08648081123828888, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 1685 + }, + { + "clip_ratio": 8.704735228093341e-05, + "epoch": 0.004680758915929572, + "grad_norm": 0.08273155987262726, + "kl": 0.0694781243801117, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1686 + }, + { + "clip_ratio": 0.0006585983792319894, + "epoch": 0.004683535166769388, + "grad_norm": 0.1248389482498169, + "kl": 0.08379602804780006, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1687 + }, + { + "clip_ratio": 0.00037935634463792667, + "epoch": 0.004686311417609204, + "grad_norm": 0.09269773215055466, + "kl": 0.06929580494761467, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1688 + }, + { + "clip_ratio": 0.0001979566877707839, + "epoch": 0.00468908766844902, + "grad_norm": 0.10713022947311401, + "kl": 0.08045705035328865, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1689 + }, + { + "clip_ratio": 0.0003845375686069019, + "epoch": 0.004691863919288835, + "grad_norm": 0.093922920525074, + "kl": 0.07109234854578972, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1690 + }, + { + "clip_ratio": 0.000683231744915247, + "epoch": 0.0046946401701286515, + "grad_norm": 0.08436852693557739, + "kl": 0.07955209910869598, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1691 + }, + { + "clip_ratio": 0.0004670836788136512, + "epoch": 0.004697416420968467, + "grad_norm": 0.07506747543811798, + "kl": 0.06490103155374527, + "learning_rate": 3e-06, + "loss": -0.0018, + "step": 1692 + }, + { + "clip_ratio": 0.0004444956357474439, + "completion_length": 228.9791717529297, + "epoch": 0.004700192671808283, + "grad_norm": 0.08811832219362259, + "kl": 0.07729529216885567, + "learning_rate": 3e-06, + "loss": -0.0021, + "reward": 0.2645833492279053, + "reward_std": 0.23099135607481003, + "rewards/countdown_reward_func": 0.2645833417773247, + "step": 1693, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0047029689226480995, + "grad_norm": 0.13022494316101074, + "kl": 0.07385160028934479, + "learning_rate": 3e-06, + "loss": -0.0017, + "step": 1694 + }, + { + "clip_ratio": 0.0014592632069252431, + "epoch": 0.004705745173487915, + "grad_norm": 0.18172861635684967, + "kl": 0.07337892800569534, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1695 + }, + { + "clip_ratio": 0.0005964136798866093, + "epoch": 0.004708521424327731, + "grad_norm": 0.0807371437549591, + "kl": 0.07234909385442734, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1696 + }, + { + "clip_ratio": 0.0004422512211021967, + "epoch": 0.004711297675167547, + "grad_norm": 0.07396223396062851, + "kl": 0.07509288936853409, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 1697 + }, + { + "clip_ratio": 0.0003952159022446722, + "epoch": 0.004714073926007363, + "grad_norm": 0.09103484451770782, + "kl": 0.08038389682769775, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1698 + }, + { + "clip_ratio": 9.873617818811908e-05, + "epoch": 0.004716850176847178, + "grad_norm": 0.1258394420146942, + "kl": 0.07613859698176384, + "learning_rate": 3e-06, + "loss": -0.0025, + "step": 1699 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0047196264276869945, + "grad_norm": 0.12652093172073364, + "kl": 0.07367661595344543, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1700 + }, + { + "clip_ratio": 0.0015260791988112032, + "epoch": 0.00472240267852681, + "grad_norm": 0.07796800136566162, + "kl": 0.07423160970211029, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1701 + }, + { + "clip_ratio": 0.00018615041335579008, + "epoch": 0.004725178929366626, + "grad_norm": 0.0833539217710495, + "kl": 0.07227658852934837, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1702 + }, + { + "clip_ratio": 0.0006381928396876901, + "epoch": 0.004727955180206442, + "grad_norm": 0.08434394001960754, + "kl": 0.07486193627119064, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1703 + }, + { + "clip_ratio": 0.0013060531928204, + "epoch": 0.004730731431046258, + "grad_norm": 0.07230468839406967, + "kl": 0.08221058547496796, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1704 + }, + { + "clip_ratio": 0.0002603422835818492, + "completion_length": 229.4166717529297, + "epoch": 0.004733507681886074, + "grad_norm": 0.10037482529878616, + "kl": 0.06833513081073761, + "learning_rate": 3e-06, + "loss": 0.0286, + "reward": 0.36249999701976776, + "reward_std": 0.33992572128772736, + "rewards/countdown_reward_func": 0.36249999701976776, + "step": 1705, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0047362839327258896, + "grad_norm": 0.10078221559524536, + "kl": 0.06954888440668583, + "learning_rate": 3e-06, + "loss": 0.0288, + "step": 1706 + }, + { + "clip_ratio": 0.00036663911305367947, + "epoch": 0.004739060183565706, + "grad_norm": 0.08536067605018616, + "kl": 0.07179216295480728, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 1707 + }, + { + "clip_ratio": 0.0001773409530869685, + "epoch": 0.004741836434405521, + "grad_norm": 0.11112114787101746, + "kl": 0.061546871438622475, + "learning_rate": 3e-06, + "loss": 0.0281, + "step": 1708 + }, + { + "clip_ratio": 0.00025751072098501027, + "epoch": 0.0047446126852453375, + "grad_norm": 0.24011927843093872, + "kl": 0.07169570401310921, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 1709 + }, + { + "clip_ratio": 0.0001065643664333038, + "epoch": 0.004747388936085153, + "grad_norm": 0.09349286556243896, + "kl": 0.07777471095323563, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 1710 + }, + { + "clip_ratio": 9.433962259208784e-05, + "epoch": 0.004750165186924969, + "grad_norm": 0.10448563098907471, + "kl": 0.07240425795316696, + "learning_rate": 3e-06, + "loss": 0.0277, + "step": 1711 + }, + { + "clip_ratio": 9.211496217176318e-05, + "epoch": 0.004752941437764785, + "grad_norm": 0.09998632222414017, + "kl": 0.07552265375852585, + "learning_rate": 3e-06, + "loss": 0.0273, + "step": 1712 + }, + { + "clip_ratio": 9.433962259208784e-05, + "epoch": 0.004755717688604601, + "grad_norm": 0.09559866786003113, + "kl": 0.07743804901838303, + "learning_rate": 3e-06, + "loss": 0.0268, + "step": 1713 + }, + { + "clip_ratio": 0.0007562700993730687, + "epoch": 0.004758493939444416, + "grad_norm": 0.10372816026210785, + "kl": 0.06693735346198082, + "learning_rate": 3e-06, + "loss": 0.0273, + "step": 1714 + }, + { + "clip_ratio": 0.0005361997973523103, + "epoch": 0.0047612701902842326, + "grad_norm": 0.13456197082996368, + "kl": 0.07971575111150742, + "learning_rate": 3e-06, + "loss": 0.0266, + "step": 1715 + }, + { + "clip_ratio": 0.00019023237109649926, + "epoch": 0.004764046441124049, + "grad_norm": 0.10750555992126465, + "kl": 0.08734529465436935, + "learning_rate": 3e-06, + "loss": 0.0272, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.375, + "epoch": 0.004766822691963864, + "grad_norm": 0.10056161880493164, + "kl": 0.0764843225479126, + "learning_rate": 3e-06, + "loss": 0.0052, + "reward": 0.3229166939854622, + "reward_std": 0.3206951767206192, + "rewards/countdown_reward_func": 0.3229166939854622, + "step": 1717, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00016971943114185706, + "epoch": 0.0047695989428036805, + "grad_norm": 0.0818033367395401, + "kl": 0.0784323550760746, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1718 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.004772375193643496, + "grad_norm": 0.11514274030923843, + "kl": 0.08092424646019936, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 1719 + }, + { + "clip_ratio": 0.00044265526230446994, + "epoch": 0.004775151444483312, + "grad_norm": 0.08406610041856766, + "kl": 0.07746245339512825, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1720 + }, + { + "clip_ratio": 0.00016971943114185706, + "epoch": 0.004777927695323128, + "grad_norm": 0.0868329182267189, + "kl": 0.08470290526747704, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1721 + }, + { + "clip_ratio": 9.07111752894707e-05, + "epoch": 0.004780703946162944, + "grad_norm": 0.11065360903739929, + "kl": 0.08721869066357613, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004783480197002759, + "grad_norm": 0.09971947222948074, + "kl": 0.08457087725400925, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1723 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0047862564478425755, + "grad_norm": 0.0916123166680336, + "kl": 0.08624141290783882, + "learning_rate": 3e-06, + "loss": 0.0043, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004789032698682391, + "grad_norm": 0.09791970252990723, + "kl": 0.08757534250617027, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 1725 + }, + { + "clip_ratio": 0.00026201604487141594, + "epoch": 0.004791808949522207, + "grad_norm": 0.11371166259050369, + "kl": 0.08409228920936584, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 1726 + }, + { + "clip_ratio": 8.833922038320452e-05, + "epoch": 0.0047945852003620235, + "grad_norm": 0.08852825313806534, + "kl": 0.09211602807044983, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1727 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.004797361451201839, + "grad_norm": 0.08812572062015533, + "kl": 0.0936342217028141, + "learning_rate": 3e-06, + "loss": 0.005, + "step": 1728 + }, + { + "clip_ratio": 8.333333244081587e-05, + "completion_length": 226.58334350585938, + "epoch": 0.004800137702041655, + "grad_norm": 0.08161304146051407, + "kl": 0.10271313786506653, + "learning_rate": 3e-06, + "loss": 0.0168, + "reward": 0.28333336114883423, + "reward_std": 0.21825158596038818, + "rewards/countdown_reward_func": 0.28333333134651184, + "step": 1729, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00017384745297022164, + "epoch": 0.004802913952881471, + "grad_norm": 0.09780708700418472, + "kl": 0.0883183516561985, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004805690203721287, + "grad_norm": 0.18682555854320526, + "kl": 0.09598564729094505, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004808466454561102, + "grad_norm": 0.06634990125894547, + "kl": 0.1002919152379036, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1732 + }, + { + "clip_ratio": 8.947744936449453e-05, + "epoch": 0.0048112427054009185, + "grad_norm": 0.07831425219774246, + "kl": 0.10307565331459045, + "learning_rate": 3e-06, + "loss": 0.0172, + "step": 1733 + }, + { + "clip_ratio": 0.0004471343127079308, + "epoch": 0.004814018956240734, + "grad_norm": 0.09252594411373138, + "kl": 0.09812160581350327, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 1734 + }, + { + "clip_ratio": 0.00027552236133487895, + "epoch": 0.00481679520708055, + "grad_norm": 0.08623359352350235, + "kl": 0.10652950406074524, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004819571457920366, + "grad_norm": 0.08916810154914856, + "kl": 0.0917881578207016, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004822347708760182, + "grad_norm": 0.11515611410140991, + "kl": 0.09990720450878143, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004825123959599998, + "grad_norm": 0.06613731384277344, + "kl": 0.10350186005234718, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1738 + }, + { + "clip_ratio": 9.051412052940577e-05, + "epoch": 0.0048279002104398136, + "grad_norm": 0.08245322108268738, + "kl": 0.10670360550284386, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 1739 + }, + { + "clip_ratio": 8.947744936449453e-05, + "epoch": 0.00483067646127963, + "grad_norm": 0.08258913457393646, + "kl": 0.10113772377371788, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 1740 + }, + { + "clip_ratio": 9.999999747378752e-05, + "completion_length": 238.39584350585938, + "epoch": 0.004833452712119445, + "grad_norm": 0.08632933348417282, + "kl": 0.10816295444965363, + "learning_rate": 3e-06, + "loss": 0.0146, + "reward": 0.3395833671092987, + "reward_std": 0.3079586625099182, + "rewards/countdown_reward_func": 0.3395833522081375, + "step": 1741, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.000183780153747648, + "epoch": 0.0048362289629592615, + "grad_norm": 0.10708604753017426, + "kl": 0.10778484493494034, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 1742 + }, + { + "clip_ratio": 8.468834857922047e-05, + "epoch": 0.004839005213799077, + "grad_norm": 0.11992800235748291, + "kl": 0.10616076365113258, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004841781464638893, + "grad_norm": 0.08565586805343628, + "kl": 0.11514625698328018, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004844557715478709, + "grad_norm": 0.10972116142511368, + "kl": 0.11541270092129707, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004847333966318525, + "grad_norm": 0.11946596950292587, + "kl": 0.11952852457761765, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 1746 + }, + { + "clip_ratio": 0.0003465483241598122, + "epoch": 0.00485011021715834, + "grad_norm": 0.1060066968202591, + "kl": 0.11223845556378365, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 1747 + }, + { + "clip_ratio": 8.468834857922047e-05, + "epoch": 0.0048528864679981566, + "grad_norm": 0.09957956522703171, + "kl": 0.11351840943098068, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004855662718837973, + "grad_norm": 0.10374125838279724, + "kl": 0.11228137835860252, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1749 + }, + { + "clip_ratio": 0.0001720578147796914, + "epoch": 0.004858438969677788, + "grad_norm": 0.08856089413166046, + "kl": 0.121010672301054, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 1750 + }, + { + "clip_ratio": 0.0001767400826793164, + "epoch": 0.0048612152205176045, + "grad_norm": 0.13084955513477325, + "kl": 0.12098020315170288, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 1751 + }, + { + "clip_ratio": 0.00024654832668602467, + "epoch": 0.00486399147135742, + "grad_norm": 0.10557901859283447, + "kl": 0.12543104588985443, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.7916717529297, + "epoch": 0.004866767722197236, + "grad_norm": 0.09326691925525665, + "kl": 0.1041104681789875, + "learning_rate": 3e-06, + "loss": -0.0005, + "reward": 0.3229166865348816, + "reward_std": 0.2270909696817398, + "rewards/countdown_reward_func": 0.3229166716337204, + "step": 1753, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00018162409833166748, + "epoch": 0.004869543973037052, + "grad_norm": 0.1563946008682251, + "kl": 0.10441932827234268, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004872320223876868, + "grad_norm": 0.09415509551763535, + "kl": 0.10655347630381584, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1755 + }, + { + "clip_ratio": 0.0001735860641929321, + "epoch": 0.004875096474716683, + "grad_norm": 0.07791784405708313, + "kl": 0.11189163848757744, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1756 + }, + { + "clip_ratio": 0.0005474452627822757, + "epoch": 0.0048778727255564995, + "grad_norm": 0.07700037211179733, + "kl": 0.11374571919441223, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1757 + }, + { + "clip_ratio": 0.0004572612378979102, + "epoch": 0.004880648976396315, + "grad_norm": 0.09134937077760696, + "kl": 0.1076822318136692, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 1758 + }, + { + "clip_ratio": 0.0005507961686816998, + "epoch": 0.004883425227236131, + "grad_norm": 0.11843130737543106, + "kl": 0.1056605763733387, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0048862014780759475, + "grad_norm": 0.16378162801265717, + "kl": 0.10637029260396957, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 1760 + }, + { + "clip_ratio": 0.00017188674246426672, + "epoch": 0.004888977728915763, + "grad_norm": 0.09075536578893661, + "kl": 0.10630379244685173, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 1761 + }, + { + "clip_ratio": 0.00036519287823466584, + "epoch": 0.004891753979755579, + "grad_norm": 0.07516534626483917, + "kl": 0.11119216680526733, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1762 + }, + { + "clip_ratio": 0.00018491214723326266, + "epoch": 0.004894530230595395, + "grad_norm": 0.07346532493829727, + "kl": 0.1081731989979744, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1763 + }, + { + "clip_ratio": 0.0012658692721743137, + "epoch": 0.004897306481435211, + "grad_norm": 0.09767357259988785, + "kl": 0.10378069430589676, + "learning_rate": 3e-06, + "loss": -0.0013, + "step": 1764 + }, + { + "clip_ratio": 0.000739818497095257, + "completion_length": 228.83333587646484, + "epoch": 0.004900082732275026, + "grad_norm": 0.04857170954346657, + "kl": 0.11081210523843765, + "learning_rate": 3e-06, + "loss": 0.0008, + "reward": 0.13333334028720856, + "reward_std": 0.09824509173631668, + "rewards/countdown_reward_func": 0.13333333283662796, + "step": 1765, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.001252438290975988, + "epoch": 0.0049028589831148425, + "grad_norm": 0.06208095699548721, + "kl": 0.11198806017637253, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1766 + }, + { + "clip_ratio": 0.00036127932253293693, + "epoch": 0.004905635233954658, + "grad_norm": 0.06156253442168236, + "kl": 0.11192040145397186, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1767 + }, + { + "clip_ratio": 0.0002693784481380135, + "epoch": 0.004908411484794474, + "grad_norm": 0.05420385301113129, + "kl": 0.10709425061941147, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1768 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00491118773563429, + "grad_norm": 0.054946351796388626, + "kl": 0.10676027089357376, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 1769 + }, + { + "clip_ratio": 0.0008511265332344919, + "epoch": 0.004913963986474106, + "grad_norm": 0.03839809074997902, + "kl": 0.10732469335198402, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1770 + }, + { + "clip_ratio": 0.0003610541461966932, + "epoch": 0.004916740237313922, + "grad_norm": 0.046104252338409424, + "kl": 0.10469045117497444, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1771 + }, + { + "clip_ratio": 0.0015279441722668707, + "epoch": 0.0049195164881537376, + "grad_norm": 0.06267226487398148, + "kl": 0.10644160956144333, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1772 + }, + { + "clip_ratio": 0.0001966938652913086, + "epoch": 0.004922292738993554, + "grad_norm": 0.05547748878598213, + "kl": 0.10656990110874176, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 1773 + }, + { + "clip_ratio": 0.0008054111385717988, + "epoch": 0.004925068989833369, + "grad_norm": 0.0485399030148983, + "kl": 0.10373036935925484, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1774 + }, + { + "clip_ratio": 0.0011505663569550961, + "epoch": 0.0049278452406731855, + "grad_norm": 0.09875226765871048, + "kl": 0.10132811218500137, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1775 + }, + { + "clip_ratio": 0.0020636909175664186, + "epoch": 0.004930621491513001, + "grad_norm": 0.042701300233602524, + "kl": 0.10125269740819931, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1776 + }, + { + "clip_ratio": 0.0004316127669881098, + "completion_length": 223.7291717529297, + "epoch": 0.004933397742352817, + "grad_norm": 0.37167254090309143, + "kl": 0.10485289990901947, + "learning_rate": 3e-06, + "loss": 0.0251, + "reward": 0.3812500238418579, + "reward_std": 0.350497841835022, + "rewards/countdown_reward_func": 0.3812499940395355, + "step": 1777, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004936173993192633, + "grad_norm": 0.1220475360751152, + "kl": 0.10859641060233116, + "learning_rate": 3e-06, + "loss": 0.0243, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004938950244032449, + "grad_norm": 0.11971476674079895, + "kl": 0.10970620810985565, + "learning_rate": 3e-06, + "loss": 0.0248, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004941726494872264, + "grad_norm": 0.11429105699062347, + "kl": 0.09786089137196541, + "learning_rate": 3e-06, + "loss": 0.0239, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0049445027457120806, + "grad_norm": 0.10652303695678711, + "kl": 0.10274695977568626, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 1781 + }, + { + "clip_ratio": 0.000254739832598716, + "epoch": 0.004947278996551897, + "grad_norm": 0.12241549789905548, + "kl": 0.10112031176686287, + "learning_rate": 3e-06, + "loss": 0.0244, + "step": 1782 + }, + { + "clip_ratio": 0.00017335961456410587, + "epoch": 0.004950055247391712, + "grad_norm": 0.1391787976026535, + "kl": 0.10613827779889107, + "learning_rate": 3e-06, + "loss": 0.0237, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0049528314982315285, + "grad_norm": 0.17388495802879333, + "kl": 0.10883147642016411, + "learning_rate": 3e-06, + "loss": 0.0232, + "step": 1784 + }, + { + "clip_ratio": 0.00029968634044053033, + "epoch": 0.004955607749071344, + "grad_norm": 0.11082387715578079, + "kl": 0.11147769913077354, + "learning_rate": 3e-06, + "loss": 0.0236, + "step": 1785 + }, + { + "clip_ratio": 0.0001846859959186986, + "epoch": 0.00495838399991116, + "grad_norm": 0.11685911566019058, + "kl": 0.09964372962713242, + "learning_rate": 3e-06, + "loss": 0.0226, + "step": 1786 + }, + { + "clip_ratio": 0.00011281588376732543, + "epoch": 0.004961160250750976, + "grad_norm": 0.09485632926225662, + "kl": 0.10617783665657043, + "learning_rate": 3e-06, + "loss": 0.0236, + "step": 1787 + }, + { + "clip_ratio": 0.00043700785317923874, + "epoch": 0.004963936501590792, + "grad_norm": 0.11837552487850189, + "kl": 0.10385986790060997, + "learning_rate": 3e-06, + "loss": 0.0227, + "step": 1788 + }, + { + "clip_ratio": 0.00016897656314540654, + "completion_length": 237.0416717529297, + "epoch": 0.004966712752430607, + "grad_norm": 0.12939836084842682, + "kl": 0.1072714664041996, + "learning_rate": 3e-06, + "loss": 0.0052, + "reward": 0.3437500298023224, + "reward_std": 0.37016281485557556, + "rewards/countdown_reward_func": 0.3437500298023224, + "step": 1789, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0049694890032704235, + "grad_norm": 0.11611343175172806, + "kl": 0.11076124012470245, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 1790 + }, + { + "clip_ratio": 0.0003420523353270255, + "epoch": 0.004972265254110239, + "grad_norm": 0.11028125882148743, + "kl": 0.11113661155104637, + "learning_rate": 3e-06, + "loss": 0.0043, + "step": 1791 + }, + { + "clip_ratio": 0.00017754628788679838, + "epoch": 0.004975041504950055, + "grad_norm": 0.09910908341407776, + "kl": 0.11355148255825043, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 1792 + }, + { + "clip_ratio": 0.00017655367264524102, + "epoch": 0.0049778177557898715, + "grad_norm": 0.0909920260310173, + "kl": 0.115181814879179, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "epoch": 0.004980594006629687, + "grad_norm": 0.08922834694385529, + "kl": 0.10475178062915802, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 1794 + }, + { + "clip_ratio": 8.7596352386754e-05, + "epoch": 0.004983370257469503, + "grad_norm": 0.08417384326457977, + "kl": 0.11183957010507584, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1795 + }, + { + "clip_ratio": 8.7596352386754e-05, + "epoch": 0.004986146508309319, + "grad_norm": 0.11513616889715195, + "kl": 0.11391586065292358, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1796 + }, + { + "clip_ratio": 0.00017655367264524102, + "epoch": 0.004988922759149135, + "grad_norm": 0.09690085053443909, + "kl": 0.1138366088271141, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1797 + }, + { + "clip_ratio": 0.0008805023971945047, + "epoch": 0.00499169900998895, + "grad_norm": 0.1030769795179367, + "kl": 0.11408108472824097, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 1798 + }, + { + "clip_ratio": 0.0005165053516975604, + "epoch": 0.0049944752608287665, + "grad_norm": 0.08906270563602448, + "kl": 0.11490156501531601, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1799 + }, + { + "clip_ratio": 8.827683632262051e-05, + "epoch": 0.004997251511668582, + "grad_norm": 0.09603746235370636, + "kl": 0.10548309236764908, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1800 + }, + { + "clip_ratio": 0.0002600554726086557, + "completion_length": 222.20834350585938, + "epoch": 0.005000027762508398, + "grad_norm": 0.0949837937951088, + "kl": 0.10473751276731491, + "learning_rate": 3e-06, + "loss": 0.0252, + "reward": 0.302083358168602, + "reward_std": 0.2531573101878166, + "rewards/countdown_reward_func": 0.302083358168602, + "step": 1801, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.716875890968367e-05, + "epoch": 0.005002804013348214, + "grad_norm": 0.13353735208511353, + "kl": 0.09882612898945808, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00500558026418803, + "grad_norm": 0.12187909334897995, + "kl": 0.10175449773669243, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005008356515027846, + "grad_norm": 0.13179193437099457, + "kl": 0.10488756746053696, + "learning_rate": 3e-06, + "loss": 0.0246, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0050111327658676616, + "grad_norm": 0.10637587308883667, + "kl": 0.10015484690666199, + "learning_rate": 3e-06, + "loss": 0.0246, + "step": 1805 + }, + { + "clip_ratio": 8.668516238685697e-05, + "epoch": 0.005013909016707478, + "grad_norm": 0.10470699518918991, + "kl": 0.10087545961141586, + "learning_rate": 3e-06, + "loss": 0.0246, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005016685267547293, + "grad_norm": 0.1398010551929474, + "kl": 0.10762974619865417, + "learning_rate": 3e-06, + "loss": 0.0242, + "step": 1807 + }, + { + "clip_ratio": 0.00020646479970309883, + "epoch": 0.0050194615183871095, + "grad_norm": 0.19034115970134735, + "kl": 0.10274334251880646, + "learning_rate": 3e-06, + "loss": 0.0227, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005022237769226925, + "grad_norm": 0.11217880249023438, + "kl": 0.10773766785860062, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 1809 + }, + { + "clip_ratio": 0.00010822511103469878, + "epoch": 0.005025014020066741, + "grad_norm": 0.1465480774641037, + "kl": 0.11147905141115189, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 1810 + }, + { + "clip_ratio": 0.0004162504119449295, + "epoch": 0.005027790270906557, + "grad_norm": 0.0986005887389183, + "kl": 0.10899289697408676, + "learning_rate": 3e-06, + "loss": 0.0235, + "step": 1811 + }, + { + "clip_ratio": 9.788566967472434e-05, + "epoch": 0.005030566521746373, + "grad_norm": 0.13088902831077576, + "kl": 0.1097114235162735, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 1812 + }, + { + "clip_ratio": 0.0002538930275477469, + "completion_length": 213.83333587646484, + "epoch": 0.005033342772586188, + "grad_norm": 0.09580644220113754, + "kl": 0.10691358521580696, + "learning_rate": 3e-06, + "loss": -0.001, + "reward": 0.3958333432674408, + "reward_std": 0.2887437641620636, + "rewards/countdown_reward_func": 0.3958333283662796, + "step": 1813, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0050361190234260046, + "grad_norm": 0.08536209911108017, + "kl": 0.11967135220766068, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1814 + }, + { + "clip_ratio": 0.00019878626335412264, + "epoch": 0.005038895274265821, + "grad_norm": 0.0959966704249382, + "kl": 0.11832546070218086, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 1815 + }, + { + "clip_ratio": 0.0021427306346595287, + "epoch": 0.005041671525105636, + "grad_norm": 0.11329105496406555, + "kl": 0.12149709090590477, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 1816 + }, + { + "clip_ratio": 0.00026538767997408286, + "epoch": 0.0050444477759454525, + "grad_norm": 0.12283118814229965, + "kl": 0.12353919818997383, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1817 + }, + { + "clip_ratio": 0.0004591475590132177, + "epoch": 0.005047224026785268, + "grad_norm": 0.11809934675693512, + "kl": 0.11520658433437347, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 1818 + }, + { + "clip_ratio": 0.00027188926469534636, + "epoch": 0.005050000277625084, + "grad_norm": 0.10198795795440674, + "kl": 0.11487729102373123, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0050527765284649, + "grad_norm": 0.09649267792701721, + "kl": 0.12504612654447556, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1820 + }, + { + "clip_ratio": 0.0004089476424269378, + "epoch": 0.005055552779304716, + "grad_norm": 0.2582434117794037, + "kl": 0.12240016460418701, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 1821 + }, + { + "clip_ratio": 0.0019799493966274895, + "epoch": 0.005058329030144531, + "grad_norm": 0.12102940678596497, + "kl": 0.12317222356796265, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 1822 + }, + { + "clip_ratio": 0.00018400746921543032, + "epoch": 0.0050611052809843475, + "grad_norm": 0.12739579379558563, + "kl": 0.12464824318885803, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 1823 + }, + { + "clip_ratio": 0.00020525451691355556, + "epoch": 0.005063881531824163, + "grad_norm": 0.11777213960886002, + "kl": 0.11386393383145332, + "learning_rate": 3e-06, + "loss": -0.0031, + "step": 1824 + }, + { + "clip_ratio": 0.0003442906090640463, + "completion_length": 233.89584350585938, + "epoch": 0.005066657782663979, + "grad_norm": 0.07550974935293198, + "kl": 0.13742565363645554, + "learning_rate": 3e-06, + "loss": 0.014, + "reward": 0.24791669100522995, + "reward_std": 0.2587834596633911, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 1825, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00044292007805779576, + "epoch": 0.0050694340335037955, + "grad_norm": 0.09571371972560883, + "kl": 0.13907401263713837, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 1826 + }, + { + "clip_ratio": 0.00018455334065947682, + "epoch": 0.005072210284343611, + "grad_norm": 0.11037315428256989, + "kl": 0.13096477836370468, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 1827 + }, + { + "clip_ratio": 0.0003636363544501364, + "epoch": 0.005074986535183427, + "grad_norm": 0.08234788477420807, + "kl": 0.12375519797205925, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005077762786023243, + "grad_norm": 0.12358090281486511, + "kl": 0.1380329132080078, + "learning_rate": 3e-06, + "loss": 0.0136, + "step": 1829 + }, + { + "clip_ratio": 0.0005509999464266002, + "epoch": 0.005080539036863059, + "grad_norm": 0.10022760182619095, + "kl": 0.12769127637147903, + "learning_rate": 3e-06, + "loss": 0.0132, + "step": 1830 + }, + { + "clip_ratio": 0.00044850878475699574, + "epoch": 0.005083315287702874, + "grad_norm": 0.07287811487913132, + "kl": 0.1334082931280136, + "learning_rate": 3e-06, + "loss": 0.0134, + "step": 1831 + }, + { + "clip_ratio": 0.0004338581129559316, + "epoch": 0.0050860915385426905, + "grad_norm": 0.07464426755905151, + "kl": 0.13454636931419373, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1832 + }, + { + "clip_ratio": 0.00045034874347038567, + "epoch": 0.005088867789382506, + "grad_norm": 0.12014682590961456, + "kl": 0.12987623363733292, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 1833 + }, + { + "clip_ratio": 0.00018235020252177492, + "epoch": 0.005091644040222322, + "grad_norm": 0.07914389669895172, + "kl": 0.12284610792994499, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 1834 + }, + { + "clip_ratio": 0.00017130826745415106, + "epoch": 0.0050944202910621385, + "grad_norm": 0.11240369826555252, + "kl": 0.13621504604816437, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1835 + }, + { + "clip_ratio": 0.0006353274511639029, + "epoch": 0.005097196541901954, + "grad_norm": 0.09824000298976898, + "kl": 0.12700487673282623, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.31250762939453, + "epoch": 0.00509997279274177, + "grad_norm": 0.12020771205425262, + "kl": 0.1342645138502121, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.35833336412906647, + "reward_std": 0.37625907361507416, + "rewards/countdown_reward_func": 0.3583333492279053, + "step": 1837, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0051027490435815856, + "grad_norm": 0.17950089275836945, + "kl": 0.1457306444644928, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1838 + }, + { + "clip_ratio": 9.968101949198171e-05, + "epoch": 0.005105525294421402, + "grad_norm": 0.150423064827919, + "kl": 0.1225435845553875, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 1839 + }, + { + "clip_ratio": 0.0003465738918748684, + "epoch": 0.005108301545261217, + "grad_norm": 0.17598502337932587, + "kl": 0.12997641041874886, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1840 + }, + { + "clip_ratio": 0.0003827196196652949, + "epoch": 0.0051110777961010335, + "grad_norm": 0.13662053644657135, + "kl": 0.12417108565568924, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1841 + }, + { + "clip_ratio": 9.904913167702034e-05, + "epoch": 0.005113854046940849, + "grad_norm": 0.1616424322128296, + "kl": 0.12268723547458649, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1842 + }, + { + "clip_ratio": 0.0006147185195004568, + "epoch": 0.005116630297780665, + "grad_norm": 0.1300649344921112, + "kl": 0.12857786938548088, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 1843 + }, + { + "clip_ratio": 0.00027205952210351825, + "epoch": 0.005119406548620481, + "grad_norm": 0.1779082864522934, + "kl": 0.14096860587596893, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1844 + }, + { + "clip_ratio": 0.00035519967786967754, + "epoch": 0.005122182799460297, + "grad_norm": 0.13828246295452118, + "kl": 0.1195240318775177, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1845 + }, + { + "clip_ratio": 0.001379472087137401, + "epoch": 0.005124959050300113, + "grad_norm": 0.16142214834690094, + "kl": 0.12471278756856918, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 1846 + }, + { + "clip_ratio": 0.001573986024595797, + "epoch": 0.0051277353011399286, + "grad_norm": 0.1403919756412506, + "kl": 0.11798427999019623, + "learning_rate": 3e-06, + "loss": -0.0067, + "step": 1847 + }, + { + "clip_ratio": 0.0015037042612675577, + "epoch": 0.005130511551979745, + "grad_norm": 0.1678413301706314, + "kl": 0.1188434436917305, + "learning_rate": 3e-06, + "loss": -0.0067, + "step": 1848 + }, + { + "clip_ratio": 0.0005114346859045327, + "completion_length": 231.4375, + "epoch": 0.00513328780281956, + "grad_norm": 0.06092618405818939, + "kl": 0.11399786919355392, + "learning_rate": 3e-06, + "loss": -0.0017, + "reward": 0.1937500163912773, + "reward_std": 0.15347465500235558, + "rewards/countdown_reward_func": 0.1937500163912773, + "step": 1849, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 8.802816591924056e-05, + "epoch": 0.0051360640536593765, + "grad_norm": 0.08002404868602753, + "kl": 0.10806835442781448, + "learning_rate": 3e-06, + "loss": -0.0025, + "step": 1850 + }, + { + "clip_ratio": 0.0001694083766778931, + "epoch": 0.005138840304499192, + "grad_norm": 0.07181860506534576, + "kl": 0.10447893664240837, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1851 + }, + { + "clip_ratio": 0.0006369501352310181, + "epoch": 0.005141616555339008, + "grad_norm": 0.0680135115981102, + "kl": 0.10594388097524643, + "learning_rate": 3e-06, + "loss": -0.0029, + "step": 1852 + }, + { + "clip_ratio": 0.0009738111984916031, + "epoch": 0.005144392806178824, + "grad_norm": 0.05742673948407173, + "kl": 0.10426938533782959, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1853 + }, + { + "clip_ratio": 0.0006258258072193712, + "epoch": 0.00514716905701864, + "grad_norm": 0.06563463062047958, + "kl": 0.10713209956884384, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 1854 + }, + { + "clip_ratio": 0.001724798115901649, + "epoch": 0.005149945307858455, + "grad_norm": 0.05824752897024155, + "kl": 0.10814163088798523, + "learning_rate": 3e-06, + "loss": -0.0022, + "step": 1855 + }, + { + "clip_ratio": 0.0009082746400963515, + "epoch": 0.0051527215586982715, + "grad_norm": 0.08871683478355408, + "kl": 0.10223837941884995, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 1856 + }, + { + "clip_ratio": 0.0010515126050449908, + "epoch": 0.005155497809538088, + "grad_norm": 0.0660201832652092, + "kl": 0.10071967542171478, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1857 + }, + { + "clip_ratio": 0.002384283463470638, + "epoch": 0.005158274060377903, + "grad_norm": 0.0644756555557251, + "kl": 0.10329937189817429, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1858 + }, + { + "clip_ratio": 0.0010630089382175356, + "epoch": 0.0051610503112177195, + "grad_norm": 0.05238967016339302, + "kl": 0.09916985407471657, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1859 + }, + { + "clip_ratio": 0.0028576954500749707, + "epoch": 0.005163826562057535, + "grad_norm": 0.06547994166612625, + "kl": 0.1038946770131588, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 1860 + }, + { + "clip_ratio": 8.46310067572631e-05, + "completion_length": 236.08333587646484, + "epoch": 0.005166602812897351, + "grad_norm": 0.1372431516647339, + "kl": 0.1432003378868103, + "learning_rate": 3e-06, + "loss": 0.0302, + "reward": 0.3041666969656944, + "reward_std": 0.27210913598537445, + "rewards/countdown_reward_func": 0.3041666969656944, + "step": 1861, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 8.46310067572631e-05, + "epoch": 0.005169379063737167, + "grad_norm": 0.08329330384731293, + "kl": 0.12102905288338661, + "learning_rate": 3e-06, + "loss": 0.0297, + "step": 1862 + }, + { + "clip_ratio": 0.00017888411093736067, + "epoch": 0.005172155314576983, + "grad_norm": 0.10118658095598221, + "kl": 0.13169753551483154, + "learning_rate": 3e-06, + "loss": 0.0301, + "step": 1863 + }, + { + "clip_ratio": 0.0002753450389718637, + "epoch": 0.005174931565416798, + "grad_norm": 0.11060193926095963, + "kl": 0.13022061809897423, + "learning_rate": 3e-06, + "loss": 0.0292, + "step": 1864 + }, + { + "clip_ratio": 0.00017968667816603556, + "epoch": 0.0051777078162566145, + "grad_norm": 0.10579267144203186, + "kl": 0.12778804078698158, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 1865 + }, + { + "clip_ratio": 8.46310067572631e-05, + "epoch": 0.00518048406709643, + "grad_norm": 0.0976695865392685, + "kl": 0.11770972609519958, + "learning_rate": 3e-06, + "loss": 0.0288, + "step": 1866 + }, + { + "clip_ratio": 0.0003618215851020068, + "epoch": 0.005183260317936246, + "grad_norm": 0.09765175729990005, + "kl": 0.14232248812913895, + "learning_rate": 3e-06, + "loss": 0.0292, + "step": 1867 + }, + { + "clip_ratio": 0.00016601121751591563, + "epoch": 0.0051860365687760625, + "grad_norm": 0.0782550796866417, + "kl": 0.1213303953409195, + "learning_rate": 3e-06, + "loss": 0.0291, + "step": 1868 + }, + { + "clip_ratio": 0.0002776276451186277, + "epoch": 0.005188812819615878, + "grad_norm": 0.10285110771656036, + "kl": 0.13297457247972488, + "learning_rate": 3e-06, + "loss": 0.0283, + "step": 1869 + }, + { + "clip_ratio": 0.00018646800162969157, + "epoch": 0.005191589070455694, + "grad_norm": 0.11395300924777985, + "kl": 0.1316339075565338, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 1870 + }, + { + "clip_ratio": 0.0005108660116093233, + "epoch": 0.0051943653212955096, + "grad_norm": 0.0980798751115799, + "kl": 0.12793593108654022, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 1871 + }, + { + "clip_ratio": 9.097525617107749e-05, + "epoch": 0.005197141572135326, + "grad_norm": 0.09456781297922134, + "kl": 0.12113503366708755, + "learning_rate": 3e-06, + "loss": 0.0272, + "step": 1872 + }, + { + "clip_ratio": 8.85896515683271e-05, + "completion_length": 233.06250762939453, + "epoch": 0.005199917822975141, + "grad_norm": 0.15110257267951965, + "kl": 0.11747819557785988, + "learning_rate": 3e-06, + "loss": 0.0183, + "reward": 0.3437500298023224, + "reward_std": 0.31206804513931274, + "rewards/countdown_reward_func": 0.3437500149011612, + "step": 1873, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 9.897070412989706e-05, + "epoch": 0.0052026940738149575, + "grad_norm": 0.11430589109659195, + "kl": 0.1168769858777523, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 1874 + }, + { + "clip_ratio": 8.90313385752961e-05, + "epoch": 0.005205470324654773, + "grad_norm": 0.09721416234970093, + "kl": 0.11751173809170723, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 1875 + }, + { + "clip_ratio": 0.0004218236426822841, + "epoch": 0.005208246575494589, + "grad_norm": 0.09081301838159561, + "kl": 0.11737838387489319, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 1876 + }, + { + "clip_ratio": 0.0003606221798690967, + "epoch": 0.005211022826334405, + "grad_norm": 0.08658932149410248, + "kl": 0.12061323970556259, + "learning_rate": 3e-06, + "loss": 0.0184, + "step": 1877 + }, + { + "clip_ratio": 0.0004417505406308919, + "epoch": 0.005213799077174221, + "grad_norm": 0.13034342229366302, + "kl": 0.11869553104043007, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 1878 + }, + { + "clip_ratio": 0.0003327302838442847, + "epoch": 0.005216575328014037, + "grad_norm": 0.09448829293251038, + "kl": 0.12387342751026154, + "learning_rate": 3e-06, + "loss": 0.0167, + "step": 1879 + }, + { + "clip_ratio": 0.0003734165584319271, + "epoch": 0.0052193515788538525, + "grad_norm": 0.10919336974620819, + "kl": 0.12406548857688904, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1880 + }, + { + "clip_ratio": 8.85896515683271e-05, + "epoch": 0.005222127829693669, + "grad_norm": 0.09010238200426102, + "kl": 0.1251124069094658, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 1881 + }, + { + "clip_ratio": 0.0004870776829193346, + "epoch": 0.005224904080533484, + "grad_norm": 0.08950203657150269, + "kl": 0.1254933625459671, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 1882 + }, + { + "clip_ratio": 0.0005361769872251898, + "epoch": 0.0052276803313733005, + "grad_norm": 0.1491861492395401, + "kl": 0.13046596199274063, + "learning_rate": 3e-06, + "loss": 0.0163, + "step": 1883 + }, + { + "clip_ratio": 0.0005558350239880383, + "epoch": 0.005230456582213116, + "grad_norm": 0.12711560726165771, + "kl": 0.12579327076673508, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1884 + }, + { + "clip_ratio": 0.0005470910182339139, + "completion_length": 228.58333587646484, + "epoch": 0.005233232833052932, + "grad_norm": 0.18066106736660004, + "kl": 0.14193446934223175, + "learning_rate": 3e-06, + "loss": 0.0218, + "reward": 0.34166671335697174, + "reward_std": 0.3784969747066498, + "rewards/countdown_reward_func": 0.34166669845581055, + "step": 1885, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.000280211737845093, + "epoch": 0.005236009083892748, + "grad_norm": 0.1274838149547577, + "kl": 0.13073249906301498, + "learning_rate": 3e-06, + "loss": 0.0227, + "step": 1886 + }, + { + "clip_ratio": 0.00027726277767214924, + "epoch": 0.005238785334732564, + "grad_norm": 0.11818882077932358, + "kl": 0.14046981185674667, + "learning_rate": 3e-06, + "loss": 0.0234, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005241561585572379, + "grad_norm": 0.12307964265346527, + "kl": 0.13810279220342636, + "learning_rate": 3e-06, + "loss": 0.022, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0052443378364121955, + "grad_norm": 0.12825541198253632, + "kl": 0.13892988115549088, + "learning_rate": 3e-06, + "loss": 0.0217, + "step": 1889 + }, + { + "clip_ratio": 9.170946577796713e-05, + "epoch": 0.005247114087252012, + "grad_norm": 0.14815275371074677, + "kl": 0.15306030213832855, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 1890 + }, + { + "clip_ratio": 8.890469325706363e-05, + "epoch": 0.005249890338091827, + "grad_norm": 0.12983007729053497, + "kl": 0.1546284407377243, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 1891 + }, + { + "clip_ratio": 0.00017390426364727318, + "epoch": 0.0052526665889316435, + "grad_norm": 0.20150499045848846, + "kl": 0.13978148996829987, + "learning_rate": 3e-06, + "loss": 0.0197, + "step": 1892 + }, + { + "clip_ratio": 0.0015431393767357804, + "epoch": 0.005255442839771459, + "grad_norm": 0.12668222188949585, + "kl": 0.14928098767995834, + "learning_rate": 3e-06, + "loss": 0.0219, + "step": 1893 + }, + { + "clip_ratio": 0.00017780938651412725, + "epoch": 0.005258219090611275, + "grad_norm": 0.1326785534620285, + "kl": 0.14815984666347504, + "learning_rate": 3e-06, + "loss": 0.0205, + "step": 1894 + }, + { + "clip_ratio": 0.000360042235115543, + "epoch": 0.005260995341451091, + "grad_norm": 0.11318658292293549, + "kl": 0.14500422030687332, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 1895 + }, + { + "clip_ratio": 0.00046394005767069757, + "epoch": 0.005263771592290907, + "grad_norm": 0.17353534698486328, + "kl": 0.1581486538052559, + "learning_rate": 3e-06, + "loss": 0.0198, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.6666717529297, + "epoch": 0.005266547843130722, + "grad_norm": 0.09441768378019333, + "kl": 0.14793231338262558, + "learning_rate": 3e-06, + "loss": 0.0268, + "reward": 0.19375000149011612, + "reward_std": 0.16211743652820587, + "rewards/countdown_reward_func": 0.19375000149011612, + "step": 1897, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.00019291546777822077, + "epoch": 0.0052693240939705385, + "grad_norm": 0.06794694811105728, + "kl": 0.15445201843976974, + "learning_rate": 3e-06, + "loss": 0.0273, + "step": 1898 + }, + { + "clip_ratio": 9.084302291739732e-05, + "epoch": 0.005272100344810354, + "grad_norm": 0.07045197486877441, + "kl": 0.15664274990558624, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 1899 + }, + { + "clip_ratio": 0.0002881958498619497, + "epoch": 0.00527487659565017, + "grad_norm": 0.0634378045797348, + "kl": 0.15410824120044708, + "learning_rate": 3e-06, + "loss": 0.0281, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0052776528464899865, + "grad_norm": 0.07607164233922958, + "kl": 0.14435160160064697, + "learning_rate": 3e-06, + "loss": 0.027, + "step": 1901 + }, + { + "clip_ratio": 0.0004623479617293924, + "epoch": 0.005280429097329802, + "grad_norm": 0.08263766020536423, + "kl": 0.14478224515914917, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 1902 + }, + { + "clip_ratio": 0.0003508084482746199, + "epoch": 0.005283205348169618, + "grad_norm": 0.08324983716011047, + "kl": 0.15262345969676971, + "learning_rate": 3e-06, + "loss": 0.0265, + "step": 1903 + }, + { + "clip_ratio": 0.0002769144412013702, + "epoch": 0.0052859815990094336, + "grad_norm": 0.0647958442568779, + "kl": 0.16446733474731445, + "learning_rate": 3e-06, + "loss": 0.0268, + "step": 1904 + }, + { + "clip_ratio": 0.0002789492136798799, + "epoch": 0.00528875784984925, + "grad_norm": 0.06370534002780914, + "kl": 0.165434792637825, + "learning_rate": 3e-06, + "loss": 0.0274, + "step": 1905 + }, + { + "clip_ratio": 0.0009251071896869689, + "epoch": 0.005291534100689065, + "grad_norm": 0.06624593585729599, + "kl": 0.1653437316417694, + "learning_rate": 3e-06, + "loss": 0.0266, + "step": 1906 + }, + { + "clip_ratio": 0.0005657712754327804, + "epoch": 0.0052943103515288815, + "grad_norm": 0.06858285516500473, + "kl": 0.15655828267335892, + "learning_rate": 3e-06, + "loss": 0.0265, + "step": 1907 + }, + { + "clip_ratio": 0.0008327158866450191, + "epoch": 0.005297086602368697, + "grad_norm": 0.08369874209165573, + "kl": 0.15721774101257324, + "learning_rate": 3e-06, + "loss": 0.0254, + "step": 1908 + }, + { + "clip_ratio": 8.239947055699304e-05, + "completion_length": 220.08334350585938, + "epoch": 0.005299862853208513, + "grad_norm": 0.07524816691875458, + "kl": 0.16231046617031097, + "learning_rate": 3e-06, + "loss": 0.0145, + "reward": 0.1937500163912773, + "reward_std": 0.1958785466849804, + "rewards/countdown_reward_func": 0.1937500163912773, + "step": 1909, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 9.057971328729764e-05, + "epoch": 0.005302639104048329, + "grad_norm": 0.06150933727622032, + "kl": 0.17525289952754974, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1910 + }, + { + "clip_ratio": 0.00010024057701230049, + "epoch": 0.005305415354888145, + "grad_norm": 0.06644804775714874, + "kl": 0.17246182262897491, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005308191605727961, + "grad_norm": 0.11811656504869461, + "kl": 0.18016871809959412, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0053109678565677765, + "grad_norm": 0.06376418471336365, + "kl": 0.16709844022989273, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1913 + }, + { + "clip_ratio": 0.00010469011613167822, + "epoch": 0.005313744107407593, + "grad_norm": 0.055688947439193726, + "kl": 0.16722818464040756, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005316520358247408, + "grad_norm": 0.0814003273844719, + "kl": 0.16963288933038712, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 1915 + }, + { + "clip_ratio": 0.00029551039915531874, + "epoch": 0.0053192966090872245, + "grad_norm": 0.06685936450958252, + "kl": 0.18109215795993805, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 1916 + }, + { + "clip_ratio": 0.00018481432925909758, + "epoch": 0.00532207285992704, + "grad_norm": 0.07375549525022507, + "kl": 0.17562759667634964, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 1917 + }, + { + "clip_ratio": 0.0001017087051877752, + "epoch": 0.005324849110766856, + "grad_norm": 0.12521782517433167, + "kl": 0.18078292161226273, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1918 + }, + { + "clip_ratio": 0.00018628245015861467, + "epoch": 0.005327625361606672, + "grad_norm": 0.06385212391614914, + "kl": 0.1704089492559433, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 1919 + }, + { + "clip_ratio": 0.00010469011613167822, + "epoch": 0.005330401612446488, + "grad_norm": 0.05945051088929176, + "kl": 0.16913989931344986, + "learning_rate": 3e-06, + "loss": 0.014, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.56250762939453, + "epoch": 0.005333177863286303, + "grad_norm": 0.06853771954774857, + "kl": 0.1720345988869667, + "learning_rate": 3e-06, + "loss": 0.0216, + "reward": 0.2500000149011612, + "reward_std": 0.2080453597009182, + "rewards/countdown_reward_func": 0.2500000149011612, + "step": 1921, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00026880551740759984, + "epoch": 0.0053359541141261195, + "grad_norm": 0.08787719905376434, + "kl": 0.16743547469377518, + "learning_rate": 3e-06, + "loss": 0.0207, + "step": 1922 + }, + { + "clip_ratio": 8.567512122681364e-05, + "epoch": 0.005338730364965936, + "grad_norm": 0.1024947464466095, + "kl": 0.16995961219072342, + "learning_rate": 3e-06, + "loss": 0.0217, + "step": 1923 + }, + { + "clip_ratio": 0.0003480182640487328, + "epoch": 0.005341506615805751, + "grad_norm": 0.0839357003569603, + "kl": 0.18032675236463547, + "learning_rate": 3e-06, + "loss": 0.0214, + "step": 1924 + }, + { + "clip_ratio": 0.0002771032159216702, + "epoch": 0.0053442828666455675, + "grad_norm": 0.08104819059371948, + "kl": 0.18133548647165298, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 1925 + }, + { + "clip_ratio": 0.00025895826547639444, + "epoch": 0.005347059117485383, + "grad_norm": 0.10513396561145782, + "kl": 0.16761326789855957, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005349835368325199, + "grad_norm": 0.0935124084353447, + "kl": 0.17295867204666138, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 1927 + }, + { + "clip_ratio": 0.000300093786790967, + "epoch": 0.005352611619165015, + "grad_norm": 0.090276338160038, + "kl": 0.16908128559589386, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 1928 + }, + { + "clip_ratio": 8.555784006603062e-05, + "epoch": 0.005355387870004831, + "grad_norm": 0.09899937361478806, + "kl": 0.17253682017326355, + "learning_rate": 3e-06, + "loss": 0.0213, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005358164120844646, + "grad_norm": 0.09567968547344208, + "kl": 0.18297704309225082, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0053609403716844625, + "grad_norm": 0.07854413986206055, + "kl": 0.18571214377880096, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 1931 + }, + { + "clip_ratio": 0.0004300739456084557, + "epoch": 0.005363716622524278, + "grad_norm": 0.08686288446187973, + "kl": 0.17234565317630768, + "learning_rate": 3e-06, + "loss": 0.0197, + "step": 1932 + }, + { + "clip_ratio": 0.00018758241640171036, + "completion_length": 219.52083587646484, + "epoch": 0.005366492873364094, + "grad_norm": 0.11458230018615723, + "kl": 0.18364715576171875, + "learning_rate": 3e-06, + "loss": 0.0036, + "reward": 0.21250002086162567, + "reward_std": 0.2080453634262085, + "rewards/countdown_reward_func": 0.21250002086162567, + "step": 1933, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00010620220564305782, + "epoch": 0.0053692691242039105, + "grad_norm": 0.22106598317623138, + "kl": 0.1838528960943222, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005372045375043726, + "grad_norm": 0.11077378690242767, + "kl": 0.18076221644878387, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1935 + }, + { + "clip_ratio": 0.0006542036862811074, + "epoch": 0.005374821625883542, + "grad_norm": 0.10001291334629059, + "kl": 0.17611156404018402, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0053775978767233576, + "grad_norm": 0.09241759777069092, + "kl": 0.17259087413549423, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1937 + }, + { + "clip_ratio": 0.00039494471275247633, + "epoch": 0.005380374127563174, + "grad_norm": 0.11034689098596573, + "kl": 0.17264589667320251, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1938 + }, + { + "clip_ratio": 8.196721319109201e-05, + "epoch": 0.005383150378402989, + "grad_norm": 0.11598069220781326, + "kl": 0.1707867681980133, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1939 + }, + { + "clip_ratio": 0.0003757518425118178, + "epoch": 0.0053859266292428055, + "grad_norm": 0.23157401382923126, + "kl": 0.16873134672641754, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 1940 + }, + { + "clip_ratio": 0.0005139116401551291, + "epoch": 0.005388702880082621, + "grad_norm": 0.11090226471424103, + "kl": 0.1620839387178421, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1941 + }, + { + "clip_ratio": 0.0010267609904985875, + "epoch": 0.005391479130922437, + "grad_norm": 0.09474062919616699, + "kl": 0.15658225119113922, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 1942 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.005394255381762253, + "grad_norm": 0.08938335627317429, + "kl": 0.1493806093931198, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1943 + }, + { + "clip_ratio": 0.0003643684176495299, + "epoch": 0.005397031632602069, + "grad_norm": 0.14408725500106812, + "kl": 0.15142924338579178, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1944 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 229.0416717529297, + "epoch": 0.005399807883441885, + "grad_norm": 0.12846609950065613, + "kl": 0.14687072485685349, + "learning_rate": 3e-06, + "loss": 0.0124, + "reward": 0.3020833730697632, + "reward_std": 0.2922677993774414, + "rewards/countdown_reward_func": 0.3020833432674408, + "step": 1945, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0054025841342817005, + "grad_norm": 0.10064707696437836, + "kl": 0.13952406495809555, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 1946 + }, + { + "clip_ratio": 0.0007594309572596103, + "epoch": 0.005405360385121517, + "grad_norm": 0.10994094610214233, + "kl": 0.13837341219186783, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 1947 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005408136635961332, + "grad_norm": 0.09539451450109482, + "kl": 0.1353205442428589, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 1948 + }, + { + "clip_ratio": 0.00025521605130052194, + "epoch": 0.0054109128868011485, + "grad_norm": 0.11715108156204224, + "kl": 0.13513853400945663, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 1949 + }, + { + "clip_ratio": 0.0005876675131730735, + "epoch": 0.005413689137640964, + "grad_norm": 0.11635179072618484, + "kl": 0.12926241755485535, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1950 + }, + { + "clip_ratio": 0.00033659624023130164, + "epoch": 0.00541646538848078, + "grad_norm": 0.13250043988227844, + "kl": 0.1317070946097374, + "learning_rate": 3e-06, + "loss": 0.0108, + "step": 1951 + }, + { + "clip_ratio": 0.0002992119698319584, + "epoch": 0.005419241639320596, + "grad_norm": 0.10431980341672897, + "kl": 0.12609682232141495, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 1952 + }, + { + "clip_ratio": 0.0017799893976189196, + "epoch": 0.005422017890160412, + "grad_norm": 0.1222839429974556, + "kl": 0.12637890875339508, + "learning_rate": 3e-06, + "loss": 0.0103, + "step": 1953 + }, + { + "clip_ratio": 0.0005579154822044075, + "epoch": 0.005424794141000227, + "grad_norm": 0.09429076313972473, + "kl": 0.12555726990103722, + "learning_rate": 3e-06, + "loss": 0.0103, + "step": 1954 + }, + { + "clip_ratio": 0.0017903645930346102, + "epoch": 0.0054275703918400435, + "grad_norm": 0.11087554693222046, + "kl": 0.1258564032614231, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 1955 + }, + { + "clip_ratio": 0.002133891510311514, + "epoch": 0.00543034664267986, + "grad_norm": 0.13041195273399353, + "kl": 0.12211589515209198, + "learning_rate": 3e-06, + "loss": 0.0104, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.1041717529297, + "epoch": 0.005433122893519675, + "grad_norm": 0.09448617696762085, + "kl": 0.11988238245248795, + "learning_rate": 3e-06, + "loss": 0.0148, + "reward": 0.2666666805744171, + "reward_std": 0.22970523685216904, + "rewards/countdown_reward_func": 0.2666666656732559, + "step": 1957, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00017508336168248206, + "epoch": 0.0054358991443594915, + "grad_norm": 0.09333960711956024, + "kl": 0.12271144986152649, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005438675395199307, + "grad_norm": 0.10355954617261887, + "kl": 0.11948385834693909, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 1959 + }, + { + "clip_ratio": 0.00018209777772426605, + "epoch": 0.005441451646039123, + "grad_norm": 0.10874044895172119, + "kl": 0.12741681933403015, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 1960 + }, + { + "clip_ratio": 0.0004606101065292023, + "epoch": 0.005444227896878939, + "grad_norm": 0.11931112408638, + "kl": 0.11942564323544502, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1961 + }, + { + "clip_ratio": 0.00018340400856686756, + "epoch": 0.005447004147718755, + "grad_norm": 0.13169850409030914, + "kl": 0.11675844341516495, + "learning_rate": 3e-06, + "loss": 0.0149, + "step": 1962 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00544978039855857, + "grad_norm": 0.09862768650054932, + "kl": 0.12140379846096039, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 1963 + }, + { + "clip_ratio": 0.0002775423854473047, + "epoch": 0.0054525566493983865, + "grad_norm": 0.10945676267147064, + "kl": 0.12424385920166969, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 1964 + }, + { + "clip_ratio": 0.0005308387917466462, + "epoch": 0.005455332900238202, + "grad_norm": 0.09942898899316788, + "kl": 0.12193005159497261, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1965 + }, + { + "clip_ratio": 0.00037315295776352286, + "epoch": 0.005458109151078018, + "grad_norm": 0.1435389667749405, + "kl": 0.12972797825932503, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1966 + }, + { + "clip_ratio": 0.0011096491580246948, + "epoch": 0.0054608854019178345, + "grad_norm": 0.118125319480896, + "kl": 0.12346281483769417, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 1967 + }, + { + "clip_ratio": 0.00108197086956352, + "epoch": 0.00546366165275765, + "grad_norm": 0.13901278376579285, + "kl": 0.12123304605484009, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1968 + }, + { + "clip_ratio": 0.00017531556659378111, + "completion_length": 226.37500762939453, + "epoch": 0.005466437903597466, + "grad_norm": 0.1171063482761383, + "kl": 0.11619845405220985, + "learning_rate": 3e-06, + "loss": 0.0091, + "reward": 0.3395833522081375, + "reward_std": 0.35036255419254303, + "rewards/countdown_reward_func": 0.3395833373069763, + "step": 1969, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.645061800256371e-05, + "epoch": 0.0054692141544372816, + "grad_norm": 0.1672900915145874, + "kl": 0.12332340329885483, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005471990405277098, + "grad_norm": 0.12007147073745728, + "kl": 0.12126722559332848, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 1971 + }, + { + "clip_ratio": 9.645061800256371e-05, + "epoch": 0.005474766656116913, + "grad_norm": 0.1385733038187027, + "kl": 0.11890603601932526, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0054775429069567295, + "grad_norm": 0.122285395860672, + "kl": 0.12061040103435516, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1973 + }, + { + "clip_ratio": 8.765778329689056e-05, + "epoch": 0.005480319157796545, + "grad_norm": 0.11869718879461288, + "kl": 0.12436644360423088, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 1974 + }, + { + "clip_ratio": 8.765778329689056e-05, + "epoch": 0.005483095408636361, + "grad_norm": 0.11378361284732819, + "kl": 0.11800763756036758, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 1975 + }, + { + "clip_ratio": 0.00017234613187611103, + "epoch": 0.005485871659476177, + "grad_norm": 0.1709638386964798, + "kl": 0.1264483742415905, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005488647910315993, + "grad_norm": 0.118524931371212, + "kl": 0.12333150953054428, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 1977 + }, + { + "clip_ratio": 9.645061800256371e-05, + "epoch": 0.005491424161155809, + "grad_norm": 0.15198098123073578, + "kl": 0.12151569873094559, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0054942004119956245, + "grad_norm": 0.13697625696659088, + "kl": 0.12277953699231148, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1979 + }, + { + "clip_ratio": 0.0002635204582475126, + "epoch": 0.005496976662835441, + "grad_norm": 0.1165957823395729, + "kl": 0.12548280879855156, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.08334350585938, + "epoch": 0.005499752913675256, + "grad_norm": 0.1365557312965393, + "kl": 0.11501205712556839, + "learning_rate": 3e-06, + "loss": -0.0015, + "reward": 0.36250002682209015, + "reward_std": 0.33128294348716736, + "rewards/countdown_reward_func": 0.36250001192092896, + "step": 1981, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00026728439843282104, + "epoch": 0.0055025291645150725, + "grad_norm": 0.1341644525527954, + "kl": 0.11507071927189827, + "learning_rate": 3e-06, + "loss": -0.0015, + "step": 1982 + }, + { + "clip_ratio": 0.00010775862028822303, + "epoch": 0.005505305415354888, + "grad_norm": 0.1273004114627838, + "kl": 0.12233321368694305, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1983 + }, + { + "clip_ratio": 0.0005858765362063423, + "epoch": 0.005508081666194704, + "grad_norm": 0.12228009104728699, + "kl": 0.12594753503799438, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1984 + }, + { + "clip_ratio": 0.00029870675643905997, + "epoch": 0.00551085791703452, + "grad_norm": 0.1282241940498352, + "kl": 0.12114127725362778, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005513634167874336, + "grad_norm": 0.18000636994838715, + "kl": 0.12271250784397125, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 1986 + }, + { + "clip_ratio": 0.0003066251229029149, + "epoch": 0.005516410418714151, + "grad_norm": 0.11781095713376999, + "kl": 0.11188013106584549, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1987 + }, + { + "clip_ratio": 0.0005269177490845323, + "epoch": 0.0055191866695539675, + "grad_norm": 0.13404454290866852, + "kl": 0.1126963309943676, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 1988 + }, + { + "clip_ratio": 0.0003066251229029149, + "epoch": 0.005521962920393784, + "grad_norm": 0.11764881014823914, + "kl": 0.11812717467546463, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1989 + }, + { + "clip_ratio": 0.0011090568150393665, + "epoch": 0.005524739171233599, + "grad_norm": 0.14475159347057343, + "kl": 0.12093603238463402, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1990 + }, + { + "clip_ratio": 0.0010156702192034572, + "epoch": 0.0055275154220734155, + "grad_norm": 0.11484024673700333, + "kl": 0.11511994898319244, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1991 + }, + { + "clip_ratio": 0.0007695165404584259, + "epoch": 0.005530291672913231, + "grad_norm": 0.18602021038532257, + "kl": 0.11601589620113373, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 1992 + }, + { + "clip_ratio": 0.0005422846879810095, + "completion_length": 225.81250762939453, + "epoch": 0.005533067923753047, + "grad_norm": 0.10769283771514893, + "kl": 0.11326644197106361, + "learning_rate": 3e-06, + "loss": 0.0156, + "reward": 0.2854166850447655, + "reward_std": 0.24453017115592957, + "rewards/countdown_reward_func": 0.2854166701436043, + "step": 1993, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0003762734049814753, + "epoch": 0.005535844174592863, + "grad_norm": 0.12362723052501678, + "kl": 0.11308754608035088, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 1994 + }, + { + "clip_ratio": 0.0005234857235336676, + "epoch": 0.005538620425432679, + "grad_norm": 0.0922532007098198, + "kl": 0.10114821046590805, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 1995 + }, + { + "clip_ratio": 8.73515018611215e-05, + "epoch": 0.005541396676272494, + "grad_norm": 0.10349724441766739, + "kl": 0.10615367442369461, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 1996 + }, + { + "clip_ratio": 0.00021079258294776082, + "epoch": 0.0055441729271123105, + "grad_norm": 0.13311462104320526, + "kl": 0.10724882781505585, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1997 + }, + { + "clip_ratio": 0.0001151012911577709, + "epoch": 0.005546949177952126, + "grad_norm": 0.09960220754146576, + "kl": 0.10265998914837837, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 1998 + }, + { + "clip_ratio": 0.0007085143588483334, + "epoch": 0.005549725428791942, + "grad_norm": 0.10948970168828964, + "kl": 0.10900342464447021, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1999 + }, + { + "epoch": 0.0055525016796317585, + "grad_norm": 0.09006191045045853, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 2000 + }, + { + "clip_ratio": 0.0005361156981962267, + "epoch": 0.005555277930471574, + "grad_norm": 0.10735763609409332, + "kl": 0.1035256776958704, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 2001 + }, + { + "clip_ratio": 0.0004768256621900946, + "epoch": 0.00555805418131139, + "grad_norm": 0.09122443944215775, + "kl": 0.10310684517025948, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 2002 + }, + { + "clip_ratio": 0.0020118614193052053, + "epoch": 0.0055608304321512056, + "grad_norm": 0.21058741211891174, + "kl": 0.10488555207848549, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 2003 + }, + { + "clip_ratio": 0.0006240379370865412, + "epoch": 0.005563606682991022, + "grad_norm": 0.09514111280441284, + "kl": 0.10193825513124466, + "learning_rate": 3e-06, + "loss": 0.0136, + "step": 2004 + }, + { + "clip_ratio": 0.0002617926656967029, + "completion_length": 223.08333587646484, + "epoch": 0.005566382933830837, + "grad_norm": 0.08725622296333313, + "kl": 0.11390889436006546, + "learning_rate": 3e-06, + "loss": 0.0051, + "reward": 0.2291666641831398, + "reward_std": 0.16722052544355392, + "rewards/countdown_reward_func": 0.2291666641831398, + "step": 2005, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00020610057981684804, + "epoch": 0.0055691591846706535, + "grad_norm": 0.08379297703504562, + "kl": 0.11956554651260376, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 2006 + }, + { + "clip_ratio": 0.0002910737384809181, + "epoch": 0.005571935435510469, + "grad_norm": 0.0656660795211792, + "kl": 0.10992233455181122, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 2007 + }, + { + "clip_ratio": 8.840169903123751e-05, + "epoch": 0.005574711686350285, + "grad_norm": 0.07707367092370987, + "kl": 0.11566202342510223, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 2008 + }, + { + "clip_ratio": 0.0005598522620857693, + "epoch": 0.005577487937190101, + "grad_norm": 0.0825631394982338, + "kl": 0.11144465208053589, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 2009 + }, + { + "clip_ratio": 8.333333244081587e-05, + "epoch": 0.005580264188029917, + "grad_norm": 0.07131849229335785, + "kl": 0.11648586019873619, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 2010 + }, + { + "clip_ratio": 9.204712841892615e-05, + "epoch": 0.005583040438869733, + "grad_norm": 0.07822870463132858, + "kl": 0.11300336197018623, + "learning_rate": 3e-06, + "loss": 0.0043, + "step": 2011 + }, + { + "clip_ratio": 0.00019509741832735017, + "epoch": 0.0055858166897095485, + "grad_norm": 0.08241821080446243, + "kl": 0.11744438856840134, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 2012 + }, + { + "clip_ratio": 0.0005299464974086732, + "epoch": 0.005588592940549365, + "grad_norm": 0.07519717514514923, + "kl": 0.1080629974603653, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 2013 + }, + { + "clip_ratio": 9.578544268151745e-05, + "epoch": 0.00559136919138918, + "grad_norm": 0.07436896860599518, + "kl": 0.11372564733028412, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 2014 + }, + { + "clip_ratio": 0.0011513839708641171, + "epoch": 0.0055941454422289965, + "grad_norm": 0.07369540631771088, + "kl": 0.10970757156610489, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 2015 + }, + { + "clip_ratio": 0.0004363836196716875, + "epoch": 0.005596921693068812, + "grad_norm": 0.06370438635349274, + "kl": 0.1145344153046608, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 2016 + }, + { + "clip_ratio": 0.0002770941355265677, + "completion_length": 221.2291717529297, + "epoch": 0.005599697943908628, + "grad_norm": 0.07984435558319092, + "kl": 0.10508502274751663, + "learning_rate": 3e-06, + "loss": 0.0025, + "reward": 0.21250000596046448, + "reward_std": 0.17428425326943398, + "rewards/countdown_reward_func": 0.21250000596046448, + "step": 2017, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 9.21828905120492e-05, + "epoch": 0.005602474194748444, + "grad_norm": 0.07923725992441177, + "kl": 0.11465127393603325, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 2018 + }, + { + "clip_ratio": 0.00029520990210585296, + "epoch": 0.00560525044558826, + "grad_norm": 0.08113577216863632, + "kl": 0.10587036609649658, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 2019 + }, + { + "clip_ratio": 0.00039488900802098215, + "epoch": 0.005608026696428075, + "grad_norm": 0.09565461426973343, + "kl": 0.11312683299183846, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0056108029472678915, + "grad_norm": 0.10134981572628021, + "kl": 0.10707443952560425, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 2021 + }, + { + "clip_ratio": 0.000533617683686316, + "epoch": 0.005613579198107708, + "grad_norm": 0.0789194330573082, + "kl": 0.1022312305867672, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 2022 + }, + { + "clip_ratio": 0.0005019560339860618, + "epoch": 0.005616355448947523, + "grad_norm": 0.08137885481119156, + "kl": 0.10183565318584442, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 2023 + }, + { + "clip_ratio": 9.21828905120492e-05, + "epoch": 0.0056191316997873395, + "grad_norm": 0.0739808902144432, + "kl": 0.11150963604450226, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 2024 + }, + { + "clip_ratio": 0.0001953362807398662, + "epoch": 0.005621907950627155, + "grad_norm": 0.07564988732337952, + "kl": 0.10209690034389496, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 2025 + }, + { + "clip_ratio": 0.0004712695226771757, + "epoch": 0.005624684201466971, + "grad_norm": 0.14252831041812897, + "kl": 0.10544009134173393, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 2026 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005627460452306787, + "grad_norm": 0.0803026333451271, + "kl": 0.10230910778045654, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 2027 + }, + { + "clip_ratio": 0.0007070228894008324, + "epoch": 0.005630236703146603, + "grad_norm": 0.08244466781616211, + "kl": 0.0964125283062458, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 2028 + }, + { + "clip_ratio": 0.0004911591531708837, + "completion_length": 220.95833587646484, + "epoch": 0.005633012953986418, + "grad_norm": 0.10779600590467453, + "kl": 0.11588352173566818, + "learning_rate": 3e-06, + "loss": 0.0112, + "reward": 0.30416668206453323, + "reward_std": 0.3633297234773636, + "rewards/countdown_reward_func": 0.30416668206453323, + "step": 2029, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00043572409049374983, + "epoch": 0.0056357892048262345, + "grad_norm": 0.11869516223669052, + "kl": 0.09700796753168106, + "learning_rate": 3e-06, + "loss": 0.0108, + "step": 2030 + }, + { + "clip_ratio": 0.00020404949464136735, + "epoch": 0.00563856545566605, + "grad_norm": 0.09410764276981354, + "kl": 0.10727613791823387, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 2031 + }, + { + "clip_ratio": 0.00020356984896352515, + "epoch": 0.005641341706505866, + "grad_norm": 0.11350057274103165, + "kl": 0.10674090310931206, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 2032 + }, + { + "clip_ratio": 0.0002443792764097452, + "epoch": 0.0056441179573456825, + "grad_norm": 0.12127961963415146, + "kl": 0.10875796526670456, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 2033 + }, + { + "clip_ratio": 0.0003601440985221416, + "epoch": 0.005646894208185498, + "grad_norm": 0.1095552071928978, + "kl": 0.09716677665710449, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 2034 + }, + { + "clip_ratio": 0.0002456332149449736, + "epoch": 0.005649670459025314, + "grad_norm": 0.12184404581785202, + "kl": 0.11015952378511429, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 2035 + }, + { + "clip_ratio": 8.191350207198411e-05, + "epoch": 0.0056524467098651296, + "grad_norm": 0.11499220877885818, + "kl": 0.09228203445672989, + "learning_rate": 3e-06, + "loss": 0.0097, + "step": 2036 + }, + { + "clip_ratio": 0.0001221896382048726, + "epoch": 0.005655222960704946, + "grad_norm": 0.12085867673158646, + "kl": 0.10267635807394981, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 2037 + }, + { + "clip_ratio": 0.0004097962155356072, + "epoch": 0.005657999211544761, + "grad_norm": 0.12743261456489563, + "kl": 0.10104519873857498, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 2038 + }, + { + "clip_ratio": 0.0006971483817324042, + "epoch": 0.0056607754623845775, + "grad_norm": 0.12285710871219635, + "kl": 0.10486527159810066, + "learning_rate": 3e-06, + "loss": 0.0099, + "step": 2039 + }, + { + "clip_ratio": 0.0005238101875875145, + "epoch": 0.005663551713224393, + "grad_norm": 0.1023198664188385, + "kl": 0.09349857643246651, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2040 + }, + { + "clip_ratio": 0.0009150805417448282, + "completion_length": 226.89583587646484, + "epoch": 0.005666327964064209, + "grad_norm": 0.09265701472759247, + "kl": 0.09547650068998337, + "learning_rate": 3e-06, + "loss": 0.0079, + "reward": 0.26875001937150955, + "reward_std": 0.15347465127706528, + "rewards/countdown_reward_func": 0.26875000447034836, + "step": 2041, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005669104214904025, + "grad_norm": 0.050366487354040146, + "kl": 0.10091326385736465, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005671880465743841, + "grad_norm": 0.06937897950410843, + "kl": 0.10959598422050476, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 2043 + }, + { + "clip_ratio": 0.0001739728031679988, + "epoch": 0.005674656716583657, + "grad_norm": 0.06653722375631332, + "kl": 0.09630914404988289, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2044 + }, + { + "clip_ratio": 0.00018076645210385323, + "epoch": 0.0056774329674234725, + "grad_norm": 0.04911844804883003, + "kl": 0.08973881602287292, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2045 + }, + { + "clip_ratio": 0.0002787739722407423, + "epoch": 0.005680209218263289, + "grad_norm": 0.08200076222419739, + "kl": 0.0952768363058567, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 2046 + }, + { + "clip_ratio": 0.0003649073769338429, + "epoch": 0.005682985469103104, + "grad_norm": 0.06790746748447418, + "kl": 0.09479077905416489, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2047 + }, + { + "clip_ratio": 9.15080527192913e-05, + "epoch": 0.0056857617199429205, + "grad_norm": 0.05466790497303009, + "kl": 0.10251379758119583, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2048 + }, + { + "clip_ratio": 0.0002632715040817857, + "epoch": 0.005688537970782736, + "grad_norm": 0.06806403398513794, + "kl": 0.1096310168504715, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 2049 + }, + { + "clip_ratio": 0.0001739728031679988, + "epoch": 0.005691314221622552, + "grad_norm": 0.06310102343559265, + "kl": 0.09831372275948524, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2050 + }, + { + "clip_ratio": 0.0001840161858126521, + "epoch": 0.005694090472462368, + "grad_norm": 0.05473968759179115, + "kl": 0.09245896711945534, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 2051 + }, + { + "clip_ratio": 9.36329597607255e-05, + "epoch": 0.005696866723302184, + "grad_norm": 0.06946438550949097, + "kl": 0.0970330685377121, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.62500762939453, + "epoch": 0.005699642974141999, + "grad_norm": 0.1020989716053009, + "kl": 0.09304828196763992, + "learning_rate": 3e-06, + "loss": 0.0123, + "reward": 0.3395833522081375, + "reward_std": 0.2922678142786026, + "rewards/countdown_reward_func": 0.3395833522081375, + "step": 2053, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0057024192249818155, + "grad_norm": 0.08666249364614487, + "kl": 0.09421010315418243, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 2054 + }, + { + "clip_ratio": 0.00016469038382638246, + "epoch": 0.005705195475821632, + "grad_norm": 0.11592990159988403, + "kl": 0.09231939166784286, + "learning_rate": 3e-06, + "loss": 0.0128, + "step": 2055 + }, + { + "clip_ratio": 0.0002482597410562448, + "epoch": 0.005707971726661447, + "grad_norm": 0.1035403460264206, + "kl": 0.09664532542228699, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 2056 + }, + { + "clip_ratio": 0.00016687953029759228, + "epoch": 0.0057107479775012635, + "grad_norm": 0.1996307075023651, + "kl": 0.09553533047437668, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 2057 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005713524228341079, + "grad_norm": 0.09972159564495087, + "kl": 0.10145417600870132, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005716300479180895, + "grad_norm": 0.10573241114616394, + "kl": 0.09803595021367073, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 2059 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005719076730020711, + "grad_norm": 0.08910799026489258, + "kl": 0.09924589842557907, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 2060 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005721852980860527, + "grad_norm": 0.11156439036130905, + "kl": 0.09876829758286476, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 2061 + }, + { + "clip_ratio": 0.00035182230203645304, + "epoch": 0.005724629231700342, + "grad_norm": 0.11116062849760056, + "kl": 0.10230807960033417, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 2062 + }, + { + "clip_ratio": 0.00016687953029759228, + "epoch": 0.0057274054825401585, + "grad_norm": 0.11640466749668121, + "kl": 0.10202505066990852, + "learning_rate": 3e-06, + "loss": 0.0104, + "step": 2063 + }, + { + "clip_ratio": 0.0002663229824975133, + "epoch": 0.005730181733379974, + "grad_norm": 0.20945513248443604, + "kl": 0.10810398682951927, + "learning_rate": 3e-06, + "loss": 0.0113, + "step": 2064 + }, + { + "clip_ratio": 8.223684562835842e-05, + "completion_length": 233.02084350585938, + "epoch": 0.00573295798421979, + "grad_norm": 0.10434425622224808, + "kl": 0.11239994317293167, + "learning_rate": 3e-06, + "loss": -0.0028, + "reward": 0.32083335518836975, + "reward_std": 0.3441803753376007, + "rewards/countdown_reward_func": 0.32083334028720856, + "step": 2065, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0057357342350596065, + "grad_norm": 0.11738111078739166, + "kl": 0.11531775444746017, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 2066 + }, + { + "clip_ratio": 0.0006976053700782359, + "epoch": 0.005738510485899422, + "grad_norm": 0.09144604206085205, + "kl": 0.11997786909341812, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 2067 + }, + { + "clip_ratio": 0.0002564237511251122, + "epoch": 0.005741286736739238, + "grad_norm": 0.15023760497570038, + "kl": 0.11250534653663635, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 2068 + }, + { + "clip_ratio": 0.0003439874417381361, + "epoch": 0.0057440629875790536, + "grad_norm": 0.0950041189789772, + "kl": 0.1112917885184288, + "learning_rate": 3e-06, + "loss": -0.0029, + "step": 2069 + }, + { + "clip_ratio": 8.327781688421965e-05, + "epoch": 0.00574683923841887, + "grad_norm": 0.1445959061384201, + "kl": 0.12204555794596672, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 2070 + }, + { + "clip_ratio": 9.09090886125341e-05, + "epoch": 0.005749615489258685, + "grad_norm": 0.10922154039144516, + "kl": 0.1177988089621067, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 2071 + }, + { + "clip_ratio": 0.0002721360942814499, + "epoch": 0.0057523917400985015, + "grad_norm": 0.10157906264066696, + "kl": 0.11939038708806038, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 2072 + }, + { + "clip_ratio": 0.0011802471126429737, + "epoch": 0.005755167990938317, + "grad_norm": 0.10021565854549408, + "kl": 0.12260624766349792, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 2073 + }, + { + "clip_ratio": 0.0005078699323348701, + "epoch": 0.005757944241778133, + "grad_norm": 0.14854243397712708, + "kl": 0.11457186192274094, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 2074 + }, + { + "clip_ratio": 0.0005989633646095172, + "epoch": 0.005760720492617949, + "grad_norm": 0.10022560507059097, + "kl": 0.11323418468236923, + "learning_rate": 3e-06, + "loss": -0.0044, + "step": 2075 + }, + { + "clip_ratio": 0.0011352297442499548, + "epoch": 0.005763496743457765, + "grad_norm": 0.14965884387493134, + "kl": 0.12618472799658775, + "learning_rate": 3e-06, + "loss": -0.0045, + "step": 2076 + }, + { + "clip_ratio": 9.689922444522381e-05, + "completion_length": 221.6041717529297, + "epoch": 0.005766272994297581, + "grad_norm": 0.11252445727586746, + "kl": 0.11330411210656166, + "learning_rate": 3e-06, + "loss": 0.0338, + "reward": 0.26875001937150955, + "reward_std": 0.2272602580487728, + "rewards/countdown_reward_func": 0.26875001937150955, + "step": 2077, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.0002688652166398242, + "epoch": 0.0057690492451373965, + "grad_norm": 0.11611206829547882, + "kl": 0.11921777203679085, + "learning_rate": 3e-06, + "loss": 0.0351, + "step": 2078 + }, + { + "clip_ratio": 0.00021403797290986404, + "epoch": 0.005771825495977213, + "grad_norm": 0.09081734716892242, + "kl": 0.11672532185912132, + "learning_rate": 3e-06, + "loss": 0.0339, + "step": 2079 + }, + { + "clip_ratio": 0.0006553223647642881, + "epoch": 0.005774601746817028, + "grad_norm": 0.11462453752756119, + "kl": 0.12759307771921158, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 2080 + }, + { + "clip_ratio": 0.0002815326370182447, + "epoch": 0.0057773779976568445, + "grad_norm": 0.11553678661584854, + "kl": 0.11894625052809715, + "learning_rate": 3e-06, + "loss": 0.0344, + "step": 2081 + }, + { + "clip_ratio": 0.00038389285327866673, + "epoch": 0.00578015424849666, + "grad_norm": 0.13438205420970917, + "kl": 0.13819441944360733, + "learning_rate": 3e-06, + "loss": 0.0343, + "step": 2082 + }, + { + "clip_ratio": 0.00019379844889044762, + "epoch": 0.005782930499336476, + "grad_norm": 0.09481407701969147, + "kl": 0.12022696807980537, + "learning_rate": 3e-06, + "loss": 0.0335, + "step": 2083 + }, + { + "clip_ratio": 0.00046928575466154143, + "epoch": 0.005785706750176292, + "grad_norm": 0.11112938076257706, + "kl": 0.12938351184129715, + "learning_rate": 3e-06, + "loss": 0.034, + "step": 2084 + }, + { + "clip_ratio": 0.0005152427984285168, + "epoch": 0.005788483001016108, + "grad_norm": 0.08531668782234192, + "kl": 0.12835099548101425, + "learning_rate": 3e-06, + "loss": 0.0329, + "step": 2085 + }, + { + "clip_ratio": 0.0012749898305628449, + "epoch": 0.005791259251855923, + "grad_norm": 0.14292016625404358, + "kl": 0.14164124429225922, + "learning_rate": 3e-06, + "loss": 0.0331, + "step": 2086 + }, + { + "clip_ratio": 0.0004016064340248704, + "epoch": 0.0057940355026957395, + "grad_norm": 0.10637033730745316, + "kl": 0.1343921273946762, + "learning_rate": 3e-06, + "loss": 0.0319, + "step": 2087 + }, + { + "clip_ratio": 0.0011857394711114466, + "epoch": 0.005796811753535556, + "grad_norm": 0.10809402167797089, + "kl": 0.15764878690242767, + "learning_rate": 3e-06, + "loss": 0.0319, + "step": 2088 + }, + { + "clip_ratio": 9.689922444522381e-05, + "completion_length": 241.83334350585938, + "epoch": 0.005799588004375371, + "grad_norm": 0.16788090765476227, + "kl": 0.1235123947262764, + "learning_rate": 3e-06, + "loss": 0.016, + "reward": 0.23125001043081284, + "reward_std": 0.2202121838927269, + "rewards/countdown_reward_func": 0.23125001043081284, + "step": 2089, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00040937707672128454, + "epoch": 0.0058023642552151875, + "grad_norm": 0.08114562183618546, + "kl": 0.14093566685914993, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 2090 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005805140506055003, + "grad_norm": 0.09554962068796158, + "kl": 0.1442393809556961, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 2091 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.005807916756894819, + "grad_norm": 0.090549536049366, + "kl": 0.14015483856201172, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 2092 + }, + { + "clip_ratio": 0.0002713593712542206, + "epoch": 0.005810693007734635, + "grad_norm": 0.10590094327926636, + "kl": 0.1370547115802765, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 2093 + }, + { + "clip_ratio": 0.00025648381415521726, + "epoch": 0.005813469258574451, + "grad_norm": 0.1045352965593338, + "kl": 0.14967551827430725, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 2094 + }, + { + "clip_ratio": 0.00045060378033667803, + "epoch": 0.005816245509414266, + "grad_norm": 0.08050543814897537, + "kl": 0.13956885784864426, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 2095 + }, + { + "clip_ratio": 0.0002441406322759576, + "epoch": 0.0058190217602540825, + "grad_norm": 0.14037755131721497, + "kl": 0.15493950247764587, + "learning_rate": 3e-06, + "loss": 0.015, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005821798011093898, + "grad_norm": 0.09420851618051529, + "kl": 0.15672826766967773, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 2097 + }, + { + "clip_ratio": 0.0003684598486870527, + "epoch": 0.005824574261933714, + "grad_norm": 0.08682415634393692, + "kl": 0.15357310324907303, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 2098 + }, + { + "clip_ratio": 0.0007170586031861603, + "epoch": 0.0058273505127735305, + "grad_norm": 0.09509247541427612, + "kl": 0.149859219789505, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 2099 + }, + { + "clip_ratio": 0.0007866056985221803, + "epoch": 0.005830126763613346, + "grad_norm": 0.1083296537399292, + "kl": 0.16195228695869446, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 2100 + }, + { + "clip_ratio": 0.00034024709020741284, + "completion_length": 231.875, + "epoch": 0.005832903014453162, + "grad_norm": 0.06713063269853592, + "kl": 0.16988279670476913, + "learning_rate": 3e-06, + "loss": 0.0083, + "reward": 0.1937500163912773, + "reward_std": 0.16211743280291557, + "rewards/countdown_reward_func": 0.1937500163912773, + "step": 2101, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.0007102232775650918, + "epoch": 0.0058356792652929776, + "grad_norm": 0.0946163609623909, + "kl": 0.17318201810121536, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 2102 + }, + { + "clip_ratio": 0.000451006053481251, + "epoch": 0.005838455516132794, + "grad_norm": 0.08715200424194336, + "kl": 0.17537237703800201, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2103 + }, + { + "clip_ratio": 0.0002783698437269777, + "epoch": 0.005841231766972609, + "grad_norm": 0.09208168089389801, + "kl": 0.18494432419538498, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 2104 + }, + { + "clip_ratio": 0.0006052821408957243, + "epoch": 0.0058440080178124255, + "grad_norm": 0.08401837199926376, + "kl": 0.18430516123771667, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 2105 + }, + { + "clip_ratio": 0.0005398763241828419, + "epoch": 0.005846784268652241, + "grad_norm": 0.09048160165548325, + "kl": 0.17285944521427155, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 2106 + }, + { + "clip_ratio": 0.001236416690517217, + "epoch": 0.005849560519492057, + "grad_norm": 0.06877080351114273, + "kl": 0.16914395987987518, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2107 + }, + { + "clip_ratio": 0.0004467820399440825, + "epoch": 0.005852336770331873, + "grad_norm": 0.08058128505945206, + "kl": 0.16808201372623444, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 2108 + }, + { + "clip_ratio": 0.0016149040893651545, + "epoch": 0.005855113021171689, + "grad_norm": 0.0833841860294342, + "kl": 0.1688889116048813, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 2109 + }, + { + "clip_ratio": 0.0009857022087089717, + "epoch": 0.005857889272011505, + "grad_norm": 0.07885358482599258, + "kl": 0.17396697402000427, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2110 + }, + { + "clip_ratio": 0.0015709067229181528, + "epoch": 0.0058606655228513205, + "grad_norm": 0.07803882658481598, + "kl": 0.17317739129066467, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 2111 + }, + { + "clip_ratio": 0.0007821381441317499, + "epoch": 0.005863441773691137, + "grad_norm": 0.09485685080289841, + "kl": 0.1609647423028946, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 2112 + }, + { + "clip_ratio": 0.0007151653699111193, + "completion_length": 230.875, + "epoch": 0.005866218024530952, + "grad_norm": 0.11236539483070374, + "kl": 0.1636364832520485, + "learning_rate": 3e-06, + "loss": 0.0213, + "reward": 0.40000002086162567, + "reward_std": 0.4050685316324234, + "rewards/countdown_reward_func": 0.40000002086162567, + "step": 2113, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00017586752073839307, + "epoch": 0.0058689942753707685, + "grad_norm": 0.1399078369140625, + "kl": 0.14752335101366043, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 2114 + }, + { + "clip_ratio": 0.000263152651314158, + "epoch": 0.005871770526210584, + "grad_norm": 0.13787932693958282, + "kl": 0.1409480944275856, + "learning_rate": 3e-06, + "loss": 0.0213, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0058745467770504, + "grad_norm": 0.12428930401802063, + "kl": 0.14609377086162567, + "learning_rate": 3e-06, + "loss": 0.0202, + "step": 2116 + }, + { + "clip_ratio": 0.0003564371290849522, + "epoch": 0.005877323027890216, + "grad_norm": 0.1103990450501442, + "kl": 0.1507614701986313, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 2117 + }, + { + "clip_ratio": 0.00045055238297209144, + "epoch": 0.005880099278730032, + "grad_norm": 0.13074712455272675, + "kl": 0.14322231709957123, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 2118 + }, + { + "clip_ratio": 0.0006283415132202208, + "epoch": 0.005882875529569848, + "grad_norm": 0.11959907412528992, + "kl": 0.1588028073310852, + "learning_rate": 3e-06, + "loss": 0.0208, + "step": 2119 + }, + { + "clip_ratio": 0.0003582945646485314, + "epoch": 0.0058856517804096635, + "grad_norm": 0.1354365348815918, + "kl": 0.1482446938753128, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 2120 + }, + { + "clip_ratio": 0.00043741075205616653, + "epoch": 0.00588842803124948, + "grad_norm": 0.14199601113796234, + "kl": 0.14358460903167725, + "learning_rate": 3e-06, + "loss": 0.02, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005891204282089295, + "grad_norm": 0.11802490055561066, + "kl": 0.15160782635211945, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 2122 + }, + { + "clip_ratio": 0.00018325866403756663, + "epoch": 0.0058939805329291115, + "grad_norm": 0.1145329475402832, + "kl": 0.15987689793109894, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 2123 + }, + { + "clip_ratio": 0.0003541165206115693, + "epoch": 0.005896756783768927, + "grad_norm": 0.13508586585521698, + "kl": 0.15324489772319794, + "learning_rate": 3e-06, + "loss": 0.0196, + "step": 2124 + }, + { + "clip_ratio": 0.0005099133632029407, + "completion_length": 218.33333587646484, + "epoch": 0.005899533034608743, + "grad_norm": 0.11340119689702988, + "kl": 0.1783110797405243, + "learning_rate": 3e-06, + "loss": 0.0107, + "reward": 0.30416667461395264, + "reward_std": 0.29580747336149216, + "rewards/countdown_reward_func": 0.30416665971279144, + "step": 2125, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00028971848951186985, + "epoch": 0.005902309285448559, + "grad_norm": 0.11806675046682358, + "kl": 0.19692577421665192, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005905085536288375, + "grad_norm": 0.1320490837097168, + "kl": 0.18967118114233017, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 2127 + }, + { + "clip_ratio": 0.00026122476992895827, + "epoch": 0.00590786178712819, + "grad_norm": 0.11485249549150467, + "kl": 0.1766699254512787, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0059106380379680065, + "grad_norm": 0.10251068323850632, + "kl": 0.175718754529953, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 2129 + }, + { + "clip_ratio": 0.0002794336760416627, + "epoch": 0.005913414288807823, + "grad_norm": 0.1114218533039093, + "kl": 0.1810135394334793, + "learning_rate": 3e-06, + "loss": 0.011, + "step": 2130 + }, + { + "clip_ratio": 9.314456110587344e-05, + "epoch": 0.005916190539647638, + "grad_norm": 0.12015023082494736, + "kl": 0.1831328272819519, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 2131 + }, + { + "clip_ratio": 0.0003007580089615658, + "epoch": 0.0059189667904874545, + "grad_norm": 0.11531781405210495, + "kl": 0.1972024142742157, + "learning_rate": 3e-06, + "loss": 0.0125, + "step": 2132 + }, + { + "clip_ratio": 0.0002856182763935067, + "epoch": 0.00592174304132727, + "grad_norm": 0.11725442856550217, + "kl": 0.18396782875061035, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 2133 + }, + { + "clip_ratio": 0.00017714993737172335, + "epoch": 0.005924519292167086, + "grad_norm": 0.11699375510215759, + "kl": 0.17351438850164413, + "learning_rate": 3e-06, + "loss": 0.0097, + "step": 2134 + }, + { + "clip_ratio": 0.0002972428919747472, + "epoch": 0.0059272955430069016, + "grad_norm": 0.12814870476722717, + "kl": 0.16928401589393616, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 2135 + }, + { + "clip_ratio": 0.00038728527579223737, + "epoch": 0.005930071793846718, + "grad_norm": 0.10801739990711212, + "kl": 0.17271753400564194, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 2136 + }, + { + "clip_ratio": 0.0008804201061138883, + "completion_length": 240.1666717529297, + "epoch": 0.005932848044686533, + "grad_norm": 0.11421734094619751, + "kl": 0.169437974691391, + "learning_rate": 3e-06, + "loss": 0.0286, + "reward": 0.30416667461395264, + "reward_std": 0.2904581278562546, + "rewards/countdown_reward_func": 0.30416667461395264, + "step": 2137, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0059356242955263495, + "grad_norm": 0.1658470183610916, + "kl": 0.16381431370973587, + "learning_rate": 3e-06, + "loss": 0.0283, + "step": 2138 + }, + { + "clip_ratio": 8.765778329689056e-05, + "epoch": 0.005938400546366165, + "grad_norm": 0.1286601573228836, + "kl": 0.16420385986566544, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 2139 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.005941176797205981, + "grad_norm": 0.09617311507463455, + "kl": 0.15908657014369965, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 2140 + }, + { + "clip_ratio": 0.0004251676582498476, + "epoch": 0.0059439530480457975, + "grad_norm": 0.10428610444068909, + "kl": 0.1709904745221138, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 2141 + }, + { + "clip_ratio": 0.000519495370099321, + "epoch": 0.005946729298885613, + "grad_norm": 0.08956306427717209, + "kl": 0.17576873302459717, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 2142 + }, + { + "clip_ratio": 0.00035832179128192365, + "epoch": 0.005949505549725429, + "grad_norm": 0.09938909113407135, + "kl": 0.1722540631890297, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 2143 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0059522818005652445, + "grad_norm": 0.0916515365242958, + "kl": 0.17020156979560852, + "learning_rate": 3e-06, + "loss": 0.0275, + "step": 2144 + }, + { + "clip_ratio": 0.00017531556659378111, + "epoch": 0.005955058051405061, + "grad_norm": 0.1481482982635498, + "kl": 0.17428260296583176, + "learning_rate": 3e-06, + "loss": 0.0269, + "step": 2145 + }, + { + "clip_ratio": 0.0007600796525366604, + "epoch": 0.005957834302244876, + "grad_norm": 0.10551691800355911, + "kl": 0.17008347064256668, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 2146 + }, + { + "clip_ratio": 0.00026695779524743557, + "epoch": 0.0059606105530846925, + "grad_norm": 0.1174376830458641, + "kl": 0.18421828001737595, + "learning_rate": 3e-06, + "loss": 0.0284, + "step": 2147 + }, + { + "clip_ratio": 0.000535756757017225, + "epoch": 0.005963386803924508, + "grad_norm": 0.08853857964277267, + "kl": 0.19289961457252502, + "learning_rate": 3e-06, + "loss": 0.0272, + "step": 2148 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 232.875, + "epoch": 0.005966163054764324, + "grad_norm": 0.04596855491399765, + "kl": 0.18686389178037643, + "learning_rate": 3e-06, + "loss": 0.0206, + "reward": 0.17083335667848587, + "reward_std": 0.11509480327367783, + "rewards/countdown_reward_func": 0.17083334922790527, + "step": 2149, + "zero_std_ratio": 0.625 + }, + { + "clip_ratio": 0.000768062163842842, + "epoch": 0.00596893930560414, + "grad_norm": 0.0615745410323143, + "kl": 0.1883329302072525, + "learning_rate": 3e-06, + "loss": 0.0202, + "step": 2150 + }, + { + "clip_ratio": 0.0002582559172878973, + "epoch": 0.005971715556443956, + "grad_norm": 0.060142528265714645, + "kl": 0.2043200358748436, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 2151 + }, + { + "clip_ratio": 0.00044469654676504433, + "epoch": 0.005974491807283772, + "grad_norm": 0.066607765853405, + "kl": 0.22076967358589172, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 2152 + }, + { + "clip_ratio": 0.0004701242069131695, + "epoch": 0.0059772680581235875, + "grad_norm": 0.08139292895793915, + "kl": 0.20817560702562332, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 2153 + }, + { + "clip_ratio": 0.0001826150546548888, + "epoch": 0.005980044308963404, + "grad_norm": 0.07108417898416519, + "kl": 0.20678973197937012, + "learning_rate": 3e-06, + "loss": 0.02, + "step": 2154 + }, + { + "clip_ratio": 0.00027385010616853833, + "epoch": 0.005982820559803219, + "grad_norm": 0.044102661311626434, + "kl": 0.2047039493918419, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 2155 + }, + { + "clip_ratio": 0.00040690103196538985, + "epoch": 0.0059855968106430355, + "grad_norm": 0.054487258195877075, + "kl": 0.20282930880784988, + "learning_rate": 3e-06, + "loss": 0.0196, + "step": 2156 + }, + { + "clip_ratio": 0.00010032102727564052, + "epoch": 0.005988373061482851, + "grad_norm": 0.05873025581240654, + "kl": 0.22172988951206207, + "learning_rate": 3e-06, + "loss": 0.0205, + "step": 2157 + }, + { + "clip_ratio": 0.00020064205455128103, + "epoch": 0.005991149312322667, + "grad_norm": 0.06830353289842606, + "kl": 0.2312830686569214, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 2158 + }, + { + "clip_ratio": 0.00020064205455128103, + "epoch": 0.005993925563162483, + "grad_norm": 0.06709025055170059, + "kl": 0.21746045351028442, + "learning_rate": 3e-06, + "loss": 0.0191, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "epoch": 0.005996701814002299, + "grad_norm": 0.07886490970849991, + "kl": 0.21650362014770508, + "learning_rate": 3e-06, + "loss": 0.0196, + "step": 2160 + }, + { + "clip_ratio": 9.391435014549643e-05, + "completion_length": 214.43750762939453, + "epoch": 0.005999478064842114, + "grad_norm": 0.11006475239992142, + "kl": 0.21128001064062119, + "learning_rate": 3e-06, + "loss": 0.0006, + "reward": 0.28333335369825363, + "reward_std": 0.2853127121925354, + "rewards/countdown_reward_func": 0.28333333879709244, + "step": 2161, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0001065643664333038, + "epoch": 0.0060022543156819305, + "grad_norm": 0.17413948476314545, + "kl": 0.20346222817897797, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 2162 + }, + { + "clip_ratio": 0.00018050541984848678, + "epoch": 0.006005030566521747, + "grad_norm": 0.11314403265714645, + "kl": 0.19823309034109116, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 2163 + }, + { + "clip_ratio": 0.0005011356552131474, + "epoch": 0.006007806817361562, + "grad_norm": 0.1388438194990158, + "kl": 0.1954411193728447, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 2164 + }, + { + "clip_ratio": 0.00022549449931830168, + "epoch": 0.0060105830682013785, + "grad_norm": 0.1569499969482422, + "kl": 0.18387842923402786, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 2165 + }, + { + "clip_ratio": 0.0001065643664333038, + "epoch": 0.006013359319041194, + "grad_norm": 0.14936763048171997, + "kl": 0.19631676375865936, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 2166 + }, + { + "clip_ratio": 0.0001065643664333038, + "epoch": 0.00601613556988101, + "grad_norm": 0.10569910705089569, + "kl": 0.19845695048570633, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 2167 + }, + { + "clip_ratio": 0.00045912877249065787, + "epoch": 0.0060189118207208256, + "grad_norm": 0.1597263067960739, + "kl": 0.18819713592529297, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 2168 + }, + { + "clip_ratio": 0.000762759504141286, + "epoch": 0.006021688071560642, + "grad_norm": 0.1164880320429802, + "kl": 0.17806652933359146, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 2169 + }, + { + "clip_ratio": 0.0007057104958221316, + "epoch": 0.006024464322400457, + "grad_norm": 0.12073780596256256, + "kl": 0.17337379604578018, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 2170 + }, + { + "clip_ratio": 0.00038431792927440256, + "epoch": 0.0060272405732402735, + "grad_norm": 0.15236780047416687, + "kl": 0.1608799397945404, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 2171 + }, + { + "clip_ratio": 0.0005894555361010134, + "epoch": 0.006030016824080089, + "grad_norm": 0.1417692005634308, + "kl": 0.17029716074466705, + "learning_rate": 3e-06, + "loss": -0.0044, + "step": 2172 + }, + { + "clip_ratio": 0.00017841139197116718, + "completion_length": 235.6041717529297, + "epoch": 0.006032793074919905, + "grad_norm": 0.09551423043012619, + "kl": 0.1639445647597313, + "learning_rate": 3e-06, + "loss": -0.0062, + "reward": 0.21250000596046448, + "reward_std": 0.11618950217962265, + "rewards/countdown_reward_func": 0.21250000596046448, + "step": 2173, + "zero_std_ratio": 0.75 + }, + { + "clip_ratio": 0.0006238899368327111, + "epoch": 0.0060355693257597215, + "grad_norm": 0.11371457576751709, + "kl": 0.15197165310382843, + "learning_rate": 3e-06, + "loss": -0.0061, + "step": 2174 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.006038345576599537, + "grad_norm": 0.09170381724834442, + "kl": 0.15608438104391098, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 2175 + }, + { + "clip_ratio": 0.0005786644760519266, + "epoch": 0.006041121827439353, + "grad_norm": 0.08310035616159439, + "kl": 0.15356775373220444, + "learning_rate": 3e-06, + "loss": -0.0068, + "step": 2176 + }, + { + "clip_ratio": 8.802816591924056e-05, + "epoch": 0.0060438980782791685, + "grad_norm": 0.10131369531154633, + "kl": 0.148365817964077, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 2177 + }, + { + "clip_ratio": 0.0007334884430747479, + "epoch": 0.006046674329118985, + "grad_norm": 0.0910923108458519, + "kl": 0.14135953783988953, + "learning_rate": 3e-06, + "loss": -0.0067, + "step": 2178 + }, + { + "clip_ratio": 0.0007884440710768104, + "epoch": 0.0060494505799588, + "grad_norm": 0.09182950854301453, + "kl": 0.1402914896607399, + "learning_rate": 3e-06, + "loss": -0.0073, + "step": 2179 + }, + { + "clip_ratio": 0.0012513441615737975, + "epoch": 0.0060522268307986165, + "grad_norm": 0.11456513404846191, + "kl": 0.13089902698993683, + "learning_rate": 3e-06, + "loss": -0.0089, + "step": 2180 + }, + { + "clip_ratio": 0.0005369534774217755, + "epoch": 0.006055003081638432, + "grad_norm": 0.09350922703742981, + "kl": 0.1340053342282772, + "learning_rate": 3e-06, + "loss": -0.0084, + "step": 2181 + }, + { + "clip_ratio": 0.0014489490713458508, + "epoch": 0.006057779332478248, + "grad_norm": 0.08015953004360199, + "kl": 0.13297784700989723, + "learning_rate": 3e-06, + "loss": -0.0091, + "step": 2182 + }, + { + "clip_ratio": 0.0017656179843470454, + "epoch": 0.006060555583318064, + "grad_norm": 0.08231103420257568, + "kl": 0.130483016371727, + "learning_rate": 3e-06, + "loss": -0.0095, + "step": 2183 + }, + { + "clip_ratio": 0.002995648537762463, + "epoch": 0.00606333183415788, + "grad_norm": 0.0673069879412651, + "kl": 0.12187189608812332, + "learning_rate": 3e-06, + "loss": -0.009, + "step": 2184 + }, + { + "clip_ratio": 0.0004267231997800991, + "completion_length": 233.3541717529297, + "epoch": 0.006066108084997696, + "grad_norm": 0.07689964771270752, + "kl": 0.1193951666355133, + "learning_rate": 3e-06, + "loss": 0.0086, + "reward": 0.21250000596046448, + "reward_std": 0.2418064922094345, + "rewards/countdown_reward_func": 0.21250000596046448, + "step": 2185, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0002802690723910928, + "epoch": 0.0060688843358375115, + "grad_norm": 0.07742653042078018, + "kl": 0.1159132868051529, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 2186 + }, + { + "clip_ratio": 0.0004338699218351394, + "epoch": 0.006071660586677328, + "grad_norm": 0.11565269529819489, + "kl": 0.11845757439732552, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 2187 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.006074436837517143, + "grad_norm": 0.07725950330495834, + "kl": 0.11740844696760178, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 2188 + }, + { + "clip_ratio": 0.00018731241289060563, + "epoch": 0.0060772130883569595, + "grad_norm": 0.0754290521144867, + "kl": 0.11156599596142769, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 2189 + }, + { + "clip_ratio": 0.000675759933074005, + "epoch": 0.006079989339196775, + "grad_norm": 0.10546611994504929, + "kl": 0.11662887409329414, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2190 + }, + { + "clip_ratio": 0.00046884678886272013, + "epoch": 0.006082765590036591, + "grad_norm": 0.07807113230228424, + "kl": 0.11078613996505737, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2191 + }, + { + "clip_ratio": 0.00047955120680853724, + "epoch": 0.0060855418408764066, + "grad_norm": 0.0798504427075386, + "kl": 0.10783285275101662, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2192 + }, + { + "clip_ratio": 0.00029404355882434174, + "epoch": 0.006088318091716223, + "grad_norm": 0.09978339821100235, + "kl": 0.11092406511306763, + "learning_rate": 3e-06, + "loss": 0.0074, + "step": 2193 + }, + { + "clip_ratio": 0.000244140625, + "epoch": 0.006091094342556038, + "grad_norm": 0.08134466409683228, + "kl": 0.11090698093175888, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 2194 + }, + { + "clip_ratio": 0.0006887645286042243, + "epoch": 0.0060938705933958545, + "grad_norm": 0.07155325263738632, + "kl": 0.10384676232933998, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 2195 + }, + { + "clip_ratio": 0.0011016842036042362, + "epoch": 0.006096646844235671, + "grad_norm": 0.08413667976856232, + "kl": 0.11054657027125359, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 2196 + }, + { + "clip_ratio": 0.00044989935122430325, + "completion_length": 235.1875, + "epoch": 0.006099423095075486, + "grad_norm": 0.07087922096252441, + "kl": 0.1160878986120224, + "learning_rate": 3e-06, + "loss": 0.0124, + "reward": 0.2083333507180214, + "reward_std": 0.2182515673339367, + "rewards/countdown_reward_func": 0.2083333358168602, + "step": 2197, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0061021993459153025, + "grad_norm": 0.07426546514034271, + "kl": 0.11279673129320145, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 2198 + }, + { + "clip_ratio": 8.185985643649474e-05, + "epoch": 0.006104975596755118, + "grad_norm": 0.07759413123130798, + "kl": 0.116276104003191, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006107751847594934, + "grad_norm": 0.0708848312497139, + "kl": 0.10642623901367188, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 2200 + }, + { + "clip_ratio": 0.00017426943668397143, + "epoch": 0.0061105280984347496, + "grad_norm": 0.06520801782608032, + "kl": 0.12019915506243706, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 2201 + }, + { + "clip_ratio": 0.00034561891516204923, + "epoch": 0.006113304349274566, + "grad_norm": 0.07822219282388687, + "kl": 0.10957641154527664, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 2202 + }, + { + "clip_ratio": 0.0004435717419255525, + "epoch": 0.006116080600114381, + "grad_norm": 0.07648086547851562, + "kl": 0.11310628056526184, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0061188568509541975, + "grad_norm": 0.08405080437660217, + "kl": 0.11058183759450912, + "learning_rate": 3e-06, + "loss": 0.0112, + "step": 2204 + }, + { + "clip_ratio": 9.469696669839323e-05, + "epoch": 0.006121633101794013, + "grad_norm": 0.08572155982255936, + "kl": 0.11485684290528297, + "learning_rate": 3e-06, + "loss": 0.0113, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006124409352633829, + "grad_norm": 0.07442327588796616, + "kl": 0.1044563390314579, + "learning_rate": 3e-06, + "loss": 0.0112, + "step": 2206 + }, + { + "clip_ratio": 0.0004602315020747483, + "epoch": 0.0061271856034736455, + "grad_norm": 0.18411950767040253, + "kl": 0.11828203499317169, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 2207 + }, + { + "clip_ratio": 0.0005978553963359445, + "epoch": 0.006129961854313461, + "grad_norm": 0.0784916952252388, + "kl": 0.10879363864660263, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 2208 + }, + { + "clip_ratio": 0.0002201182724093087, + "completion_length": 233.75, + "epoch": 0.006132738105153277, + "grad_norm": 0.11505284905433655, + "kl": 0.10442957654595375, + "learning_rate": 3e-06, + "loss": 0.024, + "reward": 0.3812500089406967, + "reward_std": 0.3858536630868912, + "rewards/countdown_reward_func": 0.3812500089406967, + "step": 2209, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0003907842037733644, + "epoch": 0.0061355143559930925, + "grad_norm": 0.10098423808813095, + "kl": 0.10408291965723038, + "learning_rate": 3e-06, + "loss": 0.0239, + "step": 2210 + }, + { + "clip_ratio": 0.0001870947889983654, + "epoch": 0.006138290606832909, + "grad_norm": 0.10101636499166489, + "kl": 0.10832390189170837, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 2211 + }, + { + "clip_ratio": 0.0004268408374628052, + "epoch": 0.006141066857672724, + "grad_norm": 0.11006015539169312, + "kl": 0.10417984053492546, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0061438431085125405, + "grad_norm": 0.1787508875131607, + "kl": 0.11693385615944862, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 2213 + }, + { + "clip_ratio": 0.0005292101996019483, + "epoch": 0.006146619359352356, + "grad_norm": 0.12185002118349075, + "kl": 0.11256387084722519, + "learning_rate": 3e-06, + "loss": 0.0223, + "step": 2214 + }, + { + "clip_ratio": 0.00012230919674038887, + "epoch": 0.006149395610192172, + "grad_norm": 0.12709738314151764, + "kl": 0.10908814519643784, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 2215 + }, + { + "clip_ratio": 0.00012230919674038887, + "epoch": 0.006152171861031988, + "grad_norm": 0.10294611752033234, + "kl": 0.11007463932037354, + "learning_rate": 3e-06, + "loss": 0.0217, + "step": 2216 + }, + { + "clip_ratio": 0.0002929751281044446, + "epoch": 0.006154948111871804, + "grad_norm": 0.09818847477436066, + "kl": 0.11554765328764915, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 2217 + }, + { + "clip_ratio": 0.000292975120828487, + "epoch": 0.00615772436271162, + "grad_norm": 0.11660251021385193, + "kl": 0.11222562938928604, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 2218 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0061605006135514355, + "grad_norm": 0.09623023122549057, + "kl": 0.12738066911697388, + "learning_rate": 3e-06, + "loss": 0.0199, + "step": 2219 + }, + { + "clip_ratio": 0.0007260283455252647, + "epoch": 0.006163276864391252, + "grad_norm": 0.09498999267816544, + "kl": 0.12235638499259949, + "learning_rate": 3e-06, + "loss": 0.0207, + "step": 2220 + }, + { + "clip_ratio": 0.00026175539096584544, + "completion_length": 239.08333587646484, + "epoch": 0.006166053115231067, + "grad_norm": 0.08734557777643204, + "kl": 0.10614108666777611, + "learning_rate": 3e-06, + "loss": 0.0156, + "reward": 0.23124999552965164, + "reward_std": 0.2115694098174572, + "rewards/countdown_reward_func": 0.23124999552965164, + "step": 2221, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 0.00016405216592829674, + "epoch": 0.0061688293660708835, + "grad_norm": 0.11606533825397491, + "kl": 0.10931029170751572, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 2222 + }, + { + "clip_ratio": 0.0002663365739863366, + "epoch": 0.006171605616910699, + "grad_norm": 0.07539436966180801, + "kl": 0.10711556673049927, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 2223 + }, + { + "clip_ratio": 0.0002645519489306025, + "epoch": 0.006174381867750515, + "grad_norm": 0.06568428874015808, + "kl": 0.12526912242174149, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 2224 + }, + { + "clip_ratio": 0.00026455195620656013, + "epoch": 0.0061771581185903306, + "grad_norm": 0.07675132900476456, + "kl": 0.11198921501636505, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 2225 + }, + { + "clip_ratio": 0.0007440476329065859, + "epoch": 0.006179934369430147, + "grad_norm": 0.07890935987234116, + "kl": 0.11408694460988045, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 2226 + }, + { + "clip_ratio": 0.0004973536997567862, + "epoch": 0.006182710620269962, + "grad_norm": 0.08032620698213577, + "kl": 0.11937838792800903, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 2227 + }, + { + "clip_ratio": 0.00019656029326142743, + "epoch": 0.0061854868711097785, + "grad_norm": 0.09635122865438461, + "kl": 0.11930480599403381, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 2228 + }, + { + "clip_ratio": 9.476876584812999e-05, + "epoch": 0.006188263121949595, + "grad_norm": 0.21204665303230286, + "kl": 0.11878373473882675, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 2229 + }, + { + "clip_ratio": 0.0009819945989875123, + "epoch": 0.00619103937278941, + "grad_norm": 0.06745634227991104, + "kl": 0.13793090730905533, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 2230 + }, + { + "clip_ratio": 0.0004885463276877999, + "epoch": 0.0061938156236292265, + "grad_norm": 0.07236343622207642, + "kl": 0.12467692419886589, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 2231 + }, + { + "clip_ratio": 0.0003369117039255798, + "epoch": 0.006196591874469042, + "grad_norm": 0.07333452254533768, + "kl": 0.12604239955544472, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 2232 + }, + { + "clip_ratio": 0.00044176532537676394, + "completion_length": 227.5, + "epoch": 0.006199368125308858, + "grad_norm": 0.11169209331274033, + "kl": 0.14527644217014313, + "learning_rate": 3e-06, + "loss": 0.0172, + "reward": 0.4520833492279053, + "reward_std": 0.3492494821548462, + "rewards/countdown_reward_func": 0.4520833194255829, + "step": 2233, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 8.722958591533825e-05, + "epoch": 0.0062021443761486736, + "grad_norm": 0.12365715205669403, + "kl": 0.15404719859361649, + "learning_rate": 3e-06, + "loss": 0.0176, + "step": 2234 + }, + { + "clip_ratio": 0.0003403676091693342, + "epoch": 0.00620492062698849, + "grad_norm": 0.10898413509130478, + "kl": 0.15741366147994995, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 2235 + }, + { + "clip_ratio": 8.722958591533825e-05, + "epoch": 0.006207696877828305, + "grad_norm": 0.11201989650726318, + "kl": 0.15081220865249634, + "learning_rate": 3e-06, + "loss": 0.0165, + "step": 2236 + }, + { + "clip_ratio": 0.00020764119108207524, + "epoch": 0.0062104731286681215, + "grad_norm": 0.2154986560344696, + "kl": 0.15322324633598328, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 2237 + }, + { + "clip_ratio": 0.0006642429507337511, + "epoch": 0.006213249379507937, + "grad_norm": 0.0987052395939827, + "kl": 0.15503258258104324, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 2238 + }, + { + "clip_ratio": 8.722958591533825e-05, + "epoch": 0.006216025630347753, + "grad_norm": 0.10490848124027252, + "kl": 0.154758021235466, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0062188018811875695, + "grad_norm": 0.1208440288901329, + "kl": 0.16184867918491364, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 2240 + }, + { + "clip_ratio": 0.0002552756923250854, + "epoch": 0.006221578132027385, + "grad_norm": 0.09863177686929703, + "kl": 0.16634425520896912, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 2241 + }, + { + "clip_ratio": 0.00017934454808710143, + "epoch": 0.006224354382867201, + "grad_norm": 0.11939014494419098, + "kl": 0.16019989550113678, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0062271306337070165, + "grad_norm": 0.16073253750801086, + "kl": 0.16025684773921967, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 2243 + }, + { + "clip_ratio": 0.0008758519834373146, + "epoch": 0.006229906884546833, + "grad_norm": 0.11048475652933121, + "kl": 0.1635778620839119, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 2244 + }, + { + "clip_ratio": 8.90313385752961e-05, + "completion_length": 217.1041717529297, + "epoch": 0.006232683135386648, + "grad_norm": 0.1347557008266449, + "kl": 0.16955012828111649, + "learning_rate": 3e-06, + "loss": 0.0281, + "reward": 0.35625001788139343, + "reward_std": 0.3819515109062195, + "rewards/countdown_reward_func": 0.35625001043081284, + "step": 2245, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.204712841892615e-05, + "epoch": 0.0062354593862264645, + "grad_norm": 0.1472625583410263, + "kl": 0.16981971263885498, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 2246 + }, + { + "clip_ratio": 0.0001900863426271826, + "epoch": 0.00623823563706628, + "grad_norm": 0.11650460213422775, + "kl": 0.16689752787351608, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 2247 + }, + { + "clip_ratio": 0.0001804056082619354, + "epoch": 0.006241011887906096, + "grad_norm": 0.13127779960632324, + "kl": 0.1765117198228836, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 2248 + }, + { + "clip_ratio": 0.0005522827850654721, + "epoch": 0.006243788138745912, + "grad_norm": 0.12236137688159943, + "kl": 0.16286692768335342, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 2249 + }, + { + "clip_ratio": 0.00018342139810556546, + "epoch": 0.006246564389585728, + "grad_norm": 0.13048216700553894, + "kl": 0.16761357337236404, + "learning_rate": 3e-06, + "loss": 0.0269, + "step": 2250 + }, + { + "clip_ratio": 0.000550264201592654, + "epoch": 0.006249340640425544, + "grad_norm": 0.13221754133701324, + "kl": 0.18436182290315628, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0062521168912653595, + "grad_norm": 0.14144454896450043, + "kl": 0.18143575638532639, + "learning_rate": 3e-06, + "loss": 0.0239, + "step": 2252 + }, + { + "clip_ratio": 0.00033115307451225817, + "epoch": 0.006254893142105176, + "grad_norm": 0.13937947154045105, + "kl": 0.18208129703998566, + "learning_rate": 3e-06, + "loss": 0.0254, + "step": 2253 + }, + { + "clip_ratio": 0.00037381629226729274, + "epoch": 0.006257669392944991, + "grad_norm": 0.11771649122238159, + "kl": 0.1911814734339714, + "learning_rate": 3e-06, + "loss": 0.0255, + "step": 2254 + }, + { + "clip_ratio": 0.000840340624563396, + "epoch": 0.0062604456437848075, + "grad_norm": 0.11015734076499939, + "kl": 0.17935297638177872, + "learning_rate": 3e-06, + "loss": 0.0235, + "step": 2255 + }, + { + "clip_ratio": 0.0008332670404342934, + "epoch": 0.006263221894624623, + "grad_norm": 0.12103752791881561, + "kl": 0.180360309779644, + "learning_rate": 3e-06, + "loss": 0.0231, + "step": 2256 + }, + { + "clip_ratio": 0.0001020408162730746, + "completion_length": 203.4166717529297, + "epoch": 0.006265998145464439, + "grad_norm": 0.1085449829697609, + "kl": 0.178614042699337, + "learning_rate": 3e-06, + "loss": 0.0057, + "reward": 0.39791668951511383, + "reward_std": 0.32604455202817917, + "rewards/countdown_reward_func": 0.39791667461395264, + "step": 2257, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.000700604279700201, + "epoch": 0.0062687743963042546, + "grad_norm": 0.12487374991178513, + "kl": 0.18157772719860077, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 2258 + }, + { + "clip_ratio": 0.0005079238399048336, + "epoch": 0.006271550647144071, + "grad_norm": 0.16143116354942322, + "kl": 0.19268468022346497, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 2259 + }, + { + "clip_ratio": 9.97605748125352e-05, + "epoch": 0.006274326897983886, + "grad_norm": 0.12245763093233109, + "kl": 0.1898026168346405, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0062771031488237025, + "grad_norm": 0.11813352257013321, + "kl": 0.18366330116987228, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 2261 + }, + { + "clip_ratio": 0.0006266141863306984, + "epoch": 0.006279879399663519, + "grad_norm": 0.2579954266548157, + "kl": 0.18229518085718155, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006282655650503334, + "grad_norm": 0.11579995602369308, + "kl": 0.1908181607723236, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 2263 + }, + { + "clip_ratio": 0.0006184200756251812, + "epoch": 0.0062854319013431505, + "grad_norm": 0.1960228681564331, + "kl": 0.1934300884604454, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 2264 + }, + { + "clip_ratio": 0.0012736761127598584, + "epoch": 0.006288208152182966, + "grad_norm": 0.12140205502510071, + "kl": 0.20321929454803467, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 2265 + }, + { + "clip_ratio": 0.000426823906309437, + "epoch": 0.006290984403022782, + "grad_norm": 0.1191827654838562, + "kl": 0.1980365291237831, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0062937606538625976, + "grad_norm": 0.1335882693529129, + "kl": 0.1887526884675026, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 2267 + }, + { + "clip_ratio": 0.0004993882175767794, + "epoch": 0.006296536904702414, + "grad_norm": 0.11730194836854935, + "kl": 0.18969019502401352, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 2268 + }, + { + "clip_ratio": 0.00018388705211691558, + "completion_length": 227.70833587646484, + "epoch": 0.006299313155542229, + "grad_norm": 0.12374776601791382, + "kl": 0.22037464380264282, + "learning_rate": 3e-06, + "loss": -0.0004, + "reward": 0.2875000163912773, + "reward_std": 0.2661401182413101, + "rewards/countdown_reward_func": 0.2875000014901161, + "step": 2269, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00017997733084484935, + "epoch": 0.0063020894063820455, + "grad_norm": 0.13824217021465302, + "kl": 0.2126154974102974, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 2270 + }, + { + "clip_ratio": 8.802816591924056e-05, + "epoch": 0.006304865657221861, + "grad_norm": 0.18445436656475067, + "kl": 0.2134125828742981, + "learning_rate": 3e-06, + "loss": -0.0016, + "step": 2271 + }, + { + "clip_ratio": 0.0001707650226308033, + "epoch": 0.006307641908061677, + "grad_norm": 0.13947488367557526, + "kl": 0.2060968428850174, + "learning_rate": 3e-06, + "loss": -0.0013, + "step": 2272 + }, + { + "clip_ratio": 0.00017788648256100714, + "epoch": 0.0063104181589014935, + "grad_norm": 0.13269934058189392, + "kl": 0.20021595805883408, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 2273 + }, + { + "clip_ratio": 0.0003868369967676699, + "epoch": 0.006313194409741309, + "grad_norm": 0.13080525398254395, + "kl": 0.20501340180635452, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 2274 + }, + { + "clip_ratio": 0.0005433336482383311, + "epoch": 0.006315970660581125, + "grad_norm": 0.11887697875499725, + "kl": 0.19895461946725845, + "learning_rate": 3e-06, + "loss": -0.0025, + "step": 2275 + }, + { + "clip_ratio": 0.0019091092544840649, + "epoch": 0.0063187469114209405, + "grad_norm": 0.13246318697929382, + "kl": 0.18607723712921143, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 2276 + }, + { + "clip_ratio": 0.0005245065985945985, + "epoch": 0.006321523162260757, + "grad_norm": 0.13839560747146606, + "kl": 0.18259260058403015, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 2277 + }, + { + "clip_ratio": 0.001069007470505312, + "epoch": 0.006324299413100572, + "grad_norm": 0.126378133893013, + "kl": 0.17721717059612274, + "learning_rate": 3e-06, + "loss": -0.0051, + "step": 2278 + }, + { + "clip_ratio": 0.0020173885859549046, + "epoch": 0.0063270756639403885, + "grad_norm": 0.13504590094089508, + "kl": 0.16881190985441208, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 2279 + }, + { + "clip_ratio": 0.000982351542916149, + "epoch": 0.006329851914780204, + "grad_norm": 0.13827833533287048, + "kl": 0.16988541185855865, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 2280 + }, + { + "clip_ratio": 9.057971328729764e-05, + "completion_length": 228.64583587646484, + "epoch": 0.00633262816562002, + "grad_norm": 0.12721112370491028, + "kl": 0.1614338904619217, + "learning_rate": 3e-06, + "loss": 0.053, + "reward": 0.36250002682209015, + "reward_std": 0.33128294348716736, + "rewards/countdown_reward_func": 0.36250001192092896, + "step": 2281, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0003709889715537429, + "epoch": 0.006335404416459836, + "grad_norm": 0.20156148076057434, + "kl": 0.16206463426351547, + "learning_rate": 3e-06, + "loss": 0.0541, + "step": 2282 + }, + { + "clip_ratio": 0.00019021370098926127, + "epoch": 0.006338180667299652, + "grad_norm": 0.16486519575119019, + "kl": 0.15344135463237762, + "learning_rate": 3e-06, + "loss": 0.0543, + "step": 2283 + }, + { + "clip_ratio": 0.0008641848689876497, + "epoch": 0.006340956918139468, + "grad_norm": 0.16734211146831512, + "kl": 0.15662110596895218, + "learning_rate": 3e-06, + "loss": 0.0541, + "step": 2284 + }, + { + "clip_ratio": 0.00026667342899600044, + "epoch": 0.0063437331689792835, + "grad_norm": 0.14003409445285797, + "kl": 0.1625654399394989, + "learning_rate": 3e-06, + "loss": 0.0546, + "step": 2285 + }, + { + "clip_ratio": 8.239947055699304e-05, + "epoch": 0.0063465094198191, + "grad_norm": 0.14652083814144135, + "kl": 0.15703174471855164, + "learning_rate": 3e-06, + "loss": 0.0539, + "step": 2286 + }, + { + "clip_ratio": 8.239947055699304e-05, + "epoch": 0.006349285670658915, + "grad_norm": 0.14092198014259338, + "kl": 0.156203031539917, + "learning_rate": 3e-06, + "loss": 0.0527, + "step": 2287 + }, + { + "clip_ratio": 0.00041524558037053794, + "epoch": 0.0063520619214987315, + "grad_norm": 0.19670848548412323, + "kl": 0.16062773764133453, + "learning_rate": 3e-06, + "loss": 0.0533, + "step": 2288 + }, + { + "clip_ratio": 0.0005093724466860294, + "epoch": 0.006354838172338547, + "grad_norm": 0.15894927084445953, + "kl": 0.15653447806835175, + "learning_rate": 3e-06, + "loss": 0.0521, + "step": 2289 + }, + { + "clip_ratio": 0.0009049937652889639, + "epoch": 0.006357614423178363, + "grad_norm": 0.16536635160446167, + "kl": 0.1637991964817047, + "learning_rate": 3e-06, + "loss": 0.051, + "step": 2290 + }, + { + "clip_ratio": 0.00036807394644711167, + "epoch": 0.0063603906740181786, + "grad_norm": 0.13223929703235626, + "kl": 0.17576530575752258, + "learning_rate": 3e-06, + "loss": 0.052, + "step": 2291 + }, + { + "clip_ratio": 0.00010237510286970064, + "epoch": 0.006363166924857995, + "grad_norm": 0.13640287518501282, + "kl": 0.16830559074878693, + "learning_rate": 3e-06, + "loss": 0.05, + "step": 2292 + }, + { + "clip_ratio": 0.0002590137009974569, + "completion_length": 232.68750762939453, + "epoch": 0.00636594317569781, + "grad_norm": 0.0946778878569603, + "kl": 0.18598677217960358, + "learning_rate": 3e-06, + "loss": 0.0195, + "reward": 0.24583332985639572, + "reward_std": 0.2609790861606598, + "rewards/countdown_reward_func": 0.24583332985639572, + "step": 2293, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0063687194265376265, + "grad_norm": 0.0930834412574768, + "kl": 0.18177608400583267, + "learning_rate": 3e-06, + "loss": 0.0184, + "step": 2294 + }, + { + "clip_ratio": 8.417508070124313e-05, + "epoch": 0.006371495677377443, + "grad_norm": 0.26509028673171997, + "kl": 0.18570785969495773, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006374271928217258, + "grad_norm": 0.12472037225961685, + "kl": 0.19241143763065338, + "learning_rate": 3e-06, + "loss": 0.0195, + "step": 2296 + }, + { + "clip_ratio": 0.0006900840235175565, + "epoch": 0.0063770481790570745, + "grad_norm": 0.1127580925822258, + "kl": 0.19401460886001587, + "learning_rate": 3e-06, + "loss": 0.0185, + "step": 2297 + }, + { + "clip_ratio": 0.0001705010508885607, + "epoch": 0.00637982442989689, + "grad_norm": 0.1149328202009201, + "kl": 0.20228491723537445, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 2298 + }, + { + "clip_ratio": 0.00026395946042612195, + "epoch": 0.006382600680736706, + "grad_norm": 0.09492156654596329, + "kl": 0.21177734434604645, + "learning_rate": 3e-06, + "loss": 0.0185, + "step": 2299 + }, + { + "clip_ratio": 0.00018846639432013035, + "epoch": 0.0063853769315765216, + "grad_norm": 0.09327538311481476, + "kl": 0.20418494194746017, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 2300 + }, + { + "clip_ratio": 0.00017548260802868754, + "epoch": 0.006388153182416338, + "grad_norm": 0.11833266168832779, + "kl": 0.2049153670668602, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 2301 + }, + { + "clip_ratio": 0.00044657984835794196, + "epoch": 0.006390929433256153, + "grad_norm": 0.1208672747015953, + "kl": 0.20857040584087372, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 2302 + }, + { + "clip_ratio": 0.001321177463978529, + "epoch": 0.0063937056840959695, + "grad_norm": 0.09639300405979156, + "kl": 0.20693976432085037, + "learning_rate": 3e-06, + "loss": 0.0173, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006396481934935785, + "grad_norm": 0.1254264861345291, + "kl": 0.21083715558052063, + "learning_rate": 3e-06, + "loss": 0.0165, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.6666717529297, + "epoch": 0.006399258185775601, + "grad_norm": 0.20140250027179718, + "kl": 0.22675880044698715, + "learning_rate": 3e-06, + "loss": 0.0569, + "reward": 0.4375000298023224, + "reward_std": 0.4317815750837326, + "rewards/countdown_reward_func": 0.4375000149011612, + "step": 2305, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0064020344366154175, + "grad_norm": 0.164041668176651, + "kl": 0.22609523683786392, + "learning_rate": 3e-06, + "loss": 0.057, + "step": 2306 + }, + { + "clip_ratio": 0.0010027538519352674, + "epoch": 0.006404810687455233, + "grad_norm": 0.2000647336244583, + "kl": 0.20753808319568634, + "learning_rate": 3e-06, + "loss": 0.0568, + "step": 2307 + }, + { + "clip_ratio": 0.00016891522682271898, + "epoch": 0.006407586938295049, + "grad_norm": 0.16357851028442383, + "kl": 0.23235772550106049, + "learning_rate": 3e-06, + "loss": 0.0562, + "step": 2308 + }, + { + "clip_ratio": 8.941345004132017e-05, + "epoch": 0.0064103631891348645, + "grad_norm": 0.18721310794353485, + "kl": 0.23869449645280838, + "learning_rate": 3e-06, + "loss": 0.055, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006413139439974681, + "grad_norm": 0.1559915840625763, + "kl": 0.26671281456947327, + "learning_rate": 3e-06, + "loss": 0.0561, + "step": 2310 + }, + { + "clip_ratio": 0.00017694846610538661, + "epoch": 0.006415915690814496, + "grad_norm": 0.15243524312973022, + "kl": 0.25702714920043945, + "learning_rate": 3e-06, + "loss": 0.0538, + "step": 2311 + }, + { + "clip_ratio": 8.999279816634953e-05, + "epoch": 0.0064186919416543125, + "grad_norm": 0.1509992778301239, + "kl": 0.26283329725265503, + "learning_rate": 3e-06, + "loss": 0.0525, + "step": 2312 + }, + { + "clip_ratio": 9.968101949198171e-05, + "epoch": 0.006421468192494128, + "grad_norm": 0.15100322663784027, + "kl": 0.24412598460912704, + "learning_rate": 3e-06, + "loss": 0.0516, + "step": 2313 + }, + { + "clip_ratio": 0.000363269035005942, + "epoch": 0.006424244443333944, + "grad_norm": 0.1439571976661682, + "kl": 0.2789086848497391, + "learning_rate": 3e-06, + "loss": 0.0529, + "step": 2314 + }, + { + "clip_ratio": 0.00019936203898396343, + "epoch": 0.00642702069417376, + "grad_norm": 0.15630877017974854, + "kl": 0.29722483456134796, + "learning_rate": 3e-06, + "loss": 0.0506, + "step": 2315 + }, + { + "clip_ratio": 0.00027720882644644007, + "epoch": 0.006429796945013576, + "grad_norm": 0.15677094459533691, + "kl": 0.32308197021484375, + "learning_rate": 3e-06, + "loss": 0.0526, + "step": 2316 + }, + { + "clip_ratio": 0.0004198152746539563, + "completion_length": 203.25, + "epoch": 0.006432573195853392, + "grad_norm": 0.12152662873268127, + "kl": 0.27852681279182434, + "learning_rate": 3e-06, + "loss": 0.0191, + "reward": 0.3812500238418579, + "reward_std": 0.37016279995441437, + "rewards/countdown_reward_func": 0.3812499940395355, + "step": 2317, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.00020843063975917175, + "epoch": 0.0064353494466932075, + "grad_norm": 0.139579176902771, + "kl": 0.3006761074066162, + "learning_rate": 3e-06, + "loss": 0.019, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006438125697533024, + "grad_norm": 0.14806227385997772, + "kl": 0.3408869802951813, + "learning_rate": 3e-06, + "loss": 0.0213, + "step": 2319 + }, + { + "clip_ratio": 0.00039015276706777513, + "epoch": 0.006440901948372839, + "grad_norm": 0.17348290979862213, + "kl": 0.34244829416275024, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 2320 + }, + { + "clip_ratio": 0.000316114688757807, + "epoch": 0.0064436781992126555, + "grad_norm": 0.15013259649276733, + "kl": 0.34295883774757385, + "learning_rate": 3e-06, + "loss": 0.022, + "step": 2321 + }, + { + "clip_ratio": 0.0004764232726301998, + "epoch": 0.006446454450052471, + "grad_norm": 0.2142786681652069, + "kl": 0.36009830236434937, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 2322 + }, + { + "clip_ratio": 0.00029448137502186, + "epoch": 0.006449230700892287, + "grad_norm": 0.12087388336658478, + "kl": 0.30946898460388184, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 2323 + }, + { + "clip_ratio": 0.00010347682109568268, + "epoch": 0.0064520069517321026, + "grad_norm": 0.15301726758480072, + "kl": 0.3245188295841217, + "learning_rate": 3e-06, + "loss": 0.0184, + "step": 2324 + }, + { + "clip_ratio": 0.00010495381866348907, + "epoch": 0.006454783202571919, + "grad_norm": 0.15143437683582306, + "kl": 0.3580729514360428, + "learning_rate": 3e-06, + "loss": 0.0202, + "step": 2325 + }, + { + "clip_ratio": 0.000572390272282064, + "epoch": 0.006457559453411734, + "grad_norm": 0.17135295271873474, + "kl": 0.35064497590065, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 2326 + }, + { + "clip_ratio": 0.00043031001405324787, + "epoch": 0.0064603357042515505, + "grad_norm": 0.14325761795043945, + "kl": 0.33868308365345, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 2327 + }, + { + "clip_ratio": 0.0015213553560897708, + "epoch": 0.006463111955091367, + "grad_norm": 0.158623605966568, + "kl": 0.3448447734117508, + "learning_rate": 3e-06, + "loss": 0.022, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.6666717529297, + "epoch": 0.006465888205931182, + "grad_norm": 0.17202085256576538, + "kl": 0.2865896373987198, + "learning_rate": 3e-06, + "loss": 0.0367, + "reward": 0.4541666954755783, + "reward_std": 0.3947114050388336, + "rewards/countdown_reward_func": 0.4541666954755783, + "step": 2329, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 9.904913167702034e-05, + "epoch": 0.0064686644567709985, + "grad_norm": 0.14967620372772217, + "kl": 0.2809770703315735, + "learning_rate": 3e-06, + "loss": 0.0355, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006471440707610814, + "grad_norm": 0.1649092584848404, + "kl": 0.2993633449077606, + "learning_rate": 3e-06, + "loss": 0.0373, + "step": 2331 + }, + { + "clip_ratio": 0.00019977435295004398, + "epoch": 0.00647421695845063, + "grad_norm": 0.1611592024564743, + "kl": 0.2826383709907532, + "learning_rate": 3e-06, + "loss": 0.0346, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0064769932092904456, + "grad_norm": 0.13679081201553345, + "kl": 0.28105440735816956, + "learning_rate": 3e-06, + "loss": 0.0359, + "step": 2333 + }, + { + "clip_ratio": 0.0002023549168370664, + "epoch": 0.006479769460130262, + "grad_norm": 0.15343226492404938, + "kl": 0.2757682204246521, + "learning_rate": 3e-06, + "loss": 0.0357, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006482545710970077, + "grad_norm": 0.17491856217384338, + "kl": 0.2747037261724472, + "learning_rate": 3e-06, + "loss": 0.0344, + "step": 2335 + }, + { + "clip_ratio": 0.00014551804633811116, + "epoch": 0.0064853219618098935, + "grad_norm": 0.14823581278324127, + "kl": 0.2763662040233612, + "learning_rate": 3e-06, + "loss": 0.0339, + "step": 2336 + }, + { + "clip_ratio": 9.391435014549643e-05, + "epoch": 0.006488098212649709, + "grad_norm": 0.15623505413532257, + "kl": 0.30250681936740875, + "learning_rate": 3e-06, + "loss": 0.0355, + "step": 2337 + }, + { + "clip_ratio": 0.00010330578516004607, + "epoch": 0.006490874463489525, + "grad_norm": 0.15354587137699127, + "kl": 0.2839023768901825, + "learning_rate": 3e-06, + "loss": 0.0342, + "step": 2338 + }, + { + "clip_ratio": 0.00020746888185385615, + "epoch": 0.0064936507143293415, + "grad_norm": 0.14572341740131378, + "kl": 0.2831754833459854, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 2339 + }, + { + "clip_ratio": 9.391435014549643e-05, + "epoch": 0.006496426965169157, + "grad_norm": 0.1376647800207138, + "kl": 0.28501126170158386, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.5416717529297, + "epoch": 0.006499203216008973, + "grad_norm": 0.2073156237602234, + "kl": 0.3356934189796448, + "learning_rate": 3e-06, + "loss": 0.0187, + "reward": 0.40000003576278687, + "reward_std": 0.2904737517237663, + "rewards/countdown_reward_func": 0.4000000059604645, + "step": 2341, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0065019794668487885, + "grad_norm": 0.14323925971984863, + "kl": 0.322573721408844, + "learning_rate": 3e-06, + "loss": 0.0175, + "step": 2342 + }, + { + "clip_ratio": 0.00025773196830414236, + "epoch": 0.006504755717688605, + "grad_norm": 0.3021996021270752, + "kl": 0.3119508624076843, + "learning_rate": 3e-06, + "loss": 0.0173, + "step": 2343 + }, + { + "clip_ratio": 0.0001781615719664842, + "epoch": 0.00650753196852842, + "grad_norm": 0.15104366838932037, + "kl": 0.31141962110996246, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 2344 + }, + { + "clip_ratio": 0.0002609418734209612, + "epoch": 0.0065103082193682365, + "grad_norm": 0.1799515038728714, + "kl": 0.3139701336622238, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 2345 + }, + { + "clip_ratio": 0.0003696906060213223, + "epoch": 0.006513084470208052, + "grad_norm": 0.15009325742721558, + "kl": 0.29629264771938324, + "learning_rate": 3e-06, + "loss": 0.0184, + "step": 2346 + }, + { + "clip_ratio": 0.00026359809999121353, + "epoch": 0.006515860721047868, + "grad_norm": 0.20159399509429932, + "kl": 0.30615514516830444, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 2347 + }, + { + "clip_ratio": 8.591065125074238e-05, + "epoch": 0.006518636971887684, + "grad_norm": 0.15064962208271027, + "kl": 0.28726859390735626, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 2348 + }, + { + "clip_ratio": 0.00031818235584069043, + "epoch": 0.0065214132227275, + "grad_norm": 0.15885993838310242, + "kl": 0.2687215358018875, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 2349 + }, + { + "clip_ratio": 0.0012777996016666293, + "epoch": 0.006524189473567316, + "grad_norm": 0.15064121782779694, + "kl": 0.26052138954401016, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 2350 + }, + { + "clip_ratio": 0.000863487075548619, + "epoch": 0.0065269657244071315, + "grad_norm": 0.17315149307250977, + "kl": 0.2593861371278763, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 2351 + }, + { + "clip_ratio": 0.001552396803162992, + "epoch": 0.006529741975246948, + "grad_norm": 0.13436666131019592, + "kl": 0.24121838808059692, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 2352 + }, + { + "clip_ratio": 0.00013102724915370345, + "completion_length": 208.56250762939453, + "epoch": 0.006532518226086763, + "grad_norm": 0.16845515370368958, + "kl": 0.2573952376842499, + "learning_rate": 3e-06, + "loss": 0.0076, + "reward": 0.2854166701436043, + "reward_std": 0.24690960347652435, + "rewards/countdown_reward_func": 0.2854166701436043, + "step": 2353, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00021652656141668558, + "epoch": 0.0065352944769265795, + "grad_norm": 0.37916499376296997, + "kl": 0.2523314207792282, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 2354 + }, + { + "clip_ratio": 9.293680341215804e-05, + "epoch": 0.006538070727766395, + "grad_norm": 0.16434593498706818, + "kl": 0.24893923103809357, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006540846978606211, + "grad_norm": 0.125566303730011, + "kl": 0.23655416816473007, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 2356 + }, + { + "clip_ratio": 0.00024826216395013034, + "epoch": 0.0065436232294460266, + "grad_norm": 0.130938321352005, + "kl": 0.220244362950325, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 2357 + }, + { + "clip_ratio": 0.00016687953029759228, + "epoch": 0.006546399480285843, + "grad_norm": 0.18719619512557983, + "kl": 0.2183583378791809, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 2358 + }, + { + "clip_ratio": 0.0015653396840207279, + "epoch": 0.006549175731125658, + "grad_norm": 0.16384494304656982, + "kl": 0.20466335862874985, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 2359 + }, + { + "clip_ratio": 0.0020808427361771464, + "epoch": 0.0065519519819654745, + "grad_norm": 0.1263015866279602, + "kl": 0.1988421380519867, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 2360 + }, + { + "clip_ratio": 0.0030670628184452653, + "epoch": 0.006554728232805291, + "grad_norm": 0.13618893921375275, + "kl": 0.20048832148313522, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 2361 + }, + { + "clip_ratio": 0.004884407157078385, + "epoch": 0.006557504483645106, + "grad_norm": 0.12469899654388428, + "kl": 0.18725426495075226, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 2362 + }, + { + "clip_ratio": 0.006509467493742704, + "epoch": 0.0065602807344849225, + "grad_norm": 0.11395581811666489, + "kl": 0.17675061523914337, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 2363 + }, + { + "clip_ratio": 0.010824932716786861, + "epoch": 0.006563056985324738, + "grad_norm": 0.14886018633842468, + "kl": 0.17395079135894775, + "learning_rate": 3e-06, + "loss": -0.0004, + "step": 2364 + }, + { + "clip_ratio": 0.00041061273077502847, + "completion_length": 225.1041717529297, + "epoch": 0.006565833236164554, + "grad_norm": 0.10056737810373306, + "kl": 0.14540190249681473, + "learning_rate": 3e-06, + "loss": 0.0074, + "reward": 0.3770833760499954, + "reward_std": 0.3362167477607727, + "rewards/countdown_reward_func": 0.37708336114883423, + "step": 2365, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0065686094870043696, + "grad_norm": 0.11986743658781052, + "kl": 0.14874933660030365, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2366 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.006571385737844186, + "grad_norm": 0.13911956548690796, + "kl": 0.1324518918991089, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 2367 + }, + { + "clip_ratio": 0.00030378723749890924, + "epoch": 0.006574161988684001, + "grad_norm": 0.12894724309444427, + "kl": 0.1367444545030594, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2368 + }, + { + "clip_ratio": 0.0003762036649277434, + "epoch": 0.0065769382395238175, + "grad_norm": 0.10234810411930084, + "kl": 0.1384653076529503, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 2369 + }, + { + "clip_ratio": 0.0003675257903523743, + "epoch": 0.006579714490363633, + "grad_norm": 0.11732344329357147, + "kl": 0.12812578678131104, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2370 + }, + { + "clip_ratio": 0.0004069010537932627, + "epoch": 0.006582490741203449, + "grad_norm": 0.09635314345359802, + "kl": 0.1304508000612259, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 2371 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0065852669920432655, + "grad_norm": 0.11662594228982925, + "kl": 0.13656722754240036, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 2372 + }, + { + "clip_ratio": 0.0008264775387942791, + "epoch": 0.006588043242883081, + "grad_norm": 0.11650729924440384, + "kl": 0.12324058637022972, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 2373 + }, + { + "clip_ratio": 0.0005027086590416729, + "epoch": 0.006590819493722897, + "grad_norm": 0.12746286392211914, + "kl": 0.12780633196234703, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 2374 + }, + { + "clip_ratio": 0.00035271809611003846, + "epoch": 0.0065935957445627125, + "grad_norm": 0.10422283411026001, + "kl": 0.13124725595116615, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 2375 + }, + { + "clip_ratio": 0.0005158810163266025, + "epoch": 0.006596371995402529, + "grad_norm": 0.11545603722333908, + "kl": 0.12114672735333443, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 2376 + }, + { + "clip_ratio": 0.00017926588043337688, + "completion_length": 227.45834350585938, + "epoch": 0.006599148246242344, + "grad_norm": 0.09546420723199844, + "kl": 0.1200266070663929, + "learning_rate": 3e-06, + "loss": 0.0108, + "reward": 0.37708333134651184, + "reward_std": 0.30922502279281616, + "rewards/countdown_reward_func": 0.37708331644535065, + "step": 2377, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0066019244970821605, + "grad_norm": 0.11293301731348038, + "kl": 0.11872411891818047, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 2378 + }, + { + "clip_ratio": 0.00032814600854180753, + "epoch": 0.006604700747921976, + "grad_norm": 0.10137040168046951, + "kl": 0.1121567115187645, + "learning_rate": 3e-06, + "loss": 0.011, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006607476998761792, + "grad_norm": 0.12760894000530243, + "kl": 0.11916434392333031, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006610253249601608, + "grad_norm": 0.13065321743488312, + "kl": 0.11660314351320267, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 2381 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.006613029500441424, + "grad_norm": 0.13641607761383057, + "kl": 0.11466880142688751, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00661580575128124, + "grad_norm": 0.09623085707426071, + "kl": 0.11718219518661499, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 2383 + }, + { + "clip_ratio": 0.0006848488337709568, + "epoch": 0.0066185820021210555, + "grad_norm": 0.10395172983407974, + "kl": 0.11643769219517708, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 2384 + }, + { + "clip_ratio": 0.0002559744389145635, + "epoch": 0.006621358252960872, + "grad_norm": 0.08755200356245041, + "kl": 0.10955745726823807, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006624134503800687, + "grad_norm": 0.09580729156732559, + "kl": 0.11723517999053001, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0066269107546405035, + "grad_norm": 0.11413105577230453, + "kl": 0.11416671797633171, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 2387 + }, + { + "clip_ratio": 0.00010495381866348907, + "epoch": 0.006629687005480319, + "grad_norm": 0.14028304815292358, + "kl": 0.11132171005010605, + "learning_rate": 3e-06, + "loss": 0.0099, + "step": 2388 + }, + { + "clip_ratio": 0.0007585418788949028, + "completion_length": 243.0, + "epoch": 0.006632463256320135, + "grad_norm": 0.12180110812187195, + "kl": 0.11260923743247986, + "learning_rate": 3e-06, + "loss": 0.0024, + "reward": 0.3229166716337204, + "reward_std": 0.3841392993927002, + "rewards/countdown_reward_func": 0.3229166716337204, + "step": 2389, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0066352395071599506, + "grad_norm": 0.10579289495944977, + "kl": 0.12057583779096603, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 2390 + }, + { + "clip_ratio": 0.00044376778532750905, + "epoch": 0.006638015757999767, + "grad_norm": 0.1263391077518463, + "kl": 0.11325542628765106, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 2391 + }, + { + "clip_ratio": 0.00017295530415140092, + "epoch": 0.006640792008839583, + "grad_norm": 0.11693087965250015, + "kl": 0.1094004213809967, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 2392 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.0066435682596793985, + "grad_norm": 0.09953317046165466, + "kl": 0.11280214041471481, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 2393 + }, + { + "clip_ratio": 0.00016556291666347533, + "epoch": 0.006646344510519215, + "grad_norm": 0.09708958119153976, + "kl": 0.10897836834192276, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 2394 + }, + { + "clip_ratio": 0.00018115942657459527, + "epoch": 0.00664912076135903, + "grad_norm": 0.12472333759069443, + "kl": 0.10994191840291023, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 2395 + }, + { + "clip_ratio": 0.0002494906075298786, + "epoch": 0.0066518970121988465, + "grad_norm": 0.10897238552570343, + "kl": 0.1151459850370884, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 2396 + }, + { + "clip_ratio": 0.0005148796408320777, + "epoch": 0.006654673263038662, + "grad_norm": 0.12806881964206696, + "kl": 0.10942655429244041, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 2397 + }, + { + "clip_ratio": 0.000273729907348752, + "epoch": 0.006657449513878478, + "grad_norm": 0.09647449851036072, + "kl": 0.1066247895359993, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 2398 + }, + { + "clip_ratio": 8.394895849050954e-05, + "epoch": 0.0066602257647182935, + "grad_norm": 0.11547134071588516, + "kl": 0.10796680673956871, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 2399 + }, + { + "clip_ratio": 8.138021075865254e-05, + "epoch": 0.00666300201555811, + "grad_norm": 0.10167829692363739, + "kl": 0.10324173793196678, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 2400 + }, + { + "clip_ratio": 0.001098598149837926, + "completion_length": 229.6041717529297, + "epoch": 0.006665778266397925, + "grad_norm": 0.12295004725456238, + "kl": 0.10780546069145203, + "learning_rate": 3e-06, + "loss": 0.017, + "reward": 0.28333336114883423, + "reward_std": 0.2339424267411232, + "rewards/countdown_reward_func": 0.28333333134651184, + "step": 2401, + "zero_std_ratio": 0.25 + }, + { + "clip_ratio": 0.00017353141447529197, + "epoch": 0.0066685545172377415, + "grad_norm": 0.08983492106199265, + "kl": 0.10426361858844757, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 2402 + }, + { + "clip_ratio": 8.196721319109201e-05, + "epoch": 0.006671330768077558, + "grad_norm": 0.06798221170902252, + "kl": 0.09816591441631317, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 2403 + }, + { + "clip_ratio": 0.00025100400671362877, + "epoch": 0.006674107018917373, + "grad_norm": 0.08332613110542297, + "kl": 0.10509132966399193, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 2404 + }, + { + "clip_ratio": 0.00019771909865085036, + "epoch": 0.0066768832697571895, + "grad_norm": 0.08753097057342529, + "kl": 0.1027427539229393, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 2405 + }, + { + "clip_ratio": 0.0006030112272128463, + "epoch": 0.006679659520597005, + "grad_norm": 0.06988681852817535, + "kl": 0.10366249829530716, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 2406 + }, + { + "clip_ratio": 0.0005557472104555927, + "epoch": 0.006682435771436821, + "grad_norm": 0.09204717725515366, + "kl": 0.10658174008131027, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 2407 + }, + { + "clip_ratio": 8.366800466319546e-05, + "epoch": 0.0066852120222766365, + "grad_norm": 0.08877309411764145, + "kl": 0.10358146205544472, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 2408 + }, + { + "clip_ratio": 8.366800466319546e-05, + "epoch": 0.006687988273116453, + "grad_norm": 0.06935451179742813, + "kl": 0.0986652821302414, + "learning_rate": 3e-06, + "loss": 0.0153, + "step": 2409 + }, + { + "clip_ratio": 0.0003415837127249688, + "epoch": 0.006690764523956268, + "grad_norm": 0.0854223370552063, + "kl": 0.10557211935520172, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 2410 + }, + { + "clip_ratio": 0.0001140510939876549, + "epoch": 0.0066935407747960845, + "grad_norm": 0.08558721095323563, + "kl": 0.10372918471693993, + "learning_rate": 3e-06, + "loss": 0.0151, + "step": 2411 + }, + { + "clip_ratio": 0.0006259864894673228, + "epoch": 0.0066963170256359, + "grad_norm": 0.06793802231550217, + "kl": 0.10636503249406815, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 2412 + }, + { + "clip_ratio": 8.256275032181293e-05, + "completion_length": 230.20834350585938, + "epoch": 0.006699093276475716, + "grad_norm": 0.07189203053712845, + "kl": 0.11288557946681976, + "learning_rate": 3e-06, + "loss": 0.0073, + "reward": 0.1875000074505806, + "reward_std": 0.20407745242118835, + "rewards/countdown_reward_func": 0.1875, + "step": 2413, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.00019181969400960952, + "epoch": 0.0067018695273155324, + "grad_norm": 0.07376082241535187, + "kl": 0.10653312504291534, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006704645778155348, + "grad_norm": 0.08378919959068298, + "kl": 0.11886245012283325, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 2415 + }, + { + "clip_ratio": 9.245562250725925e-05, + "epoch": 0.006707422028995164, + "grad_norm": 0.06871742010116577, + "kl": 0.11657249182462692, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2416 + }, + { + "clip_ratio": 0.0008867568030836992, + "epoch": 0.0067101982798349795, + "grad_norm": 0.06403800845146179, + "kl": 0.10760099440813065, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 2417 + }, + { + "clip_ratio": 9.225092071574181e-05, + "epoch": 0.006712974530674796, + "grad_norm": 0.06900127232074738, + "kl": 0.10545783117413521, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006715750781514611, + "grad_norm": 0.07691039890050888, + "kl": 0.11479473859071732, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 2419 + }, + { + "clip_ratio": 0.00036602198088075966, + "epoch": 0.0067185270323544275, + "grad_norm": 0.07924985140562057, + "kl": 0.1063217967748642, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 2420 + }, + { + "clip_ratio": 0.0003559212273103185, + "epoch": 0.006721303283194243, + "grad_norm": 0.08351549506187439, + "kl": 0.11921686306595802, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 2421 + }, + { + "clip_ratio": 0.00027179565222468227, + "epoch": 0.006724079534034059, + "grad_norm": 0.06077051907777786, + "kl": 0.1158263087272644, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 2422 + }, + { + "clip_ratio": 0.0011344332597218454, + "epoch": 0.0067268557848738746, + "grad_norm": 0.06291055679321289, + "kl": 0.10745818167924881, + "learning_rate": 3e-06, + "loss": 0.0076, + "step": 2423 + }, + { + "clip_ratio": 0.0003693613543873653, + "epoch": 0.006729632035713691, + "grad_norm": 0.0665401816368103, + "kl": 0.10476551577448845, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 2424 + }, + { + "clip_ratio": 8.928571332944557e-05, + "completion_length": 215.6875, + "epoch": 0.006732408286553507, + "grad_norm": 0.08517291396856308, + "kl": 0.1074470691382885, + "learning_rate": 3e-06, + "loss": 0.0123, + "reward": 0.3604166805744171, + "reward_std": 0.28518571704626083, + "rewards/countdown_reward_func": 0.3604166656732559, + "step": 2425, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 0.0005277531163301319, + "epoch": 0.0067351845373933225, + "grad_norm": 0.12994913756847382, + "kl": 0.11232437938451767, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 2426 + }, + { + "clip_ratio": 0.00017857142665889114, + "epoch": 0.006737960788233139, + "grad_norm": 0.20326417684555054, + "kl": 0.11356884241104126, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 2427 + }, + { + "clip_ratio": 0.0001188212918350473, + "epoch": 0.006740737039072954, + "grad_norm": 0.078920379281044, + "kl": 0.11158164963126183, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 2428 + }, + { + "clip_ratio": 0.0001050420178216882, + "epoch": 0.0067435132899127705, + "grad_norm": 0.07914809137582779, + "kl": 0.11002899706363678, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 2429 + }, + { + "clip_ratio": 0.00022386331693269312, + "epoch": 0.006746289540752586, + "grad_norm": 0.10386265814304352, + "kl": 0.1144975870847702, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 2430 + }, + { + "clip_ratio": 0.00030388979939743876, + "epoch": 0.006749065791592402, + "grad_norm": 0.09454360604286194, + "kl": 0.10659191012382507, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 2431 + }, + { + "clip_ratio": 0.00039420771645382047, + "epoch": 0.0067518420424322175, + "grad_norm": 0.10722549259662628, + "kl": 0.11116393283009529, + "learning_rate": 3e-06, + "loss": 0.0121, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006754618293272034, + "grad_norm": 0.1120673343539238, + "kl": 0.11296507716178894, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 2433 + }, + { + "clip_ratio": 0.0006201282667461783, + "epoch": 0.006757394544111849, + "grad_norm": 0.08440835773944855, + "kl": 0.11203001067042351, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0067601707949516655, + "grad_norm": 0.07518981397151947, + "kl": 0.10907341167330742, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 2435 + }, + { + "clip_ratio": 0.0005205198394833133, + "epoch": 0.006762947045791482, + "grad_norm": 0.09459592401981354, + "kl": 0.11349227651953697, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 2436 + }, + { + "clip_ratio": 0.00026166778843617067, + "completion_length": 230.75000762939453, + "epoch": 0.006765723296631297, + "grad_norm": 0.07740966230630875, + "kl": 0.10574423149228096, + "learning_rate": 3e-06, + "loss": 0.0033, + "reward": 0.2666666731238365, + "reward_std": 0.2285463623702526, + "rewards/countdown_reward_func": 0.2666666731238365, + "step": 2437, + "zero_std_ratio": 0.5 + }, + { + "clip_ratio": 8.802816591924056e-05, + "epoch": 0.0067684995474711135, + "grad_norm": 0.10261306911706924, + "kl": 0.10881078243255615, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 2438 + }, + { + "clip_ratio": 0.00017181201837956905, + "epoch": 0.006771275798310929, + "grad_norm": 0.09754025936126709, + "kl": 0.11322496458888054, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006774052049150745, + "grad_norm": 0.07824526727199554, + "kl": 0.10903006792068481, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 2440 + }, + { + "clip_ratio": 0.0003595175876398571, + "epoch": 0.0067768282999905605, + "grad_norm": 0.18730951845645905, + "kl": 0.11356868967413902, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 2441 + }, + { + "clip_ratio": 8.85896515683271e-05, + "epoch": 0.006779604550830377, + "grad_norm": 0.09276639670133591, + "kl": 0.10244229808449745, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 2442 + }, + { + "clip_ratio": 0.00026698306464822963, + "epoch": 0.006782380801670192, + "grad_norm": 0.07316610962152481, + "kl": 0.10353904590010643, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 2443 + }, + { + "clip_ratio": 8.802816591924056e-05, + "epoch": 0.0067851570525100085, + "grad_norm": 0.09989767521619797, + "kl": 0.10630606859922409, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 2444 + }, + { + "clip_ratio": 0.0004429482505656779, + "epoch": 0.006787933303349824, + "grad_norm": 0.08019357174634933, + "kl": 0.11003177613019943, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "epoch": 0.00679070955418964, + "grad_norm": 0.07700924575328827, + "kl": 0.10536712780594826, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 2446 + }, + { + "clip_ratio": 0.0005259623358142562, + "epoch": 0.0067934858050294564, + "grad_norm": 0.18418793380260468, + "kl": 0.11070561036467552, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 2447 + }, + { + "clip_ratio": 0.0001726998161757365, + "epoch": 0.006796262055869272, + "grad_norm": 0.10063984245061874, + "kl": 0.1003638207912445, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 2448 + }, + { + "clip_ratio": 0.00025201612152159214, + "completion_length": 229.39583587646484, + "epoch": 0.006799038306709088, + "grad_norm": 0.11044564843177795, + "kl": 0.10882113873958588, + "learning_rate": 3e-06, + "loss": 0.022, + "reward": 0.4333333671092987, + "reward_std": 0.4348773956298828, + "rewards/countdown_reward_func": 0.4333333671092987, + "step": 2449, + "zero_std_ratio": 0.0 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0068018145575489035, + "grad_norm": 0.10996294021606445, + "kl": 0.10753875970840454, + "learning_rate": 3e-06, + "loss": 0.0224, + "step": 2450 + }, + { + "clip_ratio": 0.00017908310110215098, + "epoch": 0.00680459080838872, + "grad_norm": 0.13970771431922913, + "kl": 0.10632171854376793, + "learning_rate": 3e-06, + "loss": 0.0232, + "step": 2451 + }, + { + "clip_ratio": 0.0005040322430431843, + "epoch": 0.006807367059228535, + "grad_norm": 0.1698501855134964, + "kl": 0.10705317929387093, + "learning_rate": 3e-06, + "loss": 0.0207, + "step": 2452 + }, + { + "clip_ratio": 8.585165051044896e-05, + "epoch": 0.0068101433100683515, + "grad_norm": 0.12471974641084671, + "kl": 0.09810657054185867, + "learning_rate": 3e-06, + "loss": 0.0227, + "step": 2453 + }, + { + "clip_ratio": 9.571210102876648e-05, + "epoch": 0.006812919560908167, + "grad_norm": 0.12794393301010132, + "kl": 0.097657959908247, + "learning_rate": 3e-06, + "loss": 0.0206, + "step": 2454 + }, + { + "clip_ratio": 0.0003692590180435218, + "epoch": 0.006815695811747983, + "grad_norm": 0.11906078457832336, + "kl": 0.10667447000741959, + "learning_rate": 3e-06, + "loss": 0.0221, + "step": 2455 + }, + { + "clip_ratio": 9.999999747378752e-05, + "epoch": 0.0068184720625877986, + "grad_norm": 0.24868349730968475, + "kl": 0.10781405493617058, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 2456 + }, + { + "clip_ratio": 0.00026493475888855755, + "epoch": 0.006821248313427615, + "grad_norm": 0.10990463197231293, + "kl": 0.10687519982457161, + "learning_rate": 3e-06, + "loss": 0.0217, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006824024564267431, + "grad_norm": 0.16284221410751343, + "kl": 0.10830774903297424, + "learning_rate": 3e-06, + "loss": 0.0189, + "step": 2458 + }, + { + "clip_ratio": 0.00017539320106152445, + "epoch": 0.0068268008151072465, + "grad_norm": 0.11818123608827591, + "kl": 0.09956466034054756, + "learning_rate": 3e-06, + "loss": 0.0216, + "step": 2459 + }, + { + "clip_ratio": 0.0002764785604085773, + "epoch": 0.006829577065947063, + "grad_norm": 0.1193012222647667, + "kl": 0.09965555369853973, + "learning_rate": 3e-06, + "loss": 0.0197, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.64583587646484, + "epoch": 0.006832353316786878, + "grad_norm": 0.09964413940906525, + "kl": 0.11183221638202667, + "learning_rate": 3e-06, + "loss": 0.0273, + "reward": 0.30625002086162567, + "reward_std": 0.28535499423742294, + "rewards/countdown_reward_func": 0.30625002086162567, + "step": 2461, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 9.630200656829402e-05, + "epoch": 0.0068351295676266945, + "grad_norm": 0.09929607808589935, + "kl": 0.112647145986557, + "learning_rate": 3e-06, + "loss": 0.0274, + "step": 2462 + }, + { + "clip_ratio": 0.0007768171490170062, + "epoch": 0.00683790581846651, + "grad_norm": 0.10724590718746185, + "kl": 0.11417991295456886, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 2463 + }, + { + "clip_ratio": 0.000346161745255813, + "epoch": 0.006840682069306326, + "grad_norm": 0.0905621126294136, + "kl": 0.11805011332035065, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 2464 + }, + { + "clip_ratio": 0.000299699509923812, + "epoch": 0.0068434583201461415, + "grad_norm": 0.09721238166093826, + "kl": 0.11963087320327759, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 2465 + }, + { + "clip_ratio": 8.614748367108405e-05, + "epoch": 0.006846234570985958, + "grad_norm": 0.09523501247167587, + "kl": 0.12303963676095009, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 2466 + }, + { + "clip_ratio": 0.0002624175394885242, + "epoch": 0.006849010821825773, + "grad_norm": 0.09940183162689209, + "kl": 0.12013134360313416, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 2467 + }, + { + "clip_ratio": 0.0002837783540599048, + "epoch": 0.0068517870726655895, + "grad_norm": 0.09632185846567154, + "kl": 0.12204306945204735, + "learning_rate": 3e-06, + "loss": 0.0259, + "step": 2468 + }, + { + "clip_ratio": 0.0003724969647009857, + "epoch": 0.006854563323505406, + "grad_norm": 0.10749493539333344, + "kl": 0.12405981495976448, + "learning_rate": 3e-06, + "loss": 0.0258, + "step": 2469 + }, + { + "clip_ratio": 8.771930151851848e-05, + "epoch": 0.006857339574345221, + "grad_norm": 0.084246926009655, + "kl": 0.12915344536304474, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0068601158251850375, + "grad_norm": 0.0897546112537384, + "kl": 0.131361685693264, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006862892076024853, + "grad_norm": 0.07698703557252884, + "kl": 0.1356644704937935, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 2472 + }, + { + "clip_ratio": 0.0003964965872000903, + "completion_length": 210.31250762939453, + "epoch": 0.006865668326864669, + "grad_norm": 0.089408740401268, + "kl": 0.12919557839632034, + "learning_rate": 3e-06, + "loss": 0.0279, + "reward": 0.3604166805744171, + "reward_std": 0.3206951916217804, + "rewards/countdown_reward_func": 0.3604166805744171, + "step": 2473, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0005657835863530636, + "epoch": 0.0068684445777044845, + "grad_norm": 0.10280496627092361, + "kl": 0.13355571776628494, + "learning_rate": 3e-06, + "loss": 0.0279, + "step": 2474 + }, + { + "clip_ratio": 0.00030004348809598014, + "epoch": 0.006871220828544301, + "grad_norm": 0.09122777730226517, + "kl": 0.13717789202928543, + "learning_rate": 3e-06, + "loss": 0.0281, + "step": 2475 + }, + { + "clip_ratio": 9.578544268151745e-05, + "epoch": 0.006873997079384116, + "grad_norm": 0.08694249391555786, + "kl": 0.1368713080883026, + "learning_rate": 3e-06, + "loss": 0.0283, + "step": 2476 + }, + { + "clip_ratio": 9.912767563946545e-05, + "epoch": 0.0068767733302239325, + "grad_norm": 0.10039347410202026, + "kl": 0.1407921016216278, + "learning_rate": 3e-06, + "loss": 0.0278, + "step": 2477 + }, + { + "clip_ratio": 8.45165632199496e-05, + "epoch": 0.006879549581063748, + "grad_norm": 0.12389730662107468, + "kl": 0.15291880071163177, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 2478 + }, + { + "clip_ratio": 0.0001690331264398992, + "epoch": 0.006882325831903564, + "grad_norm": 0.09755761176347733, + "kl": 0.14044442027807236, + "learning_rate": 3e-06, + "loss": 0.0272, + "step": 2479 + }, + { + "clip_ratio": 0.00030831367621431127, + "epoch": 0.0068851020827433804, + "grad_norm": 0.11109394580125809, + "kl": 0.1446225941181183, + "learning_rate": 3e-06, + "loss": 0.0266, + "step": 2480 + }, + { + "clip_ratio": 0.0003083823903580196, + "epoch": 0.006887878333583196, + "grad_norm": 0.10463670641183853, + "kl": 0.14909590035676956, + "learning_rate": 3e-06, + "loss": 0.0263, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006890654584423012, + "grad_norm": 0.09704513847827911, + "kl": 0.14730746299028397, + "learning_rate": 3e-06, + "loss": 0.0267, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0068934308352628275, + "grad_norm": 0.09785734117031097, + "kl": 0.15231196582317352, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 2483 + }, + { + "clip_ratio": 8.45165632199496e-05, + "epoch": 0.006896207086102644, + "grad_norm": 0.08950638771057129, + "kl": 0.1649750992655754, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 2484 + }, + { + "clip_ratio": 8.138021075865254e-05, + "completion_length": 230.9166717529297, + "epoch": 0.006898983336942459, + "grad_norm": 0.08120502531528473, + "kl": 0.12933644652366638, + "learning_rate": 3e-06, + "loss": 0.0153, + "reward": 0.395833358168602, + "reward_std": 0.3388310372829437, + "rewards/countdown_reward_func": 0.3958333507180214, + "step": 2485, + "zero_std_ratio": 0.125 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0069017595877822755, + "grad_norm": 0.12228546291589737, + "kl": 0.13283831626176834, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 2486 + }, + { + "clip_ratio": 0.00025467218802077696, + "epoch": 0.006904535838622091, + "grad_norm": 0.11056458950042725, + "kl": 0.1325739100575447, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 2487 + }, + { + "clip_ratio": 0.00018615041335579008, + "epoch": 0.006907312089461907, + "grad_norm": 0.09804553538560867, + "kl": 0.13264501839876175, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0069100883403017226, + "grad_norm": 0.10097736865282059, + "kl": 0.1376706250011921, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006912864591141539, + "grad_norm": 0.09442541003227234, + "kl": 0.13518304377794266, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "epoch": 0.006915640841981355, + "grad_norm": 0.08362875878810883, + "kl": 0.13955392688512802, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 2491 + }, + { + "clip_ratio": 0.0002792256127577275, + "epoch": 0.0069184170928211705, + "grad_norm": 0.09364964812994003, + "kl": 0.14495066553354263, + "learning_rate": 3e-06, + "loss": 0.0149, + "step": 2492 + }, + { + "clip_ratio": 0.00016947041149251163, + "epoch": 0.006921193343660987, + "grad_norm": 0.0926862508058548, + "kl": 0.144854336977005, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 2493 + }, + { + "clip_ratio": 0.0006245378099265508, + "epoch": 0.006923969594500802, + "grad_norm": 0.09652328491210938, + "kl": 0.1455134153366089, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 2494 + }, + { + "clip_ratio": 0.00016276042151730508, + "epoch": 0.0069267458453406185, + "grad_norm": 0.09675440937280655, + "kl": 0.14825546741485596, + "learning_rate": 3e-06, + "loss": 0.0149, + "step": 2495 + }, + { + "clip_ratio": 0.000413611029216554, + "epoch": 0.006929522096180434, + "grad_norm": 0.11376820504665375, + "kl": 0.14565087854862213, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 2496 + }, + { + "clip_ratio": 0.00010024057701230049, + "completion_length": 216.45833587646484, + "epoch": 0.00693229834702025, + "grad_norm": 0.11657664179801941, + "kl": 0.16808811575174332, + "learning_rate": 3e-06, + "loss": 0.0155, + "reward": 0.28541669249534607, + "reward_std": 0.21314848214387894, + "rewards/countdown_reward_func": 0.2854166775941849, + "step": 2497, + "zero_std_ratio": 0.375 + }, + { + "clip_ratio": 9.321401739725843e-05, + "epoch": 0.0069350745978600655, + "grad_norm": 0.0914500430226326, + "kl": 0.18082208931446075, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 2498 + }, + { + "clip_ratio": 0.00028280332480790094, + "epoch": 0.006937850848699882, + "grad_norm": 0.10090383887290955, + "kl": 0.17869839072227478, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 2499 + }, + { + "clip_ratio": 0.00018671700672712177, + "epoch": 0.006940627099539697, + "grad_norm": 0.07849971204996109, + "kl": 0.1728534698486328, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 3601980, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}