diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,11658 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.02498625755834291, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 228.0416717529297, + "epoch": 1.6657505038895273e-05, + "grad_norm": 0.04593616724014282, + "learning_rate": 1.5789473684210525e-07, + "loss": -0.0109, + "reward": 0.286458358168602, + "rewards/countdown_reward_func": 0.2864583432674408, + "step": 1 + }, + { + "epoch": 3.3315010077790546e-05, + "grad_norm": 0.045592524111270905, + "learning_rate": 3.157894736842105e-07, + "loss": -0.0227, + "step": 2 + }, + { + "epoch": 4.997251511668582e-05, + "grad_norm": 0.04625459387898445, + "learning_rate": 4.736842105263158e-07, + "loss": -0.0111, + "step": 3 + }, + { + "epoch": 6.663002015558109e-05, + "grad_norm": 0.05273105204105377, + "learning_rate": 6.31578947368421e-07, + "loss": -0.0228, + "step": 4 + }, + { + "completion_length": 236.7291717529297, + "epoch": 8.328752519447637e-05, + "grad_norm": 0.03238566964864731, + "learning_rate": 7.894736842105263e-07, + "loss": 0.0176, + "reward": 0.24791668355464935, + "rewards/countdown_reward_func": 0.24791667610406876, + "step": 5 + }, + { + "epoch": 9.994503023337164e-05, + "grad_norm": 0.03298381716012955, + "learning_rate": 9.473684210526316e-07, + "loss": -0.0198, + "step": 6 + }, + { + "epoch": 0.00011660253527226692, + "grad_norm": 0.03196447715163231, + "learning_rate": 1.1052631578947369e-06, + "loss": 0.0176, + "step": 7 + }, + { + "epoch": 0.00013326004031116219, + "grad_norm": 0.03542747348546982, + "learning_rate": 1.263157894736842e-06, + "loss": -0.0199, + "step": 8 + }, + { + "completion_length": 235.45834350585938, + "epoch": 0.00014991754535005747, + "grad_norm": 0.03613632544875145, + "learning_rate": 1.4210526315789473e-06, + "loss": 0.0019, + "reward": 0.22604168951511383, + "rewards/countdown_reward_func": 0.22604166716337204, + "step": 9 + }, + { + "epoch": 0.00016657505038895275, + "grad_norm": 0.0431361198425293, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.0612, + "step": 10 + }, + { + "epoch": 0.00018323255542784803, + "grad_norm": 0.03627052903175354, + "learning_rate": 1.736842105263158e-06, + "loss": 0.002, + "step": 11 + }, + { + "epoch": 0.00019989006046674328, + "grad_norm": 0.04315226897597313, + "learning_rate": 1.8947368421052632e-06, + "loss": 0.0612, + "step": 12 + }, + { + "completion_length": 234.5, + "epoch": 0.00021654756550563856, + "grad_norm": 0.030635477975010872, + "learning_rate": 2.0526315789473687e-06, + "loss": 0.0016, + "reward": 0.20729167759418488, + "rewards/countdown_reward_func": 0.20729167014360428, + "step": 13 + }, + { + "epoch": 0.00023320507054453384, + "grad_norm": 0.042695969343185425, + "learning_rate": 2.2105263157894738e-06, + "loss": -0.0537, + "step": 14 + }, + { + "epoch": 0.0002498625755834291, + "grad_norm": 0.02931581437587738, + "learning_rate": 2.368421052631579e-06, + "loss": 0.0017, + "step": 15 + }, + { + "epoch": 0.00026652008062232437, + "grad_norm": 0.04035942628979683, + "learning_rate": 2.526315789473684e-06, + "loss": -0.054, + "step": 16 + }, + { + "completion_length": 236.68750762939453, + "epoch": 0.0002831775856612197, + "grad_norm": 0.043460357934236526, + "learning_rate": 2.6842105263157895e-06, + "loss": -0.0119, + "reward": 0.28541669249534607, + "rewards/countdown_reward_func": 0.2854166775941849, + "step": 17 + }, + { + "epoch": 0.00029983509070011493, + "grad_norm": 0.042771078646183014, + "learning_rate": 2.8421052631578946e-06, + "loss": -0.034, + "step": 18 + }, + { + "epoch": 0.0003164925957390102, + "grad_norm": 0.041939638555049896, + "learning_rate": 3e-06, + "loss": -0.0118, + "step": 19 + }, + { + "epoch": 0.0003331501007779055, + "grad_norm": 0.041805874556303024, + "learning_rate": 3e-06, + "loss": -0.0339, + "step": 20 + }, + { + "completion_length": 230.71875762939453, + "epoch": 0.00034980760581680074, + "grad_norm": 0.047246791422367096, + "learning_rate": 3e-06, + "loss": 0.0326, + "reward": 0.3010416775941849, + "rewards/countdown_reward_func": 0.3010416775941849, + "step": 21 + }, + { + "epoch": 0.00036646511085569605, + "grad_norm": 0.04929734393954277, + "learning_rate": 3e-06, + "loss": 0.0149, + "step": 22 + }, + { + "epoch": 0.0003831226158945913, + "grad_norm": 0.04676107317209244, + "learning_rate": 3e-06, + "loss": 0.0326, + "step": 23 + }, + { + "epoch": 0.00039978012093348656, + "grad_norm": 0.05400047451257706, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 24 + }, + { + "completion_length": 229.46875, + "epoch": 0.00041643762597238186, + "grad_norm": 0.043805044144392014, + "learning_rate": 3e-06, + "loss": 0.0519, + "reward": 0.22708335518836975, + "rewards/countdown_reward_func": 0.22708334028720856, + "step": 25 + }, + { + "epoch": 0.0004330951310112771, + "grad_norm": 0.03437976911664009, + "learning_rate": 3e-06, + "loss": 0.0332, + "step": 26 + }, + { + "epoch": 0.0004497526360501724, + "grad_norm": 0.044771112501621246, + "learning_rate": 3e-06, + "loss": 0.052, + "step": 27 + }, + { + "epoch": 0.0004664101410890677, + "grad_norm": 0.04903418570756912, + "learning_rate": 3e-06, + "loss": 0.0332, + "step": 28 + }, + { + "completion_length": 229.30208587646484, + "epoch": 0.00048306764612796293, + "grad_norm": 0.03801026567816734, + "learning_rate": 3e-06, + "loss": 0.0004, + "reward": 0.2187500149011612, + "rewards/countdown_reward_func": 0.2187500149011612, + "step": 29 + }, + { + "epoch": 0.0004997251511668582, + "grad_norm": 0.0448690690100193, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 30 + }, + { + "epoch": 0.0005163826562057535, + "grad_norm": 0.04450896009802818, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 31 + }, + { + "epoch": 0.0005330401612446487, + "grad_norm": 0.03904105722904205, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 32 + }, + { + "completion_length": 231.84375762939453, + "epoch": 0.0005496976662835441, + "grad_norm": 0.03819483891129494, + "learning_rate": 3e-06, + "loss": -0.0232, + "reward": 0.21666668355464935, + "rewards/countdown_reward_func": 0.21666667610406876, + "step": 33 + }, + { + "epoch": 0.0005663551713224394, + "grad_norm": 0.03099250979721546, + "learning_rate": 3e-06, + "loss": 0.0232, + "step": 34 + }, + { + "epoch": 0.0005830126763613346, + "grad_norm": 0.03755773603916168, + "learning_rate": 3e-06, + "loss": -0.0232, + "step": 35 + }, + { + "epoch": 0.0005996701814002299, + "grad_norm": 0.03070709854364395, + "learning_rate": 3e-06, + "loss": 0.0231, + "step": 36 + }, + { + "completion_length": 238.21875762939453, + "epoch": 0.0006163276864391251, + "grad_norm": 0.036010462790727615, + "learning_rate": 3e-06, + "loss": 0.0281, + "reward": 0.21145834028720856, + "rewards/countdown_reward_func": 0.21145834028720856, + "step": 37 + }, + { + "epoch": 0.0006329851914780204, + "grad_norm": 0.03523487597703934, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 38 + }, + { + "epoch": 0.0006496426965169157, + "grad_norm": 0.032657328993082047, + "learning_rate": 3e-06, + "loss": 0.028, + "step": 39 + }, + { + "epoch": 0.000666300201555811, + "grad_norm": 0.03279102221131325, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 40 + }, + { + "completion_length": 231.71875762939453, + "epoch": 0.0006829577065947062, + "grad_norm": 0.04088591784238815, + "learning_rate": 3e-06, + "loss": -0.006, + "reward": 0.2343750074505806, + "rewards/countdown_reward_func": 0.234375, + "step": 41 + }, + { + "epoch": 0.0006996152116336015, + "grad_norm": 0.0444653145968914, + "learning_rate": 3e-06, + "loss": 0.0643, + "step": 42 + }, + { + "epoch": 0.0007162727166724967, + "grad_norm": 0.040506646037101746, + "learning_rate": 3e-06, + "loss": -0.006, + "step": 43 + }, + { + "epoch": 0.0007329302217113921, + "grad_norm": 0.04303446412086487, + "learning_rate": 3e-06, + "loss": 0.0642, + "step": 44 + }, + { + "completion_length": 235.98959350585938, + "epoch": 0.0007495877267502874, + "grad_norm": 0.030605530366301537, + "learning_rate": 3e-06, + "loss": 0.0034, + "reward": 0.2656250149011612, + "rewards/countdown_reward_func": 0.2656250074505806, + "step": 45 + }, + { + "epoch": 0.0007662452317891826, + "grad_norm": 0.04526748135685921, + "learning_rate": 3e-06, + "loss": 0.0321, + "step": 46 + }, + { + "epoch": 0.0007829027368280779, + "grad_norm": 0.03210637718439102, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 47 + }, + { + "epoch": 0.0007995602418669731, + "grad_norm": 0.04428977891802788, + "learning_rate": 3e-06, + "loss": 0.032, + "step": 48 + }, + { + "completion_length": 229.40625762939453, + "epoch": 0.0008162177469058685, + "grad_norm": 0.04538910463452339, + "learning_rate": 3e-06, + "loss": 0.0001, + "reward": 0.2760416865348816, + "rewards/countdown_reward_func": 0.2760416865348816, + "step": 49 + }, + { + "epoch": 0.0008328752519447637, + "grad_norm": 0.05924785137176514, + "learning_rate": 3e-06, + "loss": -0.0231, + "step": 50 + }, + { + "epoch": 0.000849532756983659, + "grad_norm": 0.0470881387591362, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 51 + }, + { + "epoch": 0.0008661902620225542, + "grad_norm": 0.05719375237822533, + "learning_rate": 3e-06, + "loss": -0.0234, + "step": 52 + }, + { + "completion_length": 235.2291717529297, + "epoch": 0.0008828477670614495, + "grad_norm": 0.03878246620297432, + "learning_rate": 3e-06, + "loss": 0.0195, + "reward": 0.255208358168602, + "rewards/countdown_reward_func": 0.2552083507180214, + "step": 53 + }, + { + "epoch": 0.0008995052721003448, + "grad_norm": 0.03656379505991936, + "learning_rate": 3e-06, + "loss": 0.0342, + "step": 54 + }, + { + "epoch": 0.0009161627771392401, + "grad_norm": 0.039214931428432465, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 55 + }, + { + "epoch": 0.0009328202821781354, + "grad_norm": 0.036865487694740295, + "learning_rate": 3e-06, + "loss": 0.0341, + "step": 56 + }, + { + "completion_length": 229.4791717529297, + "epoch": 0.0009494777872170306, + "grad_norm": 0.04228115826845169, + "learning_rate": 3e-06, + "loss": 0.0111, + "reward": 0.22812500596046448, + "rewards/countdown_reward_func": 0.22812500596046448, + "step": 57 + }, + { + "epoch": 0.0009661352922559259, + "grad_norm": 0.0296314749866724, + "learning_rate": 3e-06, + "loss": -0.0234, + "step": 58 + }, + { + "epoch": 0.0009827927972948212, + "grad_norm": 0.03934413194656372, + "learning_rate": 3e-06, + "loss": 0.0108, + "step": 59 + }, + { + "epoch": 0.0009994503023337165, + "grad_norm": 0.029464874416589737, + "learning_rate": 3e-06, + "loss": -0.0235, + "step": 60 + }, + { + "completion_length": 221.70833587646484, + "epoch": 0.0010161078073726117, + "grad_norm": 0.034587737172842026, + "learning_rate": 3e-06, + "loss": 0.0247, + "reward": 0.24166668206453323, + "rewards/countdown_reward_func": 0.24166668206453323, + "step": 61 + }, + { + "epoch": 0.001032765312411507, + "grad_norm": 0.03653142973780632, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 62 + }, + { + "epoch": 0.0010494228174504022, + "grad_norm": 0.034984149038791656, + "learning_rate": 3e-06, + "loss": 0.0246, + "step": 63 + }, + { + "epoch": 0.0010660803224892975, + "grad_norm": 0.03449957072734833, + "learning_rate": 3e-06, + "loss": 0.0308, + "step": 64 + }, + { + "completion_length": 217.90625, + "epoch": 0.0010827378275281927, + "grad_norm": 0.04954748600721359, + "learning_rate": 3e-06, + "loss": 0.0459, + "reward": 0.24895835667848587, + "rewards/countdown_reward_func": 0.24895835667848587, + "step": 65 + }, + { + "epoch": 0.0010993953325670882, + "grad_norm": 0.05431276187300682, + "learning_rate": 3e-06, + "loss": 0.111, + "step": 66 + }, + { + "epoch": 0.0011160528376059835, + "grad_norm": 0.050470322370529175, + "learning_rate": 3e-06, + "loss": 0.0457, + "step": 67 + }, + { + "epoch": 0.0011327103426448787, + "grad_norm": 0.05538534000515938, + "learning_rate": 3e-06, + "loss": 0.1112, + "step": 68 + }, + { + "completion_length": 218.90625762939453, + "epoch": 0.001149367847683774, + "grad_norm": 0.045245472341775894, + "learning_rate": 3e-06, + "loss": 0.015, + "reward": 0.2864583432674408, + "rewards/countdown_reward_func": 0.2864583432674408, + "step": 69 + }, + { + "epoch": 0.0011660253527226692, + "grad_norm": 0.05130685493350029, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 70 + }, + { + "epoch": 0.0011826828577615645, + "grad_norm": 0.050618063658475876, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 71 + }, + { + "epoch": 0.0011993403628004597, + "grad_norm": 0.05025541037321091, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 72 + }, + { + "completion_length": 224.21875762939453, + "epoch": 0.001215997867839355, + "grad_norm": 0.026713810861110687, + "learning_rate": 3e-06, + "loss": 0.0353, + "reward": 0.18958333879709244, + "rewards/countdown_reward_func": 0.18958333879709244, + "step": 73 + }, + { + "epoch": 0.0012326553728782502, + "grad_norm": 0.03775971755385399, + "learning_rate": 3e-06, + "loss": 0.0534, + "step": 74 + }, + { + "epoch": 0.0012493128779171455, + "grad_norm": 0.024764789268374443, + "learning_rate": 3e-06, + "loss": 0.0354, + "step": 75 + }, + { + "epoch": 0.0012659703829560407, + "grad_norm": 0.03506957367062569, + "learning_rate": 3e-06, + "loss": 0.0533, + "step": 76 + }, + { + "completion_length": 230.8854217529297, + "epoch": 0.0012826278879949362, + "grad_norm": 0.03948931396007538, + "learning_rate": 3e-06, + "loss": -0.0227, + "reward": 0.2510416880249977, + "rewards/countdown_reward_func": 0.2510416880249977, + "step": 77 + }, + { + "epoch": 0.0012992853930338315, + "grad_norm": 0.036449041217565536, + "learning_rate": 3e-06, + "loss": 0.0201, + "step": 78 + }, + { + "epoch": 0.0013159428980727267, + "grad_norm": 0.03829808160662651, + "learning_rate": 3e-06, + "loss": -0.0227, + "step": 79 + }, + { + "epoch": 0.001332600403111622, + "grad_norm": 0.03440757840871811, + "learning_rate": 3e-06, + "loss": 0.0198, + "step": 80 + }, + { + "completion_length": 231.8541717529297, + "epoch": 0.0013492579081505172, + "grad_norm": 0.037220992147922516, + "learning_rate": 3e-06, + "loss": 0.0102, + "reward": 0.2770833522081375, + "rewards/countdown_reward_func": 0.2770833298563957, + "step": 81 + }, + { + "epoch": 0.0013659154131894125, + "grad_norm": 0.038877204060554504, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 82 + }, + { + "epoch": 0.0013825729182283077, + "grad_norm": 0.039478905498981476, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 83 + }, + { + "epoch": 0.001399230423267203, + "grad_norm": 0.03837360814213753, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 84 + }, + { + "completion_length": 221.48958587646484, + "epoch": 0.0014158879283060982, + "grad_norm": 0.03794768080115318, + "learning_rate": 3e-06, + "loss": -0.047, + "reward": 0.19062501192092896, + "rewards/countdown_reward_func": 0.19062499701976776, + "step": 85 + }, + { + "epoch": 0.0014325454333449935, + "grad_norm": 0.05210740491747856, + "learning_rate": 3e-06, + "loss": -0.0568, + "step": 86 + }, + { + "epoch": 0.001449202938383889, + "grad_norm": 0.03809172660112381, + "learning_rate": 3e-06, + "loss": -0.047, + "step": 87 + }, + { + "epoch": 0.0014658604434227842, + "grad_norm": 0.05146374553442001, + "learning_rate": 3e-06, + "loss": -0.0569, + "step": 88 + }, + { + "completion_length": 230.95833587646484, + "epoch": 0.0014825179484616795, + "grad_norm": 0.04563918709754944, + "learning_rate": 3e-06, + "loss": -0.001, + "reward": 0.2562500238418579, + "rewards/countdown_reward_func": 0.2562500163912773, + "step": 89 + }, + { + "epoch": 0.0014991754535005747, + "grad_norm": 0.039866987615823746, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 90 + }, + { + "epoch": 0.00151583295853947, + "grad_norm": 0.04423011094331741, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 91 + }, + { + "epoch": 0.0015324904635783652, + "grad_norm": 0.04026401415467262, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 92 + }, + { + "completion_length": 227.90625, + "epoch": 0.0015491479686172605, + "grad_norm": 0.033889852464199066, + "learning_rate": 3e-06, + "loss": 0.0322, + "reward": 0.2666666805744171, + "rewards/countdown_reward_func": 0.2666666731238365, + "step": 93 + }, + { + "epoch": 0.0015658054736561557, + "grad_norm": 0.043229758739471436, + "learning_rate": 3e-06, + "loss": 0.0321, + "step": 94 + }, + { + "epoch": 0.001582462978695051, + "grad_norm": 0.033511530607938766, + "learning_rate": 3e-06, + "loss": 0.0323, + "step": 95 + }, + { + "epoch": 0.0015991204837339462, + "grad_norm": 0.04463406279683113, + "learning_rate": 3e-06, + "loss": 0.0318, + "step": 96 + }, + { + "completion_length": 229.23958587646484, + "epoch": 0.0016157779887728417, + "grad_norm": 0.04746805503964424, + "learning_rate": 3e-06, + "loss": 0.0648, + "reward": 0.22604165971279144, + "rewards/countdown_reward_func": 0.22604165226221085, + "step": 97 + }, + { + "epoch": 0.001632435493811737, + "grad_norm": 0.04102989286184311, + "learning_rate": 3e-06, + "loss": 0.0005, + "step": 98 + }, + { + "epoch": 0.0016490929988506322, + "grad_norm": 0.05134953558444977, + "learning_rate": 3e-06, + "loss": 0.0645, + "step": 99 + }, + { + "epoch": 0.0016657505038895275, + "grad_norm": 0.044511985033750534, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 100 + }, + { + "completion_length": 224.2291717529297, + "epoch": 0.0016824080089284227, + "grad_norm": 0.04039066284894943, + "learning_rate": 3e-06, + "loss": 0.0453, + "reward": 0.21666669100522995, + "rewards/countdown_reward_func": 0.21666669100522995, + "step": 101 + }, + { + "epoch": 0.001699065513967318, + "grad_norm": 0.044648367911577225, + "learning_rate": 3e-06, + "loss": 0.0594, + "step": 102 + }, + { + "epoch": 0.0017157230190062132, + "grad_norm": 0.039758797734975815, + "learning_rate": 3e-06, + "loss": 0.045, + "step": 103 + }, + { + "epoch": 0.0017323805240451085, + "grad_norm": 0.042167823761701584, + "learning_rate": 3e-06, + "loss": 0.059, + "step": 104 + }, + { + "completion_length": 220.5104217529297, + "epoch": 0.0017490380290840037, + "grad_norm": 0.0707293152809143, + "learning_rate": 3e-06, + "loss": -0.0281, + "reward": 0.20729167759418488, + "rewards/countdown_reward_func": 0.20729167759418488, + "step": 105 + }, + { + "epoch": 0.001765695534122899, + "grad_norm": 0.05888892337679863, + "learning_rate": 3e-06, + "loss": -0.0391, + "step": 106 + }, + { + "epoch": 0.0017823530391617944, + "grad_norm": 0.05352797731757164, + "learning_rate": 3e-06, + "loss": -0.0284, + "step": 107 + }, + { + "epoch": 0.0017990105442006897, + "grad_norm": 0.05991983041167259, + "learning_rate": 3e-06, + "loss": -0.0398, + "step": 108 + }, + { + "completion_length": 225.1666717529297, + "epoch": 0.001815668049239585, + "grad_norm": 0.03769804909825325, + "learning_rate": 3e-06, + "loss": 0.0349, + "reward": 0.20416667312383652, + "rewards/countdown_reward_func": 0.20416667312383652, + "step": 109 + }, + { + "epoch": 0.0018323255542784802, + "grad_norm": 0.04896159842610359, + "learning_rate": 3e-06, + "loss": 0.0252, + "step": 110 + }, + { + "epoch": 0.0018489830593173755, + "grad_norm": 0.038564011454582214, + "learning_rate": 3e-06, + "loss": 0.0346, + "step": 111 + }, + { + "epoch": 0.0018656405643562707, + "grad_norm": 0.04127072915434837, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 112 + }, + { + "completion_length": 216.4479217529297, + "epoch": 0.001882298069395166, + "grad_norm": 0.044385042041540146, + "learning_rate": 3e-06, + "loss": 0.0334, + "reward": 0.2760417014360428, + "rewards/countdown_reward_func": 0.276041679084301, + "step": 113 + }, + { + "epoch": 0.0018989555744340612, + "grad_norm": 0.046408314257860184, + "learning_rate": 3e-06, + "loss": 0.0356, + "step": 114 + }, + { + "epoch": 0.0019156130794729565, + "grad_norm": 0.04861271008849144, + "learning_rate": 3e-06, + "loss": 0.0332, + "step": 115 + }, + { + "epoch": 0.0019322705845118517, + "grad_norm": 0.04310572147369385, + "learning_rate": 3e-06, + "loss": 0.0353, + "step": 116 + }, + { + "completion_length": 219.08333587646484, + "epoch": 0.0019489280895507472, + "grad_norm": 0.048188716173172, + "learning_rate": 3e-06, + "loss": 0.0281, + "reward": 0.25729168206453323, + "rewards/countdown_reward_func": 0.25729168206453323, + "step": 117 + }, + { + "epoch": 0.0019655855945896424, + "grad_norm": 0.058333028107881546, + "learning_rate": 3e-06, + "loss": -0.0197, + "step": 118 + }, + { + "epoch": 0.0019822430996285375, + "grad_norm": 0.04716461896896362, + "learning_rate": 3e-06, + "loss": 0.0276, + "step": 119 + }, + { + "epoch": 0.001998900604667433, + "grad_norm": 0.05831597000360489, + "learning_rate": 3e-06, + "loss": -0.0198, + "step": 120 + }, + { + "completion_length": 217.33333587646484, + "epoch": 0.002015558109706328, + "grad_norm": 0.05323164165019989, + "learning_rate": 3e-06, + "loss": 0.019, + "reward": 0.29479168355464935, + "rewards/countdown_reward_func": 0.29479168355464935, + "step": 121 + }, + { + "epoch": 0.0020322156147452235, + "grad_norm": 0.05002224072813988, + "learning_rate": 3e-06, + "loss": -0.0001, + "step": 122 + }, + { + "epoch": 0.002048873119784119, + "grad_norm": 0.05228053405880928, + "learning_rate": 3e-06, + "loss": 0.0187, + "step": 123 + }, + { + "epoch": 0.002065530624823014, + "grad_norm": 0.05129559710621834, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 124 + }, + { + "completion_length": 217.0416717529297, + "epoch": 0.0020821881298619094, + "grad_norm": 0.05084940046072006, + "learning_rate": 3e-06, + "loss": 0.0237, + "reward": 0.265625, + "rewards/countdown_reward_func": 0.265625, + "step": 125 + }, + { + "epoch": 0.0020988456349008045, + "grad_norm": 0.043954841792583466, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 126 + }, + { + "epoch": 0.0021155031399397, + "grad_norm": 0.05043468996882439, + "learning_rate": 3e-06, + "loss": 0.0236, + "step": 127 + }, + { + "epoch": 0.002132160644978595, + "grad_norm": 0.04735936224460602, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 128 + }, + { + "completion_length": 209.90625762939453, + "epoch": 0.0021488181500174904, + "grad_norm": 0.058306675404310226, + "learning_rate": 3e-06, + "loss": -0.0212, + "reward": 0.27291667461395264, + "rewards/countdown_reward_func": 0.27291667461395264, + "step": 129 + }, + { + "epoch": 0.0021654756550563855, + "grad_norm": 0.07208102941513062, + "learning_rate": 3e-06, + "loss": -0.0275, + "step": 130 + }, + { + "epoch": 0.002182133160095281, + "grad_norm": 0.05696512758731842, + "learning_rate": 3e-06, + "loss": -0.0215, + "step": 131 + }, + { + "epoch": 0.0021987906651341764, + "grad_norm": 0.07473631203174591, + "learning_rate": 3e-06, + "loss": -0.0286, + "step": 132 + }, + { + "completion_length": 206.8541717529297, + "epoch": 0.0022154481701730715, + "grad_norm": 0.06164620444178581, + "learning_rate": 3e-06, + "loss": 0.0387, + "reward": 0.26354168355464935, + "rewards/countdown_reward_func": 0.26354167610406876, + "step": 133 + }, + { + "epoch": 0.002232105675211967, + "grad_norm": 0.07755275070667267, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 134 + }, + { + "epoch": 0.002248763180250862, + "grad_norm": 0.06192132830619812, + "learning_rate": 3e-06, + "loss": 0.0381, + "step": 135 + }, + { + "epoch": 0.0022654206852897574, + "grad_norm": 0.07948354631662369, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 136 + }, + { + "completion_length": 197.75000762939453, + "epoch": 0.0022820781903286525, + "grad_norm": 0.050047654658555984, + "learning_rate": 3e-06, + "loss": 0.0678, + "reward": 0.2614583522081375, + "rewards/countdown_reward_func": 0.2614583298563957, + "step": 137 + }, + { + "epoch": 0.002298735695367548, + "grad_norm": 0.06346608698368073, + "learning_rate": 3e-06, + "loss": -0.0241, + "step": 138 + }, + { + "epoch": 0.002315393200406443, + "grad_norm": 0.053687598556280136, + "learning_rate": 3e-06, + "loss": 0.0673, + "step": 139 + }, + { + "epoch": 0.0023320507054453384, + "grad_norm": 0.05779948830604553, + "learning_rate": 3e-06, + "loss": -0.0245, + "step": 140 + }, + { + "completion_length": 198.7604217529297, + "epoch": 0.0023487082104842335, + "grad_norm": 0.06150375306606293, + "learning_rate": 3e-06, + "loss": -0.0449, + "reward": 0.16458334773778915, + "rewards/countdown_reward_func": 0.16458334028720856, + "step": 141 + }, + { + "epoch": 0.002365365715523129, + "grad_norm": 0.04769235476851463, + "learning_rate": 3e-06, + "loss": -0.0837, + "step": 142 + }, + { + "epoch": 0.0023820232205620244, + "grad_norm": 0.05787502974271774, + "learning_rate": 3e-06, + "loss": -0.0454, + "step": 143 + }, + { + "epoch": 0.0023986807256009194, + "grad_norm": 0.04905227571725845, + "learning_rate": 3e-06, + "loss": -0.0845, + "step": 144 + }, + { + "completion_length": 203.83333587646484, + "epoch": 0.002415338230639815, + "grad_norm": 0.0563383512198925, + "learning_rate": 3e-06, + "loss": 0.0202, + "reward": 0.3500000089406967, + "rewards/countdown_reward_func": 0.3500000089406967, + "step": 145 + }, + { + "epoch": 0.00243199573567871, + "grad_norm": 0.06317299604415894, + "learning_rate": 3e-06, + "loss": 0.041, + "step": 146 + }, + { + "epoch": 0.0024486532407176054, + "grad_norm": 0.058462101966142654, + "learning_rate": 3e-06, + "loss": 0.0193, + "step": 147 + }, + { + "epoch": 0.0024653107457565005, + "grad_norm": 0.09473595023155212, + "learning_rate": 3e-06, + "loss": 0.04, + "step": 148 + }, + { + "completion_length": 182.52083587646484, + "epoch": 0.002481968250795396, + "grad_norm": 0.08986622095108032, + "learning_rate": 3e-06, + "loss": -0.1184, + "reward": 0.31979167461395264, + "rewards/countdown_reward_func": 0.31979165971279144, + "step": 149 + }, + { + "epoch": 0.002498625755834291, + "grad_norm": 0.07035745680332184, + "learning_rate": 3e-06, + "loss": -0.069, + "step": 150 + }, + { + "epoch": 0.0025152832608731864, + "grad_norm": 0.09284700453281403, + "learning_rate": 3e-06, + "loss": -0.12, + "step": 151 + }, + { + "epoch": 0.0025319407659120815, + "grad_norm": 0.07525934278964996, + "learning_rate": 3e-06, + "loss": -0.07, + "step": 152 + }, + { + "completion_length": 162.36458587646484, + "epoch": 0.002548598270950977, + "grad_norm": 0.06289956718683243, + "learning_rate": 3e-06, + "loss": 0.14, + "reward": 0.31041671335697174, + "rewards/countdown_reward_func": 0.31041669845581055, + "step": 153 + }, + { + "epoch": 0.0025652557759898724, + "grad_norm": 0.09306161850690842, + "learning_rate": 3e-06, + "loss": 0.1543, + "step": 154 + }, + { + "epoch": 0.0025819132810287674, + "grad_norm": 0.06602126359939575, + "learning_rate": 3e-06, + "loss": 0.1394, + "step": 155 + }, + { + "epoch": 0.002598570786067663, + "grad_norm": 0.09522085636854172, + "learning_rate": 3e-06, + "loss": 0.1542, + "step": 156 + }, + { + "completion_length": 169.02083587646484, + "epoch": 0.002615228291106558, + "grad_norm": 0.050038523972034454, + "learning_rate": 3e-06, + "loss": 0.0323, + "reward": 0.2083333358168602, + "rewards/countdown_reward_func": 0.2083333358168602, + "step": 157 + }, + { + "epoch": 0.0026318857961454534, + "grad_norm": 0.06468649208545685, + "learning_rate": 3e-06, + "loss": -0.0958, + "step": 158 + }, + { + "epoch": 0.0026485433011843485, + "grad_norm": 0.0492979995906353, + "learning_rate": 3e-06, + "loss": 0.0318, + "step": 159 + }, + { + "epoch": 0.002665200806223244, + "grad_norm": 0.06833986937999725, + "learning_rate": 3e-06, + "loss": -0.0962, + "step": 160 + }, + { + "completion_length": 162.77084350585938, + "epoch": 0.002681858311262139, + "grad_norm": 0.057609107345342636, + "learning_rate": 3e-06, + "loss": 0.0039, + "reward": 0.3218750059604645, + "rewards/countdown_reward_func": 0.3218749910593033, + "step": 161 + }, + { + "epoch": 0.0026985158163010344, + "grad_norm": 0.06060798093676567, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 162 + }, + { + "epoch": 0.00271517332133993, + "grad_norm": 0.06278552114963531, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 163 + }, + { + "epoch": 0.002731830826378825, + "grad_norm": 0.062433768063783646, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 164 + }, + { + "completion_length": 165.8541717529297, + "epoch": 0.0027484883314177204, + "grad_norm": 0.1078440472483635, + "learning_rate": 3e-06, + "loss": 0.0303, + "reward": 0.3031250089406967, + "rewards/countdown_reward_func": 0.3031250089406967, + "step": 165 + }, + { + "epoch": 0.0027651458364566154, + "grad_norm": 0.05740131065249443, + "learning_rate": 3e-06, + "loss": 0.0344, + "step": 166 + }, + { + "epoch": 0.002781803341495511, + "grad_norm": 0.10121332854032516, + "learning_rate": 3e-06, + "loss": 0.0298, + "step": 167 + }, + { + "epoch": 0.002798460846534406, + "grad_norm": 0.0628281831741333, + "learning_rate": 3e-06, + "loss": 0.0339, + "step": 168 + }, + { + "completion_length": 148.84375, + "epoch": 0.0028151183515733014, + "grad_norm": 0.06477784365415573, + "learning_rate": 3e-06, + "loss": 0.0372, + "reward": 0.2927083522081375, + "rewards/countdown_reward_func": 0.2927083522081375, + "step": 169 + }, + { + "epoch": 0.0028317758566121965, + "grad_norm": 0.06656663119792938, + "learning_rate": 3e-06, + "loss": 0.0393, + "step": 170 + }, + { + "epoch": 0.002848433361651092, + "grad_norm": 0.06222836300730705, + "learning_rate": 3e-06, + "loss": 0.0369, + "step": 171 + }, + { + "epoch": 0.002865090866689987, + "grad_norm": 0.06518974155187607, + "learning_rate": 3e-06, + "loss": 0.0388, + "step": 172 + }, + { + "completion_length": 139.3541717529297, + "epoch": 0.0028817483717288824, + "grad_norm": 0.06834086775779724, + "learning_rate": 3e-06, + "loss": 0.0375, + "reward": 0.26354166865348816, + "rewards/countdown_reward_func": 0.26354165375232697, + "step": 173 + }, + { + "epoch": 0.002898405876767778, + "grad_norm": 0.05560566857457161, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 174 + }, + { + "epoch": 0.002915063381806673, + "grad_norm": 0.06890060007572174, + "learning_rate": 3e-06, + "loss": 0.0371, + "step": 175 + }, + { + "epoch": 0.0029317208868455684, + "grad_norm": 0.05870360881090164, + "learning_rate": 3e-06, + "loss": 0.011, + "step": 176 + }, + { + "completion_length": 150.78125, + "epoch": 0.0029483783918844634, + "grad_norm": 0.05513399839401245, + "learning_rate": 3e-06, + "loss": -0.0187, + "reward": 0.23541668057441711, + "rewards/countdown_reward_func": 0.23541668057441711, + "step": 177 + }, + { + "epoch": 0.002965035896923359, + "grad_norm": 0.06441198289394379, + "learning_rate": 3e-06, + "loss": 0.0567, + "step": 178 + }, + { + "epoch": 0.002981693401962254, + "grad_norm": 0.05659981071949005, + "learning_rate": 3e-06, + "loss": -0.0189, + "step": 179 + }, + { + "epoch": 0.0029983509070011494, + "grad_norm": 0.0653940811753273, + "learning_rate": 3e-06, + "loss": 0.0566, + "step": 180 + }, + { + "completion_length": 145.09375762939453, + "epoch": 0.0030150084120400445, + "grad_norm": 0.09972866624593735, + "learning_rate": 3e-06, + "loss": 0.0542, + "reward": 0.3135416805744171, + "rewards/countdown_reward_func": 0.3135416656732559, + "step": 181 + }, + { + "epoch": 0.00303166591707894, + "grad_norm": 0.09707862883806229, + "learning_rate": 3e-06, + "loss": 0.019, + "step": 182 + }, + { + "epoch": 0.0030483234221178354, + "grad_norm": 0.10160347074270248, + "learning_rate": 3e-06, + "loss": 0.0539, + "step": 183 + }, + { + "epoch": 0.0030649809271567304, + "grad_norm": 0.101364366710186, + "learning_rate": 3e-06, + "loss": 0.0184, + "step": 184 + }, + { + "completion_length": 128.09375381469727, + "epoch": 0.003081638432195626, + "grad_norm": 0.046108428388834, + "learning_rate": 3e-06, + "loss": -0.0213, + "reward": 0.2614583373069763, + "rewards/countdown_reward_func": 0.2614583373069763, + "step": 185 + }, + { + "epoch": 0.003098295937234521, + "grad_norm": 0.06450662016868591, + "learning_rate": 3e-06, + "loss": -0.0576, + "step": 186 + }, + { + "epoch": 0.0031149534422734164, + "grad_norm": 0.04343460500240326, + "learning_rate": 3e-06, + "loss": -0.0215, + "step": 187 + }, + { + "epoch": 0.0031316109473123114, + "grad_norm": 0.05523299053311348, + "learning_rate": 3e-06, + "loss": -0.0578, + "step": 188 + }, + { + "completion_length": 165.27084350585938, + "epoch": 0.003148268452351207, + "grad_norm": 0.06990750133991241, + "learning_rate": 3e-06, + "loss": 0.0157, + "reward": 0.3020833432674408, + "rewards/countdown_reward_func": 0.3020833283662796, + "step": 189 + }, + { + "epoch": 0.003164925957390102, + "grad_norm": 0.07389772683382034, + "learning_rate": 3e-06, + "loss": 0.0244, + "step": 190 + }, + { + "epoch": 0.0031815834624289974, + "grad_norm": 0.06964443624019623, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 191 + }, + { + "epoch": 0.0031982409674678925, + "grad_norm": 0.07812868803739548, + "learning_rate": 3e-06, + "loss": 0.0239, + "step": 192 + }, + { + "completion_length": 137.96875, + "epoch": 0.003214898472506788, + "grad_norm": 0.08959627896547318, + "learning_rate": 3e-06, + "loss": -0.0172, + "reward": 0.3750000149011612, + "rewards/countdown_reward_func": 0.375, + "step": 193 + }, + { + "epoch": 0.0032315559775456834, + "grad_norm": 0.08820956200361252, + "learning_rate": 3e-06, + "loss": -0.0208, + "step": 194 + }, + { + "epoch": 0.0032482134825845784, + "grad_norm": 0.10535451769828796, + "learning_rate": 3e-06, + "loss": -0.0175, + "step": 195 + }, + { + "epoch": 0.003264870987623474, + "grad_norm": 0.09000883251428604, + "learning_rate": 3e-06, + "loss": -0.0211, + "step": 196 + }, + { + "completion_length": 147.9791717529297, + "epoch": 0.003281528492662369, + "grad_norm": 0.09463846683502197, + "learning_rate": 3e-06, + "loss": 0.1243, + "reward": 0.2708333432674408, + "rewards/countdown_reward_func": 0.2708333358168602, + "step": 197 + }, + { + "epoch": 0.0032981859977012644, + "grad_norm": 0.0821266770362854, + "learning_rate": 3e-06, + "loss": 0.0615, + "step": 198 + }, + { + "epoch": 0.0033148435027401594, + "grad_norm": 0.09228460490703583, + "learning_rate": 3e-06, + "loss": 0.124, + "step": 199 + }, + { + "epoch": 0.003331501007779055, + "grad_norm": 0.07375989854335785, + "learning_rate": 3e-06, + "loss": 0.0607, + "step": 200 + }, + { + "completion_length": 136.39583587646484, + "epoch": 0.00334815851281795, + "grad_norm": 0.058074142783880234, + "learning_rate": 3e-06, + "loss": 0.0778, + "reward": 0.30000001937150955, + "rewards/countdown_reward_func": 0.30000001937150955, + "step": 201 + }, + { + "epoch": 0.0033648160178568454, + "grad_norm": 0.06574033945798874, + "learning_rate": 3e-06, + "loss": -0.0168, + "step": 202 + }, + { + "epoch": 0.003381473522895741, + "grad_norm": 0.06161656603217125, + "learning_rate": 3e-06, + "loss": 0.0775, + "step": 203 + }, + { + "epoch": 0.003398131027934636, + "grad_norm": 0.0645272433757782, + "learning_rate": 3e-06, + "loss": -0.0169, + "step": 204 + }, + { + "completion_length": 150.37500762939453, + "epoch": 0.0034147885329735314, + "grad_norm": 0.09019534289836884, + "learning_rate": 3e-06, + "loss": 0.1341, + "reward": 0.3489583432674408, + "rewards/countdown_reward_func": 0.3489583283662796, + "step": 205 + }, + { + "epoch": 0.0034314460380124264, + "grad_norm": 0.06654354929924011, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 206 + }, + { + "epoch": 0.003448103543051322, + "grad_norm": 0.08852852135896683, + "learning_rate": 3e-06, + "loss": 0.1339, + "step": 207 + }, + { + "epoch": 0.003464761048090217, + "grad_norm": 0.07039018720388412, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 208 + }, + { + "completion_length": 157.03125762939453, + "epoch": 0.0034814185531291124, + "grad_norm": 0.07062125205993652, + "learning_rate": 3e-06, + "loss": 0.0731, + "reward": 0.33020836114883423, + "rewards/countdown_reward_func": 0.33020833134651184, + "step": 209 + }, + { + "epoch": 0.0034980760581680074, + "grad_norm": 0.09423331916332245, + "learning_rate": 3e-06, + "loss": 0.0646, + "step": 210 + }, + { + "epoch": 0.003514733563206903, + "grad_norm": 0.07079795002937317, + "learning_rate": 3e-06, + "loss": 0.0728, + "step": 211 + }, + { + "epoch": 0.003531391068245798, + "grad_norm": 0.06549128144979477, + "learning_rate": 3e-06, + "loss": 0.0644, + "step": 212 + }, + { + "completion_length": 144.92708587646484, + "epoch": 0.0035480485732846934, + "grad_norm": 0.07345682382583618, + "learning_rate": 3e-06, + "loss": -0.0488, + "reward": 0.29062502086162567, + "rewards/countdown_reward_func": 0.2906250059604645, + "step": 213 + }, + { + "epoch": 0.003564706078323589, + "grad_norm": 0.09150245040655136, + "learning_rate": 3e-06, + "loss": -0.0496, + "step": 214 + }, + { + "epoch": 0.003581363583362484, + "grad_norm": 0.07555064558982849, + "learning_rate": 3e-06, + "loss": -0.0492, + "step": 215 + }, + { + "epoch": 0.0035980210884013794, + "grad_norm": 0.08993101865053177, + "learning_rate": 3e-06, + "loss": -0.0499, + "step": 216 + }, + { + "completion_length": 138.31250762939453, + "epoch": 0.0036146785934402744, + "grad_norm": 0.05847645550966263, + "learning_rate": 3e-06, + "loss": -0.0095, + "reward": 0.25937502086162567, + "rewards/countdown_reward_func": 0.25937502086162567, + "step": 217 + }, + { + "epoch": 0.00363133609847917, + "grad_norm": 0.06861383467912674, + "learning_rate": 3e-06, + "loss": -0.054, + "step": 218 + }, + { + "epoch": 0.003647993603518065, + "grad_norm": 0.057325560599565506, + "learning_rate": 3e-06, + "loss": -0.0097, + "step": 219 + }, + { + "epoch": 0.0036646511085569604, + "grad_norm": 0.06771288067102432, + "learning_rate": 3e-06, + "loss": -0.0545, + "step": 220 + }, + { + "completion_length": 127.51041793823242, + "epoch": 0.0036813086135958554, + "grad_norm": 0.07580415159463882, + "learning_rate": 3e-06, + "loss": -0.0168, + "reward": 0.2604166716337204, + "rewards/countdown_reward_func": 0.2604166716337204, + "step": 221 + }, + { + "epoch": 0.003697966118634751, + "grad_norm": 0.07672726362943649, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 222 + }, + { + "epoch": 0.0037146236236736464, + "grad_norm": 0.07794097810983658, + "learning_rate": 3e-06, + "loss": -0.0175, + "step": 223 + }, + { + "epoch": 0.0037312811287125414, + "grad_norm": 0.07431076467037201, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 224 + }, + { + "completion_length": 115.61458587646484, + "epoch": 0.003747938633751437, + "grad_norm": 0.07458362728357315, + "learning_rate": 3e-06, + "loss": -0.058, + "reward": 0.31979167461395264, + "rewards/countdown_reward_func": 0.31979167461395264, + "step": 225 + }, + { + "epoch": 0.003764596138790332, + "grad_norm": 0.06462802737951279, + "learning_rate": 3e-06, + "loss": 0.034, + "step": 226 + }, + { + "epoch": 0.0037812536438292274, + "grad_norm": 0.07902900874614716, + "learning_rate": 3e-06, + "loss": -0.0586, + "step": 227 + }, + { + "epoch": 0.0037979111488681224, + "grad_norm": 0.059599798172712326, + "learning_rate": 3e-06, + "loss": 0.0333, + "step": 228 + }, + { + "completion_length": 117.63541793823242, + "epoch": 0.003814568653907018, + "grad_norm": 0.12277258932590485, + "learning_rate": 3e-06, + "loss": 0.1314, + "reward": 0.3395833671092987, + "rewards/countdown_reward_func": 0.3395833522081375, + "step": 229 + }, + { + "epoch": 0.003831226158945913, + "grad_norm": 0.12224022299051285, + "learning_rate": 3e-06, + "loss": 0.1504, + "step": 230 + }, + { + "epoch": 0.0038478836639848084, + "grad_norm": 0.1264142245054245, + "learning_rate": 3e-06, + "loss": 0.1315, + "step": 231 + }, + { + "epoch": 0.0038645411690237034, + "grad_norm": 0.13444413244724274, + "learning_rate": 3e-06, + "loss": 0.1499, + "step": 232 + }, + { + "completion_length": 116.375, + "epoch": 0.003881198674062599, + "grad_norm": 0.06786767393350601, + "learning_rate": 3e-06, + "loss": 0.002, + "reward": 0.2718750387430191, + "rewards/countdown_reward_func": 0.2718750163912773, + "step": 233 + }, + { + "epoch": 0.0038978561791014944, + "grad_norm": 0.05543841794133186, + "learning_rate": 3e-06, + "loss": 0.0343, + "step": 234 + }, + { + "epoch": 0.00391451368414039, + "grad_norm": 0.0755564421415329, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 235 + }, + { + "epoch": 0.003931171189179285, + "grad_norm": 0.05361182987689972, + "learning_rate": 3e-06, + "loss": 0.0343, + "step": 236 + }, + { + "completion_length": 126.92708587646484, + "epoch": 0.00394782869421818, + "grad_norm": 0.07700058817863464, + "learning_rate": 3e-06, + "loss": -0.039, + "reward": 0.34791669249534607, + "rewards/countdown_reward_func": 0.3479166775941849, + "step": 237 + }, + { + "epoch": 0.003964486199257075, + "grad_norm": 0.0968509316444397, + "learning_rate": 3e-06, + "loss": -0.0338, + "step": 238 + }, + { + "epoch": 0.003981143704295971, + "grad_norm": 0.07549140602350235, + "learning_rate": 3e-06, + "loss": -0.039, + "step": 239 + }, + { + "epoch": 0.003997801209334866, + "grad_norm": 0.10092661529779434, + "learning_rate": 3e-06, + "loss": -0.0341, + "step": 240 + }, + { + "completion_length": 106.69791793823242, + "epoch": 0.004014458714373761, + "grad_norm": 0.06467276066541672, + "learning_rate": 3e-06, + "loss": -0.0163, + "reward": 0.4052083343267441, + "rewards/countdown_reward_func": 0.4052083343267441, + "step": 241 + }, + { + "epoch": 0.004031116219412656, + "grad_norm": 0.06268125772476196, + "learning_rate": 3e-06, + "loss": 0.042, + "step": 242 + }, + { + "epoch": 0.004047773724451552, + "grad_norm": 0.06319697201251984, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 243 + }, + { + "epoch": 0.004064431229490447, + "grad_norm": 0.0586119182407856, + "learning_rate": 3e-06, + "loss": 0.042, + "step": 244 + }, + { + "completion_length": 116.43750381469727, + "epoch": 0.004081088734529342, + "grad_norm": 0.08322522789239883, + "learning_rate": 3e-06, + "loss": 0.0643, + "reward": 0.38437503576278687, + "rewards/countdown_reward_func": 0.3843750059604645, + "step": 245 + }, + { + "epoch": 0.004097746239568238, + "grad_norm": 0.0803133174777031, + "learning_rate": 3e-06, + "loss": 0.0949, + "step": 246 + }, + { + "epoch": 0.004114403744607133, + "grad_norm": 0.09148529171943665, + "learning_rate": 3e-06, + "loss": 0.0639, + "step": 247 + }, + { + "epoch": 0.004131061249646028, + "grad_norm": 0.0862363874912262, + "learning_rate": 3e-06, + "loss": 0.0947, + "step": 248 + }, + { + "completion_length": 114.56250381469727, + "epoch": 0.004147718754684923, + "grad_norm": 0.06139526516199112, + "learning_rate": 3e-06, + "loss": 0.0166, + "reward": 0.29062502086162567, + "rewards/countdown_reward_func": 0.29062502086162567, + "step": 249 + }, + { + "epoch": 0.004164376259723819, + "grad_norm": 0.07488217949867249, + "learning_rate": 3e-06, + "loss": 0.0061, + "step": 250 + }, + { + "epoch": 0.004181033764762714, + "grad_norm": 0.058261286467313766, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 251 + }, + { + "epoch": 0.004197691269801609, + "grad_norm": 0.07172414660453796, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 252 + }, + { + "completion_length": 120.37500762939453, + "epoch": 0.004214348774840504, + "grad_norm": 0.0745980367064476, + "learning_rate": 3e-06, + "loss": -0.0736, + "reward": 0.28437501192092896, + "rewards/countdown_reward_func": 0.28437501192092896, + "step": 253 + }, + { + "epoch": 0.0042310062798794, + "grad_norm": 0.12503661215305328, + "learning_rate": 3e-06, + "loss": -0.1177, + "step": 254 + }, + { + "epoch": 0.004247663784918295, + "grad_norm": 0.07567345350980759, + "learning_rate": 3e-06, + "loss": -0.0739, + "step": 255 + }, + { + "epoch": 0.00426432128995719, + "grad_norm": 0.12846483290195465, + "learning_rate": 3e-06, + "loss": -0.119, + "step": 256 + }, + { + "completion_length": 107.37500381469727, + "epoch": 0.004280978794996086, + "grad_norm": 0.06598395854234695, + "learning_rate": 3e-06, + "loss": 0.0435, + "reward": 0.286458358168602, + "rewards/countdown_reward_func": 0.2864583432674408, + "step": 257 + }, + { + "epoch": 0.004297636300034981, + "grad_norm": 0.06995073705911636, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 258 + }, + { + "epoch": 0.004314293805073876, + "grad_norm": 0.06322945654392242, + "learning_rate": 3e-06, + "loss": 0.0433, + "step": 259 + }, + { + "epoch": 0.004330951310112771, + "grad_norm": 0.07441867887973785, + "learning_rate": 3e-06, + "loss": 0.0306, + "step": 260 + }, + { + "completion_length": 107.82291793823242, + "epoch": 0.004347608815151667, + "grad_norm": 0.10875702649354935, + "learning_rate": 3e-06, + "loss": -0.049, + "reward": 0.37916669249534607, + "rewards/countdown_reward_func": 0.3791666775941849, + "step": 261 + }, + { + "epoch": 0.004364266320190562, + "grad_norm": 0.08845296502113342, + "learning_rate": 3e-06, + "loss": -0.0999, + "step": 262 + }, + { + "epoch": 0.004380923825229457, + "grad_norm": 0.1017240509390831, + "learning_rate": 3e-06, + "loss": -0.0494, + "step": 263 + }, + { + "epoch": 0.004397581330268353, + "grad_norm": 0.09473470598459244, + "learning_rate": 3e-06, + "loss": -0.1004, + "step": 264 + }, + { + "completion_length": 103.91666793823242, + "epoch": 0.004414238835307248, + "grad_norm": 0.08854733407497406, + "learning_rate": 3e-06, + "loss": 0.0485, + "reward": 0.4531250298023224, + "rewards/countdown_reward_func": 0.4531250298023224, + "step": 265 + }, + { + "epoch": 0.004430896340346143, + "grad_norm": 0.07844230532646179, + "learning_rate": 3e-06, + "loss": 0.026, + "step": 266 + }, + { + "epoch": 0.004447553845385038, + "grad_norm": 0.08900896459817886, + "learning_rate": 3e-06, + "loss": 0.0484, + "step": 267 + }, + { + "epoch": 0.004464211350423934, + "grad_norm": 0.08528922498226166, + "learning_rate": 3e-06, + "loss": 0.0257, + "step": 268 + }, + { + "completion_length": 94.42708587646484, + "epoch": 0.004480868855462829, + "grad_norm": 0.07602333277463913, + "learning_rate": 3e-06, + "loss": 0.0793, + "reward": 0.3750000149011612, + "rewards/countdown_reward_func": 0.375, + "step": 269 + }, + { + "epoch": 0.004497526360501724, + "grad_norm": 0.06640485674142838, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 270 + }, + { + "epoch": 0.004514183865540619, + "grad_norm": 0.0821257010102272, + "learning_rate": 3e-06, + "loss": 0.0792, + "step": 271 + }, + { + "epoch": 0.004530841370579515, + "grad_norm": 0.08452390879392624, + "learning_rate": 3e-06, + "loss": 0.026, + "step": 272 + }, + { + "completion_length": 87.80208587646484, + "epoch": 0.00454749887561841, + "grad_norm": 0.08464347571134567, + "learning_rate": 3e-06, + "loss": -0.0301, + "reward": 0.4375000149011612, + "rewards/countdown_reward_func": 0.4375, + "step": 273 + }, + { + "epoch": 0.004564156380657305, + "grad_norm": 0.08153299242258072, + "learning_rate": 3e-06, + "loss": -0.0044, + "step": 274 + }, + { + "epoch": 0.004580813885696201, + "grad_norm": 0.08800006657838821, + "learning_rate": 3e-06, + "loss": -0.0307, + "step": 275 + }, + { + "epoch": 0.004597471390735096, + "grad_norm": 0.10315754264593124, + "learning_rate": 3e-06, + "loss": -0.0046, + "step": 276 + }, + { + "completion_length": 93.37500381469727, + "epoch": 0.004614128895773991, + "grad_norm": 0.06648615747690201, + "learning_rate": 3e-06, + "loss": 0.0335, + "reward": 0.2187500074505806, + "rewards/countdown_reward_func": 0.21875, + "step": 277 + }, + { + "epoch": 0.004630786400812886, + "grad_norm": 0.06065024435520172, + "learning_rate": 3e-06, + "loss": 0.046, + "step": 278 + }, + { + "epoch": 0.004647443905851782, + "grad_norm": 0.07257097959518433, + "learning_rate": 3e-06, + "loss": 0.0334, + "step": 279 + }, + { + "epoch": 0.004664101410890677, + "grad_norm": 0.06367486715316772, + "learning_rate": 3e-06, + "loss": 0.0461, + "step": 280 + }, + { + "completion_length": 86.65625, + "epoch": 0.004680758915929572, + "grad_norm": 0.07220441102981567, + "learning_rate": 3e-06, + "loss": -0.0027, + "reward": 0.30937501788139343, + "rewards/countdown_reward_func": 0.30937500298023224, + "step": 281 + }, + { + "epoch": 0.004697416420968467, + "grad_norm": 0.07540306448936462, + "learning_rate": 3e-06, + "loss": 0.0179, + "step": 282 + }, + { + "epoch": 0.004714073926007363, + "grad_norm": 0.06990005820989609, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 283 + }, + { + "epoch": 0.004730731431046258, + "grad_norm": 0.07452642172574997, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 284 + }, + { + "completion_length": 92.10416793823242, + "epoch": 0.004747388936085153, + "grad_norm": 0.09307872503995895, + "learning_rate": 3e-06, + "loss": 0.0889, + "reward": 0.41458335518836975, + "rewards/countdown_reward_func": 0.41458335518836975, + "step": 285 + }, + { + "epoch": 0.004764046441124049, + "grad_norm": 0.08895017951726913, + "learning_rate": 3e-06, + "loss": -0.0085, + "step": 286 + }, + { + "epoch": 0.004780703946162944, + "grad_norm": 0.09304263442754745, + "learning_rate": 3e-06, + "loss": 0.0883, + "step": 287 + }, + { + "epoch": 0.004797361451201839, + "grad_norm": 0.08608956634998322, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 288 + }, + { + "completion_length": 93.06250381469727, + "epoch": 0.004814018956240734, + "grad_norm": 0.08824590593576431, + "learning_rate": 3e-06, + "loss": 0.0725, + "reward": 0.38333337008953094, + "rewards/countdown_reward_func": 0.38333337008953094, + "step": 289 + }, + { + "epoch": 0.00483067646127963, + "grad_norm": 0.08867403864860535, + "learning_rate": 3e-06, + "loss": 0.0455, + "step": 290 + }, + { + "epoch": 0.004847333966318525, + "grad_norm": 0.07506692409515381, + "learning_rate": 3e-06, + "loss": 0.0723, + "step": 291 + }, + { + "epoch": 0.00486399147135742, + "grad_norm": 0.09141616523265839, + "learning_rate": 3e-06, + "loss": 0.0448, + "step": 292 + }, + { + "completion_length": 93.87500381469727, + "epoch": 0.004880648976396315, + "grad_norm": 0.05315243452787399, + "learning_rate": 3e-06, + "loss": 0.0463, + "reward": 0.33645834028720856, + "rewards/countdown_reward_func": 0.33645834028720856, + "step": 293 + }, + { + "epoch": 0.004897306481435211, + "grad_norm": 0.06470438092947006, + "learning_rate": 3e-06, + "loss": -0.0112, + "step": 294 + }, + { + "epoch": 0.004913963986474106, + "grad_norm": 0.05309168994426727, + "learning_rate": 3e-06, + "loss": 0.0463, + "step": 295 + }, + { + "epoch": 0.004930621491513001, + "grad_norm": 0.0684286430478096, + "learning_rate": 3e-06, + "loss": -0.0115, + "step": 296 + }, + { + "completion_length": 89.95833587646484, + "epoch": 0.004947278996551897, + "grad_norm": 0.07998765259981155, + "learning_rate": 3e-06, + "loss": 0.0321, + "reward": 0.47187499701976776, + "rewards/countdown_reward_func": 0.47187499701976776, + "step": 297 + }, + { + "epoch": 0.004963936501590792, + "grad_norm": 0.08201282471418381, + "learning_rate": 3e-06, + "loss": 0.0823, + "step": 298 + }, + { + "epoch": 0.004980594006629687, + "grad_norm": 0.07699496299028397, + "learning_rate": 3e-06, + "loss": 0.0322, + "step": 299 + }, + { + "epoch": 0.004997251511668582, + "grad_norm": 0.08202577382326126, + "learning_rate": 3e-06, + "loss": 0.0817, + "step": 300 + }, + { + "completion_length": 87.64583587646484, + "epoch": 0.005013909016707478, + "grad_norm": 0.08015795052051544, + "learning_rate": 3e-06, + "loss": 0.0269, + "reward": 0.3489583432674408, + "rewards/countdown_reward_func": 0.3489583283662796, + "step": 301 + }, + { + "epoch": 0.005030566521746373, + "grad_norm": 0.0754380002617836, + "learning_rate": 3e-06, + "loss": 0.0509, + "step": 302 + }, + { + "epoch": 0.005047224026785268, + "grad_norm": 0.07999464124441147, + "learning_rate": 3e-06, + "loss": 0.0263, + "step": 303 + }, + { + "epoch": 0.005063881531824163, + "grad_norm": 0.07302260398864746, + "learning_rate": 3e-06, + "loss": 0.0506, + "step": 304 + }, + { + "completion_length": 87.75, + "epoch": 0.005080539036863059, + "grad_norm": 0.08891277760267258, + "learning_rate": 3e-06, + "loss": 0.0766, + "reward": 0.47187501192092896, + "rewards/countdown_reward_func": 0.47187499701976776, + "step": 305 + }, + { + "epoch": 0.005097196541901954, + "grad_norm": 0.06770050525665283, + "learning_rate": 3e-06, + "loss": 0.0858, + "step": 306 + }, + { + "epoch": 0.005113854046940849, + "grad_norm": 0.08512779325246811, + "learning_rate": 3e-06, + "loss": 0.0765, + "step": 307 + }, + { + "epoch": 0.005130511551979745, + "grad_norm": 0.060558076947927475, + "learning_rate": 3e-06, + "loss": 0.0857, + "step": 308 + }, + { + "completion_length": 97.32292175292969, + "epoch": 0.00514716905701864, + "grad_norm": 0.08441516757011414, + "learning_rate": 3e-06, + "loss": -0.0199, + "reward": 0.4166667014360428, + "rewards/countdown_reward_func": 0.4166666865348816, + "step": 309 + }, + { + "epoch": 0.005163826562057535, + "grad_norm": 0.08539208769798279, + "learning_rate": 3e-06, + "loss": 0.043, + "step": 310 + }, + { + "epoch": 0.00518048406709643, + "grad_norm": 0.0851275771856308, + "learning_rate": 3e-06, + "loss": -0.0205, + "step": 311 + }, + { + "epoch": 0.005197141572135326, + "grad_norm": 0.0845513641834259, + "learning_rate": 3e-06, + "loss": 0.0426, + "step": 312 + }, + { + "completion_length": 90.34375, + "epoch": 0.005213799077174221, + "grad_norm": 0.09382148832082748, + "learning_rate": 3e-06, + "loss": 0.0808, + "reward": 0.4281250089406967, + "rewards/countdown_reward_func": 0.4281250089406967, + "step": 313 + }, + { + "epoch": 0.005230456582213116, + "grad_norm": 0.07676962018013, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 314 + }, + { + "epoch": 0.005247114087252012, + "grad_norm": 0.09404994547367096, + "learning_rate": 3e-06, + "loss": 0.0802, + "step": 315 + }, + { + "epoch": 0.005263771592290907, + "grad_norm": 0.07587739080190659, + "learning_rate": 3e-06, + "loss": 0.0259, + "step": 316 + }, + { + "completion_length": 89.86458587646484, + "epoch": 0.005280429097329802, + "grad_norm": 0.07495738565921783, + "learning_rate": 3e-06, + "loss": 0.0092, + "reward": 0.348958358168602, + "rewards/countdown_reward_func": 0.3489583432674408, + "step": 317 + }, + { + "epoch": 0.005297086602368697, + "grad_norm": 0.10114588588476181, + "learning_rate": 3e-06, + "loss": 0.0468, + "step": 318 + }, + { + "epoch": 0.005313744107407593, + "grad_norm": 0.0784391537308693, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 319 + }, + { + "epoch": 0.005330401612446488, + "grad_norm": 0.11108588427305222, + "learning_rate": 3e-06, + "loss": 0.0462, + "step": 320 + }, + { + "completion_length": 89.50000381469727, + "epoch": 0.005347059117485383, + "grad_norm": 0.07896595448255539, + "learning_rate": 3e-06, + "loss": 0.1378, + "reward": 0.35104167461395264, + "rewards/countdown_reward_func": 0.35104167461395264, + "step": 321 + }, + { + "epoch": 0.005363716622524278, + "grad_norm": 0.08065321296453476, + "learning_rate": 3e-06, + "loss": 0.0612, + "step": 322 + }, + { + "epoch": 0.005380374127563174, + "grad_norm": 0.08472850918769836, + "learning_rate": 3e-06, + "loss": 0.1373, + "step": 323 + }, + { + "epoch": 0.005397031632602069, + "grad_norm": 0.07507702708244324, + "learning_rate": 3e-06, + "loss": 0.0608, + "step": 324 + }, + { + "completion_length": 84.91667175292969, + "epoch": 0.005413689137640964, + "grad_norm": 0.06527705490589142, + "learning_rate": 3e-06, + "loss": 0.0327, + "reward": 0.46145835518836975, + "rewards/countdown_reward_func": 0.46145834028720856, + "step": 325 + }, + { + "epoch": 0.00543034664267986, + "grad_norm": 0.12048778682947159, + "learning_rate": 3e-06, + "loss": -0.0581, + "step": 326 + }, + { + "epoch": 0.005447004147718755, + "grad_norm": 0.06648185104131699, + "learning_rate": 3e-06, + "loss": 0.0322, + "step": 327 + }, + { + "epoch": 0.00546366165275765, + "grad_norm": 0.12028519809246063, + "learning_rate": 3e-06, + "loss": -0.0598, + "step": 328 + }, + { + "completion_length": 84.05208587646484, + "epoch": 0.005480319157796545, + "grad_norm": 0.09329447895288467, + "learning_rate": 3e-06, + "loss": -0.0262, + "reward": 0.5572916865348816, + "rewards/countdown_reward_func": 0.5572916865348816, + "step": 329 + }, + { + "epoch": 0.005496976662835441, + "grad_norm": 0.16099168360233307, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 330 + }, + { + "epoch": 0.005513634167874336, + "grad_norm": 0.09137027710676193, + "learning_rate": 3e-06, + "loss": -0.0272, + "step": 331 + }, + { + "epoch": 0.005530291672913231, + "grad_norm": 0.10402784496545792, + "learning_rate": 3e-06, + "loss": 0.0205, + "step": 332 + }, + { + "completion_length": 81.07291793823242, + "epoch": 0.005546949177952126, + "grad_norm": 0.05004766583442688, + "learning_rate": 3e-06, + "loss": 0.0118, + "reward": 0.32083337008953094, + "rewards/countdown_reward_func": 0.32083335518836975, + "step": 333 + }, + { + "epoch": 0.005563606682991022, + "grad_norm": 0.05748219043016434, + "learning_rate": 3e-06, + "loss": 0.0321, + "step": 334 + }, + { + "epoch": 0.005580264188029917, + "grad_norm": 0.05636519566178322, + "learning_rate": 3e-06, + "loss": 0.0115, + "step": 335 + }, + { + "epoch": 0.005596921693068812, + "grad_norm": 0.05843605101108551, + "learning_rate": 3e-06, + "loss": 0.0318, + "step": 336 + }, + { + "completion_length": 81.13541793823242, + "epoch": 0.005613579198107708, + "grad_norm": 0.07628590613603592, + "learning_rate": 3e-06, + "loss": 0.0192, + "reward": 0.413541704416275, + "rewards/countdown_reward_func": 0.41354167461395264, + "step": 337 + }, + { + "epoch": 0.005630236703146603, + "grad_norm": 0.055629514157772064, + "learning_rate": 3e-06, + "loss": 0.0269, + "step": 338 + }, + { + "epoch": 0.005646894208185498, + "grad_norm": 0.07092045992612839, + "learning_rate": 3e-06, + "loss": 0.0188, + "step": 339 + }, + { + "epoch": 0.005663551713224393, + "grad_norm": 0.05902326852083206, + "learning_rate": 3e-06, + "loss": 0.0267, + "step": 340 + }, + { + "completion_length": 80.56250381469727, + "epoch": 0.005680209218263289, + "grad_norm": 0.061605557799339294, + "learning_rate": 3e-06, + "loss": 0.0264, + "reward": 0.3333333432674408, + "rewards/countdown_reward_func": 0.3333333432674408, + "step": 341 + }, + { + "epoch": 0.005696866723302184, + "grad_norm": 0.08945262432098389, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 342 + }, + { + "epoch": 0.005713524228341079, + "grad_norm": 0.05948707088828087, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 343 + }, + { + "epoch": 0.005730181733379974, + "grad_norm": 0.08860863000154495, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 344 + }, + { + "completion_length": 77.50000381469727, + "epoch": 0.00574683923841887, + "grad_norm": 0.05280579626560211, + "learning_rate": 3e-06, + "loss": 0.0424, + "reward": 0.4125000238418579, + "rewards/countdown_reward_func": 0.4124999940395355, + "step": 345 + }, + { + "epoch": 0.005763496743457765, + "grad_norm": 0.06609396636486053, + "learning_rate": 3e-06, + "loss": -0.0064, + "step": 346 + }, + { + "epoch": 0.00578015424849666, + "grad_norm": 0.05191401392221451, + "learning_rate": 3e-06, + "loss": 0.0421, + "step": 347 + }, + { + "epoch": 0.005796811753535556, + "grad_norm": 0.07616645097732544, + "learning_rate": 3e-06, + "loss": -0.0065, + "step": 348 + }, + { + "completion_length": 79.25, + "epoch": 0.005813469258574451, + "grad_norm": 0.04914965480566025, + "learning_rate": 3e-06, + "loss": 0.0321, + "reward": 0.30937500298023224, + "rewards/countdown_reward_func": 0.30937500298023224, + "step": 349 + }, + { + "epoch": 0.005830126763613346, + "grad_norm": 0.065985769033432, + "learning_rate": 3e-06, + "loss": -0.0294, + "step": 350 + }, + { + "epoch": 0.005846784268652241, + "grad_norm": 0.05139080807566643, + "learning_rate": 3e-06, + "loss": 0.0318, + "step": 351 + }, + { + "epoch": 0.005863441773691137, + "grad_norm": 0.06658973544836044, + "learning_rate": 3e-06, + "loss": -0.0294, + "step": 352 + }, + { + "completion_length": 80.25000381469727, + "epoch": 0.005880099278730032, + "grad_norm": 0.07533974200487137, + "learning_rate": 3e-06, + "loss": 0.0272, + "reward": 0.4635417014360428, + "rewards/countdown_reward_func": 0.4635416865348816, + "step": 353 + }, + { + "epoch": 0.005896756783768927, + "grad_norm": 0.09428796917200089, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 354 + }, + { + "epoch": 0.005913414288807823, + "grad_norm": 0.07671528309583664, + "learning_rate": 3e-06, + "loss": 0.0267, + "step": 355 + }, + { + "epoch": 0.005930071793846718, + "grad_norm": 0.09046661108732224, + "learning_rate": 3e-06, + "loss": 0.0224, + "step": 356 + }, + { + "completion_length": 78.20833587646484, + "epoch": 0.005946729298885613, + "grad_norm": 0.06101154908537865, + "learning_rate": 3e-06, + "loss": 0.0671, + "reward": 0.40312501788139343, + "rewards/countdown_reward_func": 0.40312500298023224, + "step": 357 + }, + { + "epoch": 0.005963386803924508, + "grad_norm": 0.09327936917543411, + "learning_rate": 3e-06, + "loss": 0.0348, + "step": 358 + }, + { + "epoch": 0.005980044308963404, + "grad_norm": 0.058738693594932556, + "learning_rate": 3e-06, + "loss": 0.067, + "step": 359 + }, + { + "epoch": 0.005996701814002299, + "grad_norm": 0.1009228378534317, + "learning_rate": 3e-06, + "loss": 0.0345, + "step": 360 + }, + { + "completion_length": 80.3125, + "epoch": 0.006013359319041194, + "grad_norm": 0.06401659548282623, + "learning_rate": 3e-06, + "loss": 0.005, + "reward": 0.3593750149011612, + "rewards/countdown_reward_func": 0.3593750149011612, + "step": 361 + }, + { + "epoch": 0.006030016824080089, + "grad_norm": 0.10152820497751236, + "learning_rate": 3e-06, + "loss": 0.0381, + "step": 362 + }, + { + "epoch": 0.006046674329118985, + "grad_norm": 0.05993008241057396, + "learning_rate": 3e-06, + "loss": 0.0048, + "step": 363 + }, + { + "epoch": 0.00606333183415788, + "grad_norm": 0.10378899425268173, + "learning_rate": 3e-06, + "loss": 0.0377, + "step": 364 + }, + { + "completion_length": 79.43750381469727, + "epoch": 0.006079989339196775, + "grad_norm": 0.04563683271408081, + "learning_rate": 3e-06, + "loss": -0.0029, + "reward": 0.4166667014360428, + "rewards/countdown_reward_func": 0.4166666865348816, + "step": 365 + }, + { + "epoch": 0.006096646844235671, + "grad_norm": 0.0845368504524231, + "learning_rate": 3e-06, + "loss": -0.0047, + "step": 366 + }, + { + "epoch": 0.006113304349274566, + "grad_norm": 0.04646947979927063, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 367 + }, + { + "epoch": 0.006129961854313461, + "grad_norm": 0.09741491079330444, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 368 + }, + { + "completion_length": 79.1875, + "epoch": 0.006146619359352356, + "grad_norm": 0.07238098233938217, + "learning_rate": 3e-06, + "loss": -0.0315, + "reward": 0.4895833879709244, + "rewards/countdown_reward_func": 0.4895833879709244, + "step": 369 + }, + { + "epoch": 0.006163276864391252, + "grad_norm": 0.09169110655784607, + "learning_rate": 3e-06, + "loss": 0.0725, + "step": 370 + }, + { + "epoch": 0.006179934369430147, + "grad_norm": 0.08872858434915543, + "learning_rate": 3e-06, + "loss": -0.0318, + "step": 371 + }, + { + "epoch": 0.006196591874469042, + "grad_norm": 0.09137571603059769, + "learning_rate": 3e-06, + "loss": 0.0722, + "step": 372 + }, + { + "completion_length": 80.70833587646484, + "epoch": 0.006213249379507937, + "grad_norm": 0.06937266141176224, + "learning_rate": 3e-06, + "loss": 0.0508, + "reward": 0.5666667222976685, + "rewards/countdown_reward_func": 0.5666667073965073, + "step": 373 + }, + { + "epoch": 0.006229906884546833, + "grad_norm": 0.07525796443223953, + "learning_rate": 3e-06, + "loss": 0.0544, + "step": 374 + }, + { + "epoch": 0.006246564389585728, + "grad_norm": 0.06686960905790329, + "learning_rate": 3e-06, + "loss": 0.0504, + "step": 375 + }, + { + "epoch": 0.006263221894624623, + "grad_norm": 0.06938055902719498, + "learning_rate": 3e-06, + "loss": 0.0542, + "step": 376 + }, + { + "completion_length": 80.375, + "epoch": 0.006279879399663519, + "grad_norm": 0.07549868524074554, + "learning_rate": 3e-06, + "loss": 0.0784, + "reward": 0.4322917014360428, + "rewards/countdown_reward_func": 0.4322916865348816, + "step": 377 + }, + { + "epoch": 0.006296536904702414, + "grad_norm": 0.13216276466846466, + "learning_rate": 3e-06, + "loss": 0.0632, + "step": 378 + }, + { + "epoch": 0.006313194409741309, + "grad_norm": 0.07560853660106659, + "learning_rate": 3e-06, + "loss": 0.0781, + "step": 379 + }, + { + "epoch": 0.006329851914780204, + "grad_norm": 0.06288950890302658, + "learning_rate": 3e-06, + "loss": 0.0626, + "step": 380 + }, + { + "completion_length": 81.37500381469727, + "epoch": 0.0063465094198191, + "grad_norm": 0.05429549887776375, + "learning_rate": 3e-06, + "loss": 0.065, + "reward": 0.39375001192092896, + "rewards/countdown_reward_func": 0.39375001192092896, + "step": 381 + }, + { + "epoch": 0.006363166924857995, + "grad_norm": 0.08071405440568924, + "learning_rate": 3e-06, + "loss": 0.0308, + "step": 382 + }, + { + "epoch": 0.00637982442989689, + "grad_norm": 0.051490843296051025, + "learning_rate": 3e-06, + "loss": 0.0649, + "step": 383 + }, + { + "epoch": 0.006396481934935785, + "grad_norm": 0.0780770406126976, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 384 + }, + { + "completion_length": 80.64583587646484, + "epoch": 0.006413139439974681, + "grad_norm": 0.10670112818479538, + "learning_rate": 3e-06, + "loss": 0.0256, + "reward": 0.5666666626930237, + "rewards/countdown_reward_func": 0.5666666626930237, + "step": 385 + }, + { + "epoch": 0.006429796945013576, + "grad_norm": 0.10167449712753296, + "learning_rate": 3e-06, + "loss": -0.043, + "step": 386 + }, + { + "epoch": 0.006446454450052471, + "grad_norm": 0.10875949263572693, + "learning_rate": 3e-06, + "loss": 0.025, + "step": 387 + }, + { + "epoch": 0.006463111955091367, + "grad_norm": 0.10257674008607864, + "learning_rate": 3e-06, + "loss": -0.043, + "step": 388 + }, + { + "completion_length": 81.38541793823242, + "epoch": 0.006479769460130262, + "grad_norm": 0.07689986377954483, + "learning_rate": 3e-06, + "loss": 0.0336, + "reward": 0.47083334624767303, + "rewards/countdown_reward_func": 0.47083334624767303, + "step": 389 + }, + { + "epoch": 0.006496426965169157, + "grad_norm": 0.08978056907653809, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 390 + }, + { + "epoch": 0.006513084470208052, + "grad_norm": 0.074790820479393, + "learning_rate": 3e-06, + "loss": 0.0334, + "step": 391 + }, + { + "epoch": 0.006529741975246948, + "grad_norm": 0.0956558883190155, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 392 + }, + { + "completion_length": 79.16667175292969, + "epoch": 0.006546399480285843, + "grad_norm": 0.0642806813120842, + "learning_rate": 3e-06, + "loss": 0.0007, + "reward": 0.5218750238418579, + "rewards/countdown_reward_func": 0.5218749940395355, + "step": 393 + }, + { + "epoch": 0.006563056985324738, + "grad_norm": 0.07824093848466873, + "learning_rate": 3e-06, + "loss": 0.0732, + "step": 394 + }, + { + "epoch": 0.006579714490363633, + "grad_norm": 0.06322997063398361, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 395 + }, + { + "epoch": 0.006596371995402529, + "grad_norm": 0.07832024246454239, + "learning_rate": 3e-06, + "loss": 0.0728, + "step": 396 + }, + { + "completion_length": 80.48958587646484, + "epoch": 0.006613029500441424, + "grad_norm": 0.06320694088935852, + "learning_rate": 3e-06, + "loss": 0.0215, + "reward": 0.4322916716337204, + "rewards/countdown_reward_func": 0.4322916716337204, + "step": 397 + }, + { + "epoch": 0.006629687005480319, + "grad_norm": 0.13213297724723816, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 398 + }, + { + "epoch": 0.006646344510519215, + "grad_norm": 0.0655907690525055, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 399 + }, + { + "epoch": 0.00666300201555811, + "grad_norm": 0.0854148417711258, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 400 + }, + { + "completion_length": 77.76041793823242, + "epoch": 0.006679659520597005, + "grad_norm": 0.05806265026330948, + "learning_rate": 3e-06, + "loss": 0.0352, + "reward": 0.3708333522081375, + "rewards/countdown_reward_func": 0.3708333373069763, + "step": 401 + }, + { + "epoch": 0.0066963170256359, + "grad_norm": 0.06392884999513626, + "learning_rate": 3e-06, + "loss": 0.0243, + "step": 402 + }, + { + "epoch": 0.006712974530674796, + "grad_norm": 0.05753417685627937, + "learning_rate": 3e-06, + "loss": 0.0349, + "step": 403 + }, + { + "epoch": 0.006729632035713691, + "grad_norm": 0.06545396149158478, + "learning_rate": 3e-06, + "loss": 0.0238, + "step": 404 + }, + { + "completion_length": 79.63541793823242, + "epoch": 0.006746289540752586, + "grad_norm": 0.07429134100675583, + "learning_rate": 3e-06, + "loss": 0.0163, + "reward": 0.4083333760499954, + "rewards/countdown_reward_func": 0.40833336114883423, + "step": 405 + }, + { + "epoch": 0.006762947045791482, + "grad_norm": 0.050537265837192535, + "learning_rate": 3e-06, + "loss": 0.0432, + "step": 406 + }, + { + "epoch": 0.006779604550830377, + "grad_norm": 0.07599566876888275, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 407 + }, + { + "epoch": 0.006796262055869272, + "grad_norm": 0.05695967748761177, + "learning_rate": 3e-06, + "loss": 0.0432, + "step": 408 + }, + { + "completion_length": 78.59375, + "epoch": 0.006812919560908167, + "grad_norm": 0.07316026836633682, + "learning_rate": 3e-06, + "loss": 0.0511, + "reward": 0.4739583432674408, + "rewards/countdown_reward_func": 0.4739583432674408, + "step": 409 + }, + { + "epoch": 0.006829577065947063, + "grad_norm": 0.08317940682172775, + "learning_rate": 3e-06, + "loss": 0.0508, + "step": 410 + }, + { + "epoch": 0.006846234570985958, + "grad_norm": 0.0695376768708229, + "learning_rate": 3e-06, + "loss": 0.0507, + "step": 411 + }, + { + "epoch": 0.006862892076024853, + "grad_norm": 0.07508383691310883, + "learning_rate": 3e-06, + "loss": 0.0504, + "step": 412 + }, + { + "completion_length": 78.65625, + "epoch": 0.006879549581063748, + "grad_norm": 0.0945018008351326, + "learning_rate": 3e-06, + "loss": 0.0696, + "reward": 0.4625000059604645, + "rewards/countdown_reward_func": 0.4624999910593033, + "step": 413 + }, + { + "epoch": 0.006896207086102644, + "grad_norm": 0.07599159330129623, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 414 + }, + { + "epoch": 0.006912864591141539, + "grad_norm": 0.08436521142721176, + "learning_rate": 3e-06, + "loss": 0.0691, + "step": 415 + }, + { + "epoch": 0.006929522096180434, + "grad_norm": 0.07717543095350266, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 416 + }, + { + "completion_length": 78.88542175292969, + "epoch": 0.00694617960121933, + "grad_norm": 0.051619481295347214, + "learning_rate": 3e-06, + "loss": 0.0249, + "reward": 0.3479166775941849, + "rewards/countdown_reward_func": 0.3479166626930237, + "step": 417 + }, + { + "epoch": 0.006962837106258225, + "grad_norm": 0.09331419318914413, + "learning_rate": 3e-06, + "loss": 0.0407, + "step": 418 + }, + { + "epoch": 0.00697949461129712, + "grad_norm": 0.05025678873062134, + "learning_rate": 3e-06, + "loss": 0.0245, + "step": 419 + }, + { + "epoch": 0.006996152116336015, + "grad_norm": 0.09707405418157578, + "learning_rate": 3e-06, + "loss": 0.0401, + "step": 420 + }, + { + "completion_length": 77.25, + "epoch": 0.007012809621374911, + "grad_norm": 0.08332526683807373, + "learning_rate": 3e-06, + "loss": 0.0333, + "reward": 0.5760416984558105, + "rewards/countdown_reward_func": 0.5760416686534882, + "step": 421 + }, + { + "epoch": 0.007029467126413806, + "grad_norm": 0.07828395813703537, + "learning_rate": 3e-06, + "loss": 0.0304, + "step": 422 + }, + { + "epoch": 0.007046124631452701, + "grad_norm": 0.09416884183883667, + "learning_rate": 3e-06, + "loss": 0.0333, + "step": 423 + }, + { + "epoch": 0.007062782136491596, + "grad_norm": 0.07076983153820038, + "learning_rate": 3e-06, + "loss": 0.0304, + "step": 424 + }, + { + "completion_length": 79.28125381469727, + "epoch": 0.007079439641530492, + "grad_norm": 0.0515352264046669, + "learning_rate": 3e-06, + "loss": 0.0194, + "reward": 0.34166669100522995, + "rewards/countdown_reward_func": 0.34166667610406876, + "step": 425 + }, + { + "epoch": 0.007096097146569387, + "grad_norm": 0.06081105023622513, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 426 + }, + { + "epoch": 0.007112754651608282, + "grad_norm": 0.04894004017114639, + "learning_rate": 3e-06, + "loss": 0.0192, + "step": 427 + }, + { + "epoch": 0.007129412156647178, + "grad_norm": 0.06134287267923355, + "learning_rate": 3e-06, + "loss": 0.0171, + "step": 428 + }, + { + "completion_length": 80.59375381469727, + "epoch": 0.007146069661686073, + "grad_norm": 0.10285397619009018, + "learning_rate": 3e-06, + "loss": 0.0637, + "reward": 0.4177083671092987, + "rewards/countdown_reward_func": 0.4177083522081375, + "step": 429 + }, + { + "epoch": 0.007162727166724968, + "grad_norm": 0.07841507345438004, + "learning_rate": 3e-06, + "loss": 0.0547, + "step": 430 + }, + { + "epoch": 0.007179384671763863, + "grad_norm": 0.06483368575572968, + "learning_rate": 3e-06, + "loss": 0.0631, + "step": 431 + }, + { + "epoch": 0.007196042176802759, + "grad_norm": 0.08059476315975189, + "learning_rate": 3e-06, + "loss": 0.0542, + "step": 432 + }, + { + "completion_length": 80.0625, + "epoch": 0.007212699681841654, + "grad_norm": 0.06586643308401108, + "learning_rate": 3e-06, + "loss": 0.0071, + "reward": 0.3697916865348816, + "rewards/countdown_reward_func": 0.3697916567325592, + "step": 433 + }, + { + "epoch": 0.007229357186880549, + "grad_norm": 0.0644889622926712, + "learning_rate": 3e-06, + "loss": -0.0065, + "step": 434 + }, + { + "epoch": 0.007246014691919444, + "grad_norm": 0.07256460189819336, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 435 + }, + { + "epoch": 0.00726267219695834, + "grad_norm": 0.0663222223520279, + "learning_rate": 3e-06, + "loss": -0.0068, + "step": 436 + }, + { + "completion_length": 79.73958587646484, + "epoch": 0.007279329701997235, + "grad_norm": 0.07111411541700363, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.5489583611488342, + "rewards/countdown_reward_func": 0.5489583611488342, + "step": 437 + }, + { + "epoch": 0.00729598720703613, + "grad_norm": 0.08117426931858063, + "learning_rate": 3e-06, + "loss": 0.0441, + "step": 438 + }, + { + "epoch": 0.007312644712075026, + "grad_norm": 0.07505331188440323, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 439 + }, + { + "epoch": 0.007329302217113921, + "grad_norm": 0.0730680376291275, + "learning_rate": 3e-06, + "loss": 0.0437, + "step": 440 + }, + { + "completion_length": 79.375, + "epoch": 0.007345959722152816, + "grad_norm": 0.08070104569196701, + "learning_rate": 3e-06, + "loss": 0.0157, + "reward": 0.45104168355464935, + "rewards/countdown_reward_func": 0.45104165375232697, + "step": 441 + }, + { + "epoch": 0.007362617227191711, + "grad_norm": 0.08048272132873535, + "learning_rate": 3e-06, + "loss": -0.04, + "step": 442 + }, + { + "epoch": 0.007379274732230607, + "grad_norm": 0.08332232385873795, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 443 + }, + { + "epoch": 0.007395932237269502, + "grad_norm": 0.08007360249757767, + "learning_rate": 3e-06, + "loss": -0.0404, + "step": 444 + }, + { + "completion_length": 78.64583587646484, + "epoch": 0.007412589742308397, + "grad_norm": 0.06347408145666122, + "learning_rate": 3e-06, + "loss": 0.0576, + "reward": 0.5208333730697632, + "rewards/countdown_reward_func": 0.5208333432674408, + "step": 445 + }, + { + "epoch": 0.007429247247347293, + "grad_norm": 0.12448671460151672, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 446 + }, + { + "epoch": 0.007445904752386188, + "grad_norm": 0.06372661888599396, + "learning_rate": 3e-06, + "loss": 0.0575, + "step": 447 + }, + { + "epoch": 0.007462562257425083, + "grad_norm": 0.08060593158006668, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 448 + }, + { + "completion_length": 81.17708587646484, + "epoch": 0.007479219762463978, + "grad_norm": 0.048487190157175064, + "learning_rate": 3e-06, + "loss": 0.0216, + "reward": 0.491666704416275, + "rewards/countdown_reward_func": 0.49166667461395264, + "step": 449 + }, + { + "epoch": 0.007495877267502874, + "grad_norm": 0.05279170721769333, + "learning_rate": 3e-06, + "loss": 0.036, + "step": 450 + }, + { + "epoch": 0.007512534772541769, + "grad_norm": 0.04872879013419151, + "learning_rate": 3e-06, + "loss": 0.0215, + "step": 451 + }, + { + "epoch": 0.007529192277580664, + "grad_norm": 0.050131987780332565, + "learning_rate": 3e-06, + "loss": 0.0359, + "step": 452 + }, + { + "completion_length": 80.14583587646484, + "epoch": 0.007545849782619559, + "grad_norm": 0.05295325443148613, + "learning_rate": 3e-06, + "loss": 0.0347, + "reward": 0.6510416865348816, + "rewards/countdown_reward_func": 0.6510416567325592, + "step": 453 + }, + { + "epoch": 0.007562507287658455, + "grad_norm": 0.05845298245549202, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 454 + }, + { + "epoch": 0.00757916479269735, + "grad_norm": 0.05990788713097572, + "learning_rate": 3e-06, + "loss": 0.0346, + "step": 455 + }, + { + "epoch": 0.007595822297736245, + "grad_norm": 0.08733080327510834, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 456 + }, + { + "completion_length": 80.06250381469727, + "epoch": 0.007612479802775141, + "grad_norm": 0.06363150477409363, + "learning_rate": 3e-06, + "loss": -0.0227, + "reward": 0.508333370089531, + "rewards/countdown_reward_func": 0.508333370089531, + "step": 457 + }, + { + "epoch": 0.007629137307814036, + "grad_norm": 0.05841623246669769, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 458 + }, + { + "epoch": 0.007645794812852931, + "grad_norm": 0.0630185529589653, + "learning_rate": 3e-06, + "loss": -0.0228, + "step": 459 + }, + { + "epoch": 0.007662452317891826, + "grad_norm": 0.05774760991334915, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 460 + }, + { + "completion_length": 79.59375381469727, + "epoch": 0.007679109822930722, + "grad_norm": 0.05274137109518051, + "learning_rate": 3e-06, + "loss": -0.0086, + "reward": 0.5458333492279053, + "rewards/countdown_reward_func": 0.5458333343267441, + "step": 461 + }, + { + "epoch": 0.007695767327969617, + "grad_norm": 0.05402350798249245, + "learning_rate": 3e-06, + "loss": 0.0387, + "step": 462 + }, + { + "epoch": 0.007712424833008512, + "grad_norm": 0.048290152102708817, + "learning_rate": 3e-06, + "loss": -0.0087, + "step": 463 + }, + { + "epoch": 0.007729082338047407, + "grad_norm": 0.05240393802523613, + "learning_rate": 3e-06, + "loss": 0.0382, + "step": 464 + }, + { + "completion_length": 80.36458587646484, + "epoch": 0.007745739843086303, + "grad_norm": 0.09540390223264694, + "learning_rate": 3e-06, + "loss": 0.0039, + "reward": 0.5458333790302277, + "rewards/countdown_reward_func": 0.5458333790302277, + "step": 465 + }, + { + "epoch": 0.007762397348125198, + "grad_norm": 0.07324394583702087, + "learning_rate": 3e-06, + "loss": 0.0319, + "step": 466 + }, + { + "epoch": 0.007779054853164093, + "grad_norm": 0.10398959368467331, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 467 + }, + { + "epoch": 0.007795712358202989, + "grad_norm": 0.07248996943235397, + "learning_rate": 3e-06, + "loss": 0.0314, + "step": 468 + }, + { + "completion_length": 81.51042175292969, + "epoch": 0.007812369863241884, + "grad_norm": 0.05704952031373978, + "learning_rate": 3e-06, + "loss": 0.0082, + "reward": 0.4260416775941849, + "rewards/countdown_reward_func": 0.4260416626930237, + "step": 469 + }, + { + "epoch": 0.00782902736828078, + "grad_norm": 0.06616488844156265, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 470 + }, + { + "epoch": 0.007845684873319674, + "grad_norm": 0.057678528130054474, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 471 + }, + { + "epoch": 0.00786234237835857, + "grad_norm": 0.06907469034194946, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 472 + }, + { + "completion_length": 81.90625381469727, + "epoch": 0.007878999883397464, + "grad_norm": 0.04800111800432205, + "learning_rate": 3e-06, + "loss": 0.0206, + "reward": 0.33125002682209015, + "rewards/countdown_reward_func": 0.33125002682209015, + "step": 473 + }, + { + "epoch": 0.00789565738843636, + "grad_norm": 0.05523252114653587, + "learning_rate": 3e-06, + "loss": 0.0251, + "step": 474 + }, + { + "epoch": 0.007912314893475256, + "grad_norm": 0.048462893813848495, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 475 + }, + { + "epoch": 0.00792897239851415, + "grad_norm": 0.05505013093352318, + "learning_rate": 3e-06, + "loss": 0.0249, + "step": 476 + }, + { + "completion_length": 81.47917175292969, + "epoch": 0.007945629903553046, + "grad_norm": 0.0628310963511467, + "learning_rate": 3e-06, + "loss": 0.0469, + "reward": 0.4333333522081375, + "rewards/countdown_reward_func": 0.4333333373069763, + "step": 477 + }, + { + "epoch": 0.007962287408591942, + "grad_norm": 0.0948786810040474, + "learning_rate": 3e-06, + "loss": -0.0354, + "step": 478 + }, + { + "epoch": 0.007978944913630836, + "grad_norm": 0.06490487605333328, + "learning_rate": 3e-06, + "loss": 0.0466, + "step": 479 + }, + { + "epoch": 0.007995602418669732, + "grad_norm": 0.08904693275690079, + "learning_rate": 3e-06, + "loss": -0.036, + "step": 480 + }, + { + "completion_length": 80.70833587646484, + "epoch": 0.008012259923708628, + "grad_norm": 0.06610012799501419, + "learning_rate": 3e-06, + "loss": -0.0034, + "reward": 0.5010416805744171, + "rewards/countdown_reward_func": 0.5010416805744171, + "step": 481 + }, + { + "epoch": 0.008028917428747522, + "grad_norm": 0.06118786707520485, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 482 + }, + { + "epoch": 0.008045574933786418, + "grad_norm": 0.07492632418870926, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 483 + }, + { + "epoch": 0.008062232438825312, + "grad_norm": 0.06142030283808708, + "learning_rate": 3e-06, + "loss": 0.0152, + "step": 484 + }, + { + "completion_length": 81.42708587646484, + "epoch": 0.008078889943864208, + "grad_norm": 0.09416550397872925, + "learning_rate": 3e-06, + "loss": 0.0029, + "reward": 0.4999999701976776, + "rewards/countdown_reward_func": 0.4999999701976776, + "step": 485 + }, + { + "epoch": 0.008095547448903104, + "grad_norm": 0.06360796838998795, + "learning_rate": 3e-06, + "loss": 0.0449, + "step": 486 + }, + { + "epoch": 0.008112204953941998, + "grad_norm": 0.09687819331884384, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 487 + }, + { + "epoch": 0.008128862458980894, + "grad_norm": 0.06378461420536041, + "learning_rate": 3e-06, + "loss": 0.0445, + "step": 488 + }, + { + "completion_length": 80.93750381469727, + "epoch": 0.00814551996401979, + "grad_norm": 0.06647662818431854, + "learning_rate": 3e-06, + "loss": 0.0474, + "reward": 0.6062500178813934, + "rewards/countdown_reward_func": 0.606249988079071, + "step": 489 + }, + { + "epoch": 0.008162177469058684, + "grad_norm": 0.0683470293879509, + "learning_rate": 3e-06, + "loss": 0.0448, + "step": 490 + }, + { + "epoch": 0.00817883497409758, + "grad_norm": 0.06710384786128998, + "learning_rate": 3e-06, + "loss": 0.0468, + "step": 491 + }, + { + "epoch": 0.008195492479136476, + "grad_norm": 0.06748959422111511, + "learning_rate": 3e-06, + "loss": 0.0445, + "step": 492 + }, + { + "completion_length": 81.87500381469727, + "epoch": 0.00821214998417537, + "grad_norm": 0.0520973801612854, + "learning_rate": 3e-06, + "loss": 0.0238, + "reward": 0.5479166805744171, + "rewards/countdown_reward_func": 0.5479166358709335, + "step": 493 + }, + { + "epoch": 0.008228807489214266, + "grad_norm": 0.059733469039201736, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 494 + }, + { + "epoch": 0.00824546499425316, + "grad_norm": 0.05311213433742523, + "learning_rate": 3e-06, + "loss": 0.0235, + "step": 495 + }, + { + "epoch": 0.008262122499292056, + "grad_norm": 0.058049220591783524, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 496 + }, + { + "completion_length": 81.57291793823242, + "epoch": 0.008278780004330952, + "grad_norm": 0.07772063463926315, + "learning_rate": 3e-06, + "loss": -0.0075, + "reward": 0.5656250715255737, + "rewards/countdown_reward_func": 0.565625011920929, + "step": 497 + }, + { + "epoch": 0.008295437509369846, + "grad_norm": 0.0807429775595665, + "learning_rate": 3e-06, + "loss": 0.0314, + "step": 498 + }, + { + "epoch": 0.008312095014408742, + "grad_norm": 0.0793251022696495, + "learning_rate": 3e-06, + "loss": -0.0081, + "step": 499 + }, + { + "epoch": 0.008328752519447638, + "grad_norm": 0.09254617244005203, + "learning_rate": 3e-06, + "loss": 0.0309, + "step": 500 + }, + { + "completion_length": 82.23958587646484, + "epoch": 0.008345410024486532, + "grad_norm": 0.07371559739112854, + "learning_rate": 3e-06, + "loss": 0.0581, + "reward": 0.453125, + "rewards/countdown_reward_func": 0.453125, + "step": 501 + }, + { + "epoch": 0.008362067529525428, + "grad_norm": 0.08851583302021027, + "learning_rate": 3e-06, + "loss": 0.009, + "step": 502 + }, + { + "epoch": 0.008378725034564324, + "grad_norm": 0.06968164443969727, + "learning_rate": 3e-06, + "loss": 0.0578, + "step": 503 + }, + { + "epoch": 0.008395382539603218, + "grad_norm": 0.08894790709018707, + "learning_rate": 3e-06, + "loss": 0.0089, + "step": 504 + }, + { + "completion_length": 83.41667175292969, + "epoch": 0.008412040044642114, + "grad_norm": 0.07978343218564987, + "learning_rate": 3e-06, + "loss": -0.0168, + "reward": 0.5552083551883698, + "rewards/countdown_reward_func": 0.5552083551883698, + "step": 505 + }, + { + "epoch": 0.008428697549681008, + "grad_norm": 0.09988199919462204, + "learning_rate": 3e-06, + "loss": -0.0441, + "step": 506 + }, + { + "epoch": 0.008445355054719904, + "grad_norm": 0.07664237171411514, + "learning_rate": 3e-06, + "loss": -0.0171, + "step": 507 + }, + { + "epoch": 0.0084620125597588, + "grad_norm": 0.09177546203136444, + "learning_rate": 3e-06, + "loss": -0.0443, + "step": 508 + }, + { + "completion_length": 82.13541793823242, + "epoch": 0.008478670064797694, + "grad_norm": 0.08791572600603104, + "learning_rate": 3e-06, + "loss": 0.0634, + "reward": 0.7250000536441803, + "rewards/countdown_reward_func": 0.7250000238418579, + "step": 509 + }, + { + "epoch": 0.00849532756983659, + "grad_norm": 0.08713562786579132, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 510 + }, + { + "epoch": 0.008511985074875486, + "grad_norm": 0.08514068275690079, + "learning_rate": 3e-06, + "loss": 0.0632, + "step": 511 + }, + { + "epoch": 0.00852864257991438, + "grad_norm": 0.08673429489135742, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 512 + }, + { + "completion_length": 81.71875381469727, + "epoch": 0.008545300084953276, + "grad_norm": 0.08796904236078262, + "learning_rate": 3e-06, + "loss": 0.0105, + "reward": 0.5947916805744171, + "rewards/countdown_reward_func": 0.5947916507720947, + "step": 513 + }, + { + "epoch": 0.008561957589992172, + "grad_norm": 0.08956664800643921, + "learning_rate": 3e-06, + "loss": 0.0482, + "step": 514 + }, + { + "epoch": 0.008578615095031066, + "grad_norm": 0.09548274427652359, + "learning_rate": 3e-06, + "loss": 0.0096, + "step": 515 + }, + { + "epoch": 0.008595272600069962, + "grad_norm": 0.08606173098087311, + "learning_rate": 3e-06, + "loss": 0.0477, + "step": 516 + }, + { + "completion_length": 82.72916793823242, + "epoch": 0.008611930105108858, + "grad_norm": 0.07674533873796463, + "learning_rate": 3e-06, + "loss": 0.0368, + "reward": 0.4833333194255829, + "rewards/countdown_reward_func": 0.4833333194255829, + "step": 517 + }, + { + "epoch": 0.008628587610147752, + "grad_norm": 0.07349152863025665, + "learning_rate": 3e-06, + "loss": 0.1014, + "step": 518 + }, + { + "epoch": 0.008645245115186648, + "grad_norm": 0.08653897792100906, + "learning_rate": 3e-06, + "loss": 0.0367, + "step": 519 + }, + { + "epoch": 0.008661902620225542, + "grad_norm": 0.06930646300315857, + "learning_rate": 3e-06, + "loss": 0.101, + "step": 520 + }, + { + "completion_length": 82.25, + "epoch": 0.008678560125264438, + "grad_norm": 0.044614847749471664, + "learning_rate": 3e-06, + "loss": 0.0217, + "reward": 0.484375, + "rewards/countdown_reward_func": 0.484375, + "step": 521 + }, + { + "epoch": 0.008695217630303334, + "grad_norm": 0.062400054186582565, + "learning_rate": 3e-06, + "loss": 0.0364, + "step": 522 + }, + { + "epoch": 0.008711875135342228, + "grad_norm": 0.049803055822849274, + "learning_rate": 3e-06, + "loss": 0.0218, + "step": 523 + }, + { + "epoch": 0.008728532640381124, + "grad_norm": 0.057401567697525024, + "learning_rate": 3e-06, + "loss": 0.036, + "step": 524 + }, + { + "completion_length": 81.67708587646484, + "epoch": 0.00874519014542002, + "grad_norm": 0.045486003160476685, + "learning_rate": 3e-06, + "loss": 0.0346, + "reward": 0.5312500298023224, + "rewards/countdown_reward_func": 0.5312499850988388, + "step": 525 + }, + { + "epoch": 0.008761847650458914, + "grad_norm": 0.05375165119767189, + "learning_rate": 3e-06, + "loss": 0.0212, + "step": 526 + }, + { + "epoch": 0.00877850515549781, + "grad_norm": 0.04472886398434639, + "learning_rate": 3e-06, + "loss": 0.0342, + "step": 527 + }, + { + "epoch": 0.008795162660536706, + "grad_norm": 0.045810677111148834, + "learning_rate": 3e-06, + "loss": 0.021, + "step": 528 + }, + { + "completion_length": 80.92708587646484, + "epoch": 0.0088118201655756, + "grad_norm": 0.046965040266513824, + "learning_rate": 3e-06, + "loss": -0.0182, + "reward": 0.2541666626930237, + "rewards/countdown_reward_func": 0.2541666626930237, + "step": 529 + }, + { + "epoch": 0.008828477670614496, + "grad_norm": 0.05033240467309952, + "learning_rate": 3e-06, + "loss": 0.0171, + "step": 530 + }, + { + "epoch": 0.00884513517565339, + "grad_norm": 0.04705624282360077, + "learning_rate": 3e-06, + "loss": -0.0185, + "step": 531 + }, + { + "epoch": 0.008861792680692286, + "grad_norm": 0.05377742275595665, + "learning_rate": 3e-06, + "loss": 0.0171, + "step": 532 + }, + { + "completion_length": 81.92708587646484, + "epoch": 0.008878450185731182, + "grad_norm": 0.08490116149187088, + "learning_rate": 3e-06, + "loss": 0.0057, + "reward": 0.6604166626930237, + "rewards/countdown_reward_func": 0.6604166626930237, + "step": 533 + }, + { + "epoch": 0.008895107690770076, + "grad_norm": 0.06449475139379501, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 534 + }, + { + "epoch": 0.008911765195808972, + "grad_norm": 0.09165811538696289, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 535 + }, + { + "epoch": 0.008928422700847868, + "grad_norm": 0.06177179515361786, + "learning_rate": 3e-06, + "loss": -0.0167, + "step": 536 + }, + { + "completion_length": 82.23958587646484, + "epoch": 0.008945080205886762, + "grad_norm": 0.13394509255886078, + "learning_rate": 3e-06, + "loss": 0.0389, + "reward": 0.7166666984558105, + "rewards/countdown_reward_func": 0.7166666388511658, + "step": 537 + }, + { + "epoch": 0.008961737710925658, + "grad_norm": 0.10945811867713928, + "learning_rate": 3e-06, + "loss": -0.0173, + "step": 538 + }, + { + "epoch": 0.008978395215964554, + "grad_norm": 0.13525190949440002, + "learning_rate": 3e-06, + "loss": 0.038, + "step": 539 + }, + { + "epoch": 0.008995052721003448, + "grad_norm": 0.10777381807565689, + "learning_rate": 3e-06, + "loss": -0.0183, + "step": 540 + }, + { + "completion_length": 82.45833587646484, + "epoch": 0.009011710226042344, + "grad_norm": 0.06299254298210144, + "learning_rate": 3e-06, + "loss": -0.0098, + "reward": 0.502083346247673, + "rewards/countdown_reward_func": 0.5020833164453506, + "step": 541 + }, + { + "epoch": 0.009028367731081238, + "grad_norm": 0.07722142338752747, + "learning_rate": 3e-06, + "loss": 0.0366, + "step": 542 + }, + { + "epoch": 0.009045025236120134, + "grad_norm": 0.06091798469424248, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 543 + }, + { + "epoch": 0.00906168274115903, + "grad_norm": 0.07711268961429596, + "learning_rate": 3e-06, + "loss": 0.0363, + "step": 544 + }, + { + "completion_length": 81.42708587646484, + "epoch": 0.009078340246197924, + "grad_norm": 0.09180547297000885, + "learning_rate": 3e-06, + "loss": 0.0459, + "reward": 0.38854166865348816, + "rewards/countdown_reward_func": 0.38854165375232697, + "step": 545 + }, + { + "epoch": 0.00909499775123682, + "grad_norm": 0.0839102491736412, + "learning_rate": 3e-06, + "loss": 0.0369, + "step": 546 + }, + { + "epoch": 0.009111655256275716, + "grad_norm": 0.1056525781750679, + "learning_rate": 3e-06, + "loss": 0.045, + "step": 547 + }, + { + "epoch": 0.00912831276131461, + "grad_norm": 0.09000610560178757, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 548 + }, + { + "completion_length": 82.91667175292969, + "epoch": 0.009144970266353506, + "grad_norm": 0.08945036679506302, + "learning_rate": 3e-06, + "loss": 0.0099, + "reward": 0.427083358168602, + "rewards/countdown_reward_func": 0.4270833283662796, + "step": 549 + }, + { + "epoch": 0.009161627771392402, + "grad_norm": 0.08390450477600098, + "learning_rate": 3e-06, + "loss": -0.0236, + "step": 550 + }, + { + "epoch": 0.009178285276431296, + "grad_norm": 0.08963964879512787, + "learning_rate": 3e-06, + "loss": 0.0097, + "step": 551 + }, + { + "epoch": 0.009194942781470192, + "grad_norm": 0.08354327827692032, + "learning_rate": 3e-06, + "loss": -0.0238, + "step": 552 + }, + { + "completion_length": 83.43750381469727, + "epoch": 0.009211600286509086, + "grad_norm": 0.04936714842915535, + "learning_rate": 3e-06, + "loss": 0.013, + "reward": 0.4072916656732559, + "rewards/countdown_reward_func": 0.4072916507720947, + "step": 553 + }, + { + "epoch": 0.009228257791547982, + "grad_norm": 0.0474991649389267, + "learning_rate": 3e-06, + "loss": 0.0339, + "step": 554 + }, + { + "epoch": 0.009244915296586878, + "grad_norm": 0.04830361157655716, + "learning_rate": 3e-06, + "loss": 0.013, + "step": 555 + }, + { + "epoch": 0.009261572801625772, + "grad_norm": 0.05189661681652069, + "learning_rate": 3e-06, + "loss": 0.0337, + "step": 556 + }, + { + "completion_length": 83.47917175292969, + "epoch": 0.009278230306664668, + "grad_norm": 0.046872418373823166, + "learning_rate": 3e-06, + "loss": 0.0037, + "reward": 0.5291666686534882, + "rewards/countdown_reward_func": 0.5291666686534882, + "step": 557 + }, + { + "epoch": 0.009294887811703564, + "grad_norm": 0.05835714191198349, + "learning_rate": 3e-06, + "loss": 0.038, + "step": 558 + }, + { + "epoch": 0.009311545316742458, + "grad_norm": 0.04692190885543823, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 559 + }, + { + "epoch": 0.009328202821781354, + "grad_norm": 0.060306619852781296, + "learning_rate": 3e-06, + "loss": 0.0381, + "step": 560 + }, + { + "completion_length": 82.85416793823242, + "epoch": 0.00934486032682025, + "grad_norm": 0.04616887494921684, + "learning_rate": 3e-06, + "loss": 0.0061, + "reward": 0.5395833849906921, + "rewards/countdown_reward_func": 0.539583370089531, + "step": 561 + }, + { + "epoch": 0.009361517831859144, + "grad_norm": 0.04462489113211632, + "learning_rate": 3e-06, + "loss": 0.0258, + "step": 562 + }, + { + "epoch": 0.00937817533689804, + "grad_norm": 0.046142786741256714, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 563 + }, + { + "epoch": 0.009394832841936934, + "grad_norm": 0.04871681332588196, + "learning_rate": 3e-06, + "loss": 0.0259, + "step": 564 + }, + { + "completion_length": 83.23958587646484, + "epoch": 0.00941149034697583, + "grad_norm": 0.07824543118476868, + "learning_rate": 3e-06, + "loss": -0.0003, + "reward": 0.6333333551883698, + "rewards/countdown_reward_func": 0.6333333551883698, + "step": 565 + }, + { + "epoch": 0.009428147852014726, + "grad_norm": 0.05676697567105293, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 566 + }, + { + "epoch": 0.00944480535705362, + "grad_norm": 0.06804952770471573, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 567 + }, + { + "epoch": 0.009461462862092516, + "grad_norm": 0.056607119739055634, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 568 + }, + { + "completion_length": 82.70833587646484, + "epoch": 0.009478120367131412, + "grad_norm": 0.10670281201601028, + "learning_rate": 3e-06, + "loss": 0.0138, + "reward": 0.5677083432674408, + "rewards/countdown_reward_func": 0.5677083432674408, + "step": 569 + }, + { + "epoch": 0.009494777872170306, + "grad_norm": 0.05545264109969139, + "learning_rate": 3e-06, + "loss": 0.025, + "step": 570 + }, + { + "epoch": 0.009511435377209202, + "grad_norm": 0.05357765778899193, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 571 + }, + { + "epoch": 0.009528092882248098, + "grad_norm": 0.060004618018865585, + "learning_rate": 3e-06, + "loss": 0.0247, + "step": 572 + }, + { + "completion_length": 82.45833587646484, + "epoch": 0.009544750387286992, + "grad_norm": 0.0710037350654602, + "learning_rate": 3e-06, + "loss": -0.031, + "reward": 0.5114583671092987, + "rewards/countdown_reward_func": 0.5114583224058151, + "step": 573 + }, + { + "epoch": 0.009561407892325888, + "grad_norm": 0.0758698508143425, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 574 + }, + { + "epoch": 0.009578065397364782, + "grad_norm": 0.07385803759098053, + "learning_rate": 3e-06, + "loss": -0.0313, + "step": 575 + }, + { + "epoch": 0.009594722902403678, + "grad_norm": 0.07923060655593872, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 576 + }, + { + "completion_length": 82.29166793823242, + "epoch": 0.009611380407442574, + "grad_norm": 0.0728163793683052, + "learning_rate": 3e-06, + "loss": 0.0545, + "reward": 0.4177083373069763, + "rewards/countdown_reward_func": 0.4177083373069763, + "step": 577 + }, + { + "epoch": 0.009628037912481468, + "grad_norm": 0.08058478683233261, + "learning_rate": 3e-06, + "loss": -0.0144, + "step": 578 + }, + { + "epoch": 0.009644695417520364, + "grad_norm": 0.0700056329369545, + "learning_rate": 3e-06, + "loss": 0.0545, + "step": 579 + }, + { + "epoch": 0.00966135292255926, + "grad_norm": 0.07899457961320877, + "learning_rate": 3e-06, + "loss": -0.0147, + "step": 580 + }, + { + "completion_length": 83.20833587646484, + "epoch": 0.009678010427598154, + "grad_norm": 0.06491773575544357, + "learning_rate": 3e-06, + "loss": 0.0181, + "reward": 0.45625002682209015, + "rewards/countdown_reward_func": 0.45625001192092896, + "step": 581 + }, + { + "epoch": 0.00969466793263705, + "grad_norm": 0.12015333771705627, + "learning_rate": 3e-06, + "loss": 0.022, + "step": 582 + }, + { + "epoch": 0.009711325437675946, + "grad_norm": 0.06440982222557068, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 583 + }, + { + "epoch": 0.00972798294271484, + "grad_norm": 0.04709446057677269, + "learning_rate": 3e-06, + "loss": 0.0216, + "step": 584 + }, + { + "completion_length": 82.11458587646484, + "epoch": 0.009744640447753736, + "grad_norm": 0.06834648549556732, + "learning_rate": 3e-06, + "loss": -0.0081, + "reward": 0.567708358168602, + "rewards/countdown_reward_func": 0.5677083283662796, + "step": 585 + }, + { + "epoch": 0.00976129795279263, + "grad_norm": 0.07228496670722961, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 586 + }, + { + "epoch": 0.009777955457831526, + "grad_norm": 0.0658431425690651, + "learning_rate": 3e-06, + "loss": -0.0084, + "step": 587 + }, + { + "epoch": 0.009794612962870422, + "grad_norm": 0.07200872898101807, + "learning_rate": 3e-06, + "loss": -0.0168, + "step": 588 + }, + { + "completion_length": 82.02083587646484, + "epoch": 0.009811270467909316, + "grad_norm": 0.09475923329591751, + "learning_rate": 3e-06, + "loss": 0.0137, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.596875011920929, + "step": 589 + }, + { + "epoch": 0.009827927972948212, + "grad_norm": 0.06969800591468811, + "learning_rate": 3e-06, + "loss": 0.0393, + "step": 590 + }, + { + "epoch": 0.009844585477987108, + "grad_norm": 0.10244551301002502, + "learning_rate": 3e-06, + "loss": 0.0132, + "step": 591 + }, + { + "epoch": 0.009861242983026002, + "grad_norm": 0.0676097571849823, + "learning_rate": 3e-06, + "loss": 0.0392, + "step": 592 + }, + { + "completion_length": 80.89583587646484, + "epoch": 0.009877900488064898, + "grad_norm": 0.08070485293865204, + "learning_rate": 3e-06, + "loss": 0.0292, + "reward": 0.5291666984558105, + "rewards/countdown_reward_func": 0.5291666835546494, + "step": 593 + }, + { + "epoch": 0.009894557993103794, + "grad_norm": 0.0888860672712326, + "learning_rate": 3e-06, + "loss": 0.0232, + "step": 594 + }, + { + "epoch": 0.009911215498142688, + "grad_norm": 0.07038453966379166, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 595 + }, + { + "epoch": 0.009927873003181584, + "grad_norm": 0.0951337143778801, + "learning_rate": 3e-06, + "loss": 0.0229, + "step": 596 + }, + { + "completion_length": 81.48958587646484, + "epoch": 0.009944530508220478, + "grad_norm": 0.10328395664691925, + "learning_rate": 3e-06, + "loss": 0.0022, + "reward": 0.6114583909511566, + "rewards/countdown_reward_func": 0.6114583611488342, + "step": 597 + }, + { + "epoch": 0.009961188013259374, + "grad_norm": 0.08002610504627228, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 598 + }, + { + "epoch": 0.00997784551829827, + "grad_norm": 0.09106598049402237, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 599 + }, + { + "epoch": 0.009994503023337164, + "grad_norm": 0.07897195219993591, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 600 + }, + { + "completion_length": 80.85416793823242, + "epoch": 0.01001116052837606, + "grad_norm": 0.05939459428191185, + "learning_rate": 3e-06, + "loss": 0.0446, + "reward": 0.3250000327825546, + "rewards/countdown_reward_func": 0.3250000327825546, + "step": 601 + }, + { + "epoch": 0.010027818033414956, + "grad_norm": 0.04603337496519089, + "learning_rate": 3e-06, + "loss": 0.0258, + "step": 602 + }, + { + "epoch": 0.01004447553845385, + "grad_norm": 0.08744379878044128, + "learning_rate": 3e-06, + "loss": 0.0443, + "step": 603 + }, + { + "epoch": 0.010061133043492746, + "grad_norm": 0.04587329924106598, + "learning_rate": 3e-06, + "loss": 0.0258, + "step": 604 + }, + { + "completion_length": 80.55208587646484, + "epoch": 0.010077790548531642, + "grad_norm": 0.0878421813249588, + "learning_rate": 3e-06, + "loss": -0.0403, + "reward": 0.36145834624767303, + "rewards/countdown_reward_func": 0.36145834624767303, + "step": 605 + }, + { + "epoch": 0.010094448053570536, + "grad_norm": 0.08399204909801483, + "learning_rate": 3e-06, + "loss": 0.0589, + "step": 606 + }, + { + "epoch": 0.010111105558609432, + "grad_norm": 0.089258573949337, + "learning_rate": 3e-06, + "loss": -0.0406, + "step": 607 + }, + { + "epoch": 0.010127763063648326, + "grad_norm": 0.08361204713582993, + "learning_rate": 3e-06, + "loss": 0.0587, + "step": 608 + }, + { + "completion_length": 79.84375381469727, + "epoch": 0.010144420568687222, + "grad_norm": 0.07720425724983215, + "learning_rate": 3e-06, + "loss": 0.0053, + "reward": 0.5947916507720947, + "rewards/countdown_reward_func": 0.5947916507720947, + "step": 609 + }, + { + "epoch": 0.010161078073726118, + "grad_norm": 0.06592312455177307, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 610 + }, + { + "epoch": 0.010177735578765012, + "grad_norm": 0.07583089917898178, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 611 + }, + { + "epoch": 0.010194393083803908, + "grad_norm": 0.06720515340566635, + "learning_rate": 3e-06, + "loss": 0.0362, + "step": 612 + }, + { + "completion_length": 80.89583587646484, + "epoch": 0.010211050588842804, + "grad_norm": 0.0967940166592598, + "learning_rate": 3e-06, + "loss": -0.0033, + "reward": 0.45520836114883423, + "rewards/countdown_reward_func": 0.45520833134651184, + "step": 613 + }, + { + "epoch": 0.010227708093881698, + "grad_norm": 0.13421790301799774, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 614 + }, + { + "epoch": 0.010244365598920594, + "grad_norm": 0.07963573187589645, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 615 + }, + { + "epoch": 0.01026102310395949, + "grad_norm": 0.17263776063919067, + "learning_rate": 3e-06, + "loss": 0.0196, + "step": 616 + }, + { + "completion_length": 80.61458587646484, + "epoch": 0.010277680608998384, + "grad_norm": 0.11948724836111069, + "learning_rate": 3e-06, + "loss": 0.0026, + "reward": 0.36250002682209015, + "rewards/countdown_reward_func": 0.36250000447034836, + "step": 617 + }, + { + "epoch": 0.01029433811403728, + "grad_norm": 0.06251583993434906, + "learning_rate": 3e-06, + "loss": -0.0182, + "step": 618 + }, + { + "epoch": 0.010310995619076176, + "grad_norm": 0.08977127075195312, + "learning_rate": 3e-06, + "loss": 0.0017, + "step": 619 + }, + { + "epoch": 0.01032765312411507, + "grad_norm": 0.0661797970533371, + "learning_rate": 3e-06, + "loss": -0.0188, + "step": 620 + }, + { + "completion_length": 80.46875381469727, + "epoch": 0.010344310629153966, + "grad_norm": 0.16933809220790863, + "learning_rate": 3e-06, + "loss": -0.0005, + "reward": 0.6156250536441803, + "rewards/countdown_reward_func": 0.6156250238418579, + "step": 621 + }, + { + "epoch": 0.01036096813419286, + "grad_norm": 0.07308223843574524, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 622 + }, + { + "epoch": 0.010377625639231756, + "grad_norm": 0.06293918937444687, + "learning_rate": 3e-06, + "loss": -0.0003, + "step": 623 + }, + { + "epoch": 0.010394283144270652, + "grad_norm": 0.06461838632822037, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 624 + }, + { + "completion_length": 81.26042175292969, + "epoch": 0.010410940649309546, + "grad_norm": 0.07721193879842758, + "learning_rate": 3e-06, + "loss": 0.0243, + "reward": 0.5197917073965073, + "rewards/countdown_reward_func": 0.5197916775941849, + "step": 625 + }, + { + "epoch": 0.010427598154348442, + "grad_norm": 0.07750426977872849, + "learning_rate": 3e-06, + "loss": 0.0177, + "step": 626 + }, + { + "epoch": 0.010444255659387338, + "grad_norm": 0.08096791803836823, + "learning_rate": 3e-06, + "loss": 0.024, + "step": 627 + }, + { + "epoch": 0.010460913164426232, + "grad_norm": 0.06906022131443024, + "learning_rate": 3e-06, + "loss": 0.0174, + "step": 628 + }, + { + "completion_length": 81.23958587646484, + "epoch": 0.010477570669465128, + "grad_norm": 0.056105613708496094, + "learning_rate": 3e-06, + "loss": 0.0618, + "reward": 0.5687500238418579, + "rewards/countdown_reward_func": 0.5687500089406967, + "step": 629 + }, + { + "epoch": 0.010494228174504024, + "grad_norm": 0.05577811226248741, + "learning_rate": 3e-06, + "loss": 0.0267, + "step": 630 + }, + { + "epoch": 0.010510885679542918, + "grad_norm": 0.05937442556023598, + "learning_rate": 3e-06, + "loss": 0.0615, + "step": 631 + }, + { + "epoch": 0.010527543184581814, + "grad_norm": 0.06506016105413437, + "learning_rate": 3e-06, + "loss": 0.0266, + "step": 632 + }, + { + "completion_length": 81.38541793823242, + "epoch": 0.010544200689620708, + "grad_norm": 0.08698436617851257, + "learning_rate": 3e-06, + "loss": 0.0264, + "reward": 0.5572916865348816, + "rewards/countdown_reward_func": 0.5572916567325592, + "step": 633 + }, + { + "epoch": 0.010560858194659604, + "grad_norm": 0.09259279817342758, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 634 + }, + { + "epoch": 0.0105775156996985, + "grad_norm": 0.11567766964435577, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 635 + }, + { + "epoch": 0.010594173204737394, + "grad_norm": 0.08870053291320801, + "learning_rate": 3e-06, + "loss": -0.0017, + "step": 636 + }, + { + "completion_length": 81.31250381469727, + "epoch": 0.01061083070977629, + "grad_norm": 0.061698123812675476, + "learning_rate": 3e-06, + "loss": 0.0355, + "reward": 0.503125011920929, + "rewards/countdown_reward_func": 0.5031249970197678, + "step": 637 + }, + { + "epoch": 0.010627488214815186, + "grad_norm": 0.06971965730190277, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 638 + }, + { + "epoch": 0.01064414571985408, + "grad_norm": 0.05989031121134758, + "learning_rate": 3e-06, + "loss": 0.0354, + "step": 639 + }, + { + "epoch": 0.010660803224892976, + "grad_norm": 0.0777720957994461, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 640 + }, + { + "completion_length": 81.08333587646484, + "epoch": 0.010677460729931872, + "grad_norm": 0.14573520421981812, + "learning_rate": 3e-06, + "loss": -0.0451, + "reward": 0.49375002086162567, + "rewards/countdown_reward_func": 0.4937499910593033, + "step": 641 + }, + { + "epoch": 0.010694118234970766, + "grad_norm": 0.14523252844810486, + "learning_rate": 3e-06, + "loss": -0.0457, + "step": 642 + }, + { + "epoch": 0.010710775740009662, + "grad_norm": 0.1476719230413437, + "learning_rate": 3e-06, + "loss": -0.0461, + "step": 643 + }, + { + "epoch": 0.010727433245048556, + "grad_norm": 0.14897885918617249, + "learning_rate": 3e-06, + "loss": -0.0466, + "step": 644 + }, + { + "completion_length": 80.59375381469727, + "epoch": 0.010744090750087452, + "grad_norm": 0.0748654454946518, + "learning_rate": 3e-06, + "loss": 0.0148, + "reward": 0.6052083671092987, + "rewards/countdown_reward_func": 0.6052083671092987, + "step": 645 + }, + { + "epoch": 0.010760748255126348, + "grad_norm": 0.11252734810113907, + "learning_rate": 3e-06, + "loss": 0.0485, + "step": 646 + }, + { + "epoch": 0.010777405760165242, + "grad_norm": 0.07342631369829178, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 647 + }, + { + "epoch": 0.010794063265204138, + "grad_norm": 0.09865226596593857, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 648 + }, + { + "completion_length": 79.66666793823242, + "epoch": 0.010810720770243034, + "grad_norm": 0.08172736316919327, + "learning_rate": 3e-06, + "loss": 0.0147, + "reward": 0.4375000149011612, + "rewards/countdown_reward_func": 0.4375, + "step": 649 + }, + { + "epoch": 0.010827378275281928, + "grad_norm": 0.05033933371305466, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 650 + }, + { + "epoch": 0.010844035780320824, + "grad_norm": 0.07038245350122452, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 651 + }, + { + "epoch": 0.01086069328535972, + "grad_norm": 0.0483170785009861, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 652 + }, + { + "completion_length": 80.11458587646484, + "epoch": 0.010877350790398614, + "grad_norm": 0.10530553758144379, + "learning_rate": 3e-06, + "loss": 0.0528, + "reward": 0.643750011920929, + "rewards/countdown_reward_func": 0.643750011920929, + "step": 653 + }, + { + "epoch": 0.01089400829543751, + "grad_norm": 0.08281281590461731, + "learning_rate": 3e-06, + "loss": 0.0143, + "step": 654 + }, + { + "epoch": 0.010910665800476404, + "grad_norm": 0.10374639183282852, + "learning_rate": 3e-06, + "loss": 0.0522, + "step": 655 + }, + { + "epoch": 0.0109273233055153, + "grad_norm": 0.09769141674041748, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 656 + }, + { + "completion_length": 78.95833587646484, + "epoch": 0.010943980810554196, + "grad_norm": 0.07695486396551132, + "learning_rate": 3e-06, + "loss": -0.0333, + "reward": 0.491666704416275, + "rewards/countdown_reward_func": 0.49166665971279144, + "step": 657 + }, + { + "epoch": 0.01096063831559309, + "grad_norm": 0.08287905901670456, + "learning_rate": 3e-06, + "loss": -0.0385, + "step": 658 + }, + { + "epoch": 0.010977295820631986, + "grad_norm": 0.07985985279083252, + "learning_rate": 3e-06, + "loss": -0.0339, + "step": 659 + }, + { + "epoch": 0.010993953325670882, + "grad_norm": 0.08193966001272202, + "learning_rate": 3e-06, + "loss": -0.0388, + "step": 660 + }, + { + "completion_length": 78.46875381469727, + "epoch": 0.011010610830709776, + "grad_norm": 0.07884252816438675, + "learning_rate": 3e-06, + "loss": 0.0111, + "reward": 0.6500000357627869, + "rewards/countdown_reward_func": 0.6500000357627869, + "step": 661 + }, + { + "epoch": 0.011027268335748672, + "grad_norm": 0.10381687432527542, + "learning_rate": 3e-06, + "loss": -0.0175, + "step": 662 + }, + { + "epoch": 0.011043925840787568, + "grad_norm": 0.07030217349529266, + "learning_rate": 3e-06, + "loss": 0.0105, + "step": 663 + }, + { + "epoch": 0.011060583345826462, + "grad_norm": 0.10452256351709366, + "learning_rate": 3e-06, + "loss": -0.0175, + "step": 664 + }, + { + "completion_length": 78.48958587646484, + "epoch": 0.011077240850865358, + "grad_norm": 0.04517979919910431, + "learning_rate": 3e-06, + "loss": 0.0263, + "reward": 0.44583335518836975, + "rewards/countdown_reward_func": 0.44583334028720856, + "step": 665 + }, + { + "epoch": 0.011093898355904252, + "grad_norm": 0.05246163159608841, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 666 + }, + { + "epoch": 0.011110555860943148, + "grad_norm": 0.04906832426786423, + "learning_rate": 3e-06, + "loss": 0.0261, + "step": 667 + }, + { + "epoch": 0.011127213365982044, + "grad_norm": 0.05243990942835808, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 668 + }, + { + "completion_length": 77.66667175292969, + "epoch": 0.011143870871020938, + "grad_norm": 0.05620981380343437, + "learning_rate": 3e-06, + "loss": 0.0039, + "reward": 0.24062500149011612, + "rewards/countdown_reward_func": 0.24062500149011612, + "step": 669 + }, + { + "epoch": 0.011160528376059834, + "grad_norm": 0.06384444981813431, + "learning_rate": 3e-06, + "loss": 0.0264, + "step": 670 + }, + { + "epoch": 0.01117718588109873, + "grad_norm": 0.05502112954854965, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 671 + }, + { + "epoch": 0.011193843386137624, + "grad_norm": 0.06419157981872559, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 672 + }, + { + "completion_length": 78.36458587646484, + "epoch": 0.01121050089117652, + "grad_norm": 0.09978973865509033, + "learning_rate": 3e-06, + "loss": 0.0091, + "reward": 0.5302083790302277, + "rewards/countdown_reward_func": 0.5302083492279053, + "step": 673 + }, + { + "epoch": 0.011227158396215416, + "grad_norm": 0.09139817953109741, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 674 + }, + { + "epoch": 0.01124381590125431, + "grad_norm": 0.08964921534061432, + "learning_rate": 3e-06, + "loss": 0.0086, + "step": 675 + }, + { + "epoch": 0.011260473406293206, + "grad_norm": 0.09452591836452484, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 676 + }, + { + "completion_length": 78.52083587646484, + "epoch": 0.0112771309113321, + "grad_norm": 0.04879109561443329, + "learning_rate": 3e-06, + "loss": 0.0249, + "reward": 0.40937499701976776, + "rewards/countdown_reward_func": 0.40937499701976776, + "step": 677 + }, + { + "epoch": 0.011293788416370996, + "grad_norm": 0.0655544251203537, + "learning_rate": 3e-06, + "loss": -0.0145, + "step": 678 + }, + { + "epoch": 0.011310445921409892, + "grad_norm": 0.04994789510965347, + "learning_rate": 3e-06, + "loss": 0.0247, + "step": 679 + }, + { + "epoch": 0.011327103426448786, + "grad_norm": 0.06290163099765778, + "learning_rate": 3e-06, + "loss": -0.0147, + "step": 680 + }, + { + "completion_length": 78.98958587646484, + "epoch": 0.011343760931487682, + "grad_norm": 0.08619464933872223, + "learning_rate": 3e-06, + "loss": -0.0085, + "reward": 0.41875001788139343, + "rewards/countdown_reward_func": 0.41875000298023224, + "step": 681 + }, + { + "epoch": 0.011360418436526578, + "grad_norm": 0.03602594882249832, + "learning_rate": 3e-06, + "loss": 0.0159, + "step": 682 + }, + { + "epoch": 0.011377075941565472, + "grad_norm": 0.1828061044216156, + "learning_rate": 3e-06, + "loss": -0.0086, + "step": 683 + }, + { + "epoch": 0.011393733446604368, + "grad_norm": 0.03523556888103485, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 684 + }, + { + "completion_length": 78.75, + "epoch": 0.011410390951643264, + "grad_norm": 0.09288085997104645, + "learning_rate": 3e-06, + "loss": 0.0164, + "reward": 0.5218750238418579, + "rewards/countdown_reward_func": 0.5218750089406967, + "step": 685 + }, + { + "epoch": 0.011427048456682158, + "grad_norm": 0.07052195072174072, + "learning_rate": 3e-06, + "loss": 0.0103, + "step": 686 + }, + { + "epoch": 0.011443705961721054, + "grad_norm": 0.09347330778837204, + "learning_rate": 3e-06, + "loss": 0.0161, + "step": 687 + }, + { + "epoch": 0.011460363466759948, + "grad_norm": 0.06872225552797318, + "learning_rate": 3e-06, + "loss": 0.0101, + "step": 688 + }, + { + "completion_length": 77.65625, + "epoch": 0.011477020971798844, + "grad_norm": 0.07362423092126846, + "learning_rate": 3e-06, + "loss": 0.0146, + "reward": 0.585416704416275, + "rewards/countdown_reward_func": 0.5854166746139526, + "step": 689 + }, + { + "epoch": 0.01149367847683774, + "grad_norm": 0.058942295610904694, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 690 + }, + { + "epoch": 0.011510335981876634, + "grad_norm": 0.07810818403959274, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 691 + }, + { + "epoch": 0.01152699348691553, + "grad_norm": 0.06231189891695976, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 692 + }, + { + "completion_length": 78.27083587646484, + "epoch": 0.011543650991954426, + "grad_norm": 0.0465918630361557, + "learning_rate": 3e-06, + "loss": 0.0003, + "reward": 0.4083333760499954, + "rewards/countdown_reward_func": 0.40833336114883423, + "step": 693 + }, + { + "epoch": 0.01156030849699332, + "grad_norm": 0.05846988782286644, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 694 + }, + { + "epoch": 0.011576966002032216, + "grad_norm": 0.05115668103098869, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 695 + }, + { + "epoch": 0.011593623507071112, + "grad_norm": 0.05303172394633293, + "learning_rate": 3e-06, + "loss": 0.0207, + "step": 696 + }, + { + "completion_length": 78.07291793823242, + "epoch": 0.011610281012110006, + "grad_norm": 0.04062580689787865, + "learning_rate": 3e-06, + "loss": 0.0224, + "reward": 0.53125, + "rewards/countdown_reward_func": 0.53125, + "step": 697 + }, + { + "epoch": 0.011626938517148902, + "grad_norm": 0.025238094851374626, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 698 + }, + { + "epoch": 0.011643596022187796, + "grad_norm": 0.03677640110254288, + "learning_rate": 3e-06, + "loss": 0.0224, + "step": 699 + }, + { + "epoch": 0.011660253527226692, + "grad_norm": 0.025632807984948158, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 700 + }, + { + "completion_length": 78.84375, + "epoch": 0.011676911032265588, + "grad_norm": 0.11586595326662064, + "learning_rate": 3e-06, + "loss": 0.0099, + "reward": 0.5197916775941849, + "rewards/countdown_reward_func": 0.5197916775941849, + "step": 701 + }, + { + "epoch": 0.011693568537304482, + "grad_norm": 0.0814061239361763, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 702 + }, + { + "epoch": 0.011710226042343378, + "grad_norm": 0.10785569995641708, + "learning_rate": 3e-06, + "loss": 0.0095, + "step": 703 + }, + { + "epoch": 0.011726883547382274, + "grad_norm": 0.07799053937196732, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 704 + }, + { + "completion_length": 78.76041793823242, + "epoch": 0.011743541052421168, + "grad_norm": 0.05988309904932976, + "learning_rate": 3e-06, + "loss": 0.001, + "reward": 0.35312502086162567, + "rewards/countdown_reward_func": 0.3531250059604645, + "step": 705 + }, + { + "epoch": 0.011760198557460064, + "grad_norm": 0.05371326580643654, + "learning_rate": 3e-06, + "loss": 0.0275, + "step": 706 + }, + { + "epoch": 0.01177685606249896, + "grad_norm": 0.06082794442772865, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 707 + }, + { + "epoch": 0.011793513567537854, + "grad_norm": 0.04692936688661575, + "learning_rate": 3e-06, + "loss": 0.0274, + "step": 708 + }, + { + "completion_length": 78.93750381469727, + "epoch": 0.01181017107257675, + "grad_norm": 0.06282486766576767, + "learning_rate": 3e-06, + "loss": 0.0139, + "reward": 0.5406250208616257, + "rewards/countdown_reward_func": 0.5406250208616257, + "step": 709 + }, + { + "epoch": 0.011826828577615646, + "grad_norm": 0.043248213827610016, + "learning_rate": 3e-06, + "loss": 0.0113, + "step": 710 + }, + { + "epoch": 0.01184348608265454, + "grad_norm": 0.06568653881549835, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 711 + }, + { + "epoch": 0.011860143587693436, + "grad_norm": 0.043849412351846695, + "learning_rate": 3e-06, + "loss": 0.0112, + "step": 712 + }, + { + "completion_length": 78.73958587646484, + "epoch": 0.01187680109273233, + "grad_norm": 0.07370666414499283, + "learning_rate": 3e-06, + "loss": 0.0015, + "reward": 0.7093749940395355, + "rewards/countdown_reward_func": 0.7093749940395355, + "step": 713 + }, + { + "epoch": 0.011893458597771226, + "grad_norm": 0.0847863256931305, + "learning_rate": 3e-06, + "loss": 0.0205, + "step": 714 + }, + { + "epoch": 0.011910116102810122, + "grad_norm": 0.11252886056900024, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 715 + }, + { + "epoch": 0.011926773607849016, + "grad_norm": 0.09333963692188263, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 716 + }, + { + "completion_length": 78.09375381469727, + "epoch": 0.011943431112887912, + "grad_norm": 0.2915778160095215, + "learning_rate": 3e-06, + "loss": 0.0062, + "reward": 0.40000003576278687, + "rewards/countdown_reward_func": 0.4000000059604645, + "step": 717 + }, + { + "epoch": 0.011960088617926808, + "grad_norm": 0.07282423973083496, + "learning_rate": 3e-06, + "loss": -0.0145, + "step": 718 + }, + { + "epoch": 0.011976746122965702, + "grad_norm": 0.0652887299656868, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 719 + }, + { + "epoch": 0.011993403628004598, + "grad_norm": 0.07212091237306595, + "learning_rate": 3e-06, + "loss": -0.0146, + "step": 720 + }, + { + "completion_length": 79.69791793823242, + "epoch": 0.012010061133043494, + "grad_norm": 0.11293821036815643, + "learning_rate": 3e-06, + "loss": 0.0016, + "reward": 0.6156250536441803, + "rewards/countdown_reward_func": 0.6156250089406967, + "step": 721 + }, + { + "epoch": 0.012026718638082388, + "grad_norm": 0.06647317856550217, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 722 + }, + { + "epoch": 0.012043376143121284, + "grad_norm": 0.10243447870016098, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 723 + }, + { + "epoch": 0.012060033648160178, + "grad_norm": 0.06480325758457184, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 724 + }, + { + "completion_length": 79.55208587646484, + "epoch": 0.012076691153199074, + "grad_norm": 0.05560755729675293, + "learning_rate": 3e-06, + "loss": 0.0315, + "reward": 0.484375, + "rewards/countdown_reward_func": 0.484375, + "step": 725 + }, + { + "epoch": 0.01209334865823797, + "grad_norm": 0.09164172410964966, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 726 + }, + { + "epoch": 0.012110006163276864, + "grad_norm": 0.05865085870027542, + "learning_rate": 3e-06, + "loss": 0.0311, + "step": 727 + }, + { + "epoch": 0.01212666366831576, + "grad_norm": 0.10650106519460678, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 728 + }, + { + "completion_length": 79.85416793823242, + "epoch": 0.012143321173354656, + "grad_norm": 0.07895907759666443, + "learning_rate": 3e-06, + "loss": 0.018, + "reward": 0.5406250059604645, + "rewards/countdown_reward_func": 0.5406250059604645, + "step": 729 + }, + { + "epoch": 0.01215997867839355, + "grad_norm": 0.07635629922151566, + "learning_rate": 3e-06, + "loss": 0.0233, + "step": 730 + }, + { + "epoch": 0.012176636183432446, + "grad_norm": 0.07198678702116013, + "learning_rate": 3e-06, + "loss": 0.0178, + "step": 731 + }, + { + "epoch": 0.012193293688471342, + "grad_norm": 0.06083283573389053, + "learning_rate": 3e-06, + "loss": 0.023, + "step": 732 + }, + { + "completion_length": 79.63541793823242, + "epoch": 0.012209951193510236, + "grad_norm": 0.09881098568439484, + "learning_rate": 3e-06, + "loss": 0.011, + "reward": 0.4750000238418579, + "rewards/countdown_reward_func": 0.4750000089406967, + "step": 733 + }, + { + "epoch": 0.012226608698549132, + "grad_norm": 0.09897299110889435, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 734 + }, + { + "epoch": 0.012243266203588026, + "grad_norm": 0.08897686004638672, + "learning_rate": 3e-06, + "loss": 0.0104, + "step": 735 + }, + { + "epoch": 0.012259923708626922, + "grad_norm": 0.08943609893321991, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 736 + }, + { + "completion_length": 80.36458587646484, + "epoch": 0.012276581213665818, + "grad_norm": 0.09426072984933853, + "learning_rate": 3e-06, + "loss": 0.0316, + "reward": 0.5406250208616257, + "rewards/countdown_reward_func": 0.5406250208616257, + "step": 737 + }, + { + "epoch": 0.012293238718704712, + "grad_norm": 0.0849115401506424, + "learning_rate": 3e-06, + "loss": 0.0527, + "step": 738 + }, + { + "epoch": 0.012309896223743608, + "grad_norm": 0.0926174446940422, + "learning_rate": 3e-06, + "loss": 0.0315, + "step": 739 + }, + { + "epoch": 0.012326553728782504, + "grad_norm": 0.08799073845148087, + "learning_rate": 3e-06, + "loss": 0.052, + "step": 740 + }, + { + "completion_length": 80.90625, + "epoch": 0.012343211233821398, + "grad_norm": 0.053437020629644394, + "learning_rate": 3e-06, + "loss": 0.0038, + "reward": 0.5781250298023224, + "rewards/countdown_reward_func": 0.578125, + "step": 741 + }, + { + "epoch": 0.012359868738860294, + "grad_norm": 0.07904201000928879, + "learning_rate": 3e-06, + "loss": 0.0442, + "step": 742 + }, + { + "epoch": 0.01237652624389919, + "grad_norm": 0.0524018332362175, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 743 + }, + { + "epoch": 0.012393183748938084, + "grad_norm": 0.07826267927885056, + "learning_rate": 3e-06, + "loss": 0.044, + "step": 744 + }, + { + "completion_length": 80.76041793823242, + "epoch": 0.01240984125397698, + "grad_norm": 0.11324140429496765, + "learning_rate": 3e-06, + "loss": 0.0296, + "reward": 0.5781250596046448, + "rewards/countdown_reward_func": 0.5781250298023224, + "step": 745 + }, + { + "epoch": 0.012426498759015874, + "grad_norm": 0.08553794771432877, + "learning_rate": 3e-06, + "loss": 0.0249, + "step": 746 + }, + { + "epoch": 0.01244315626405477, + "grad_norm": 0.10348138958215714, + "learning_rate": 3e-06, + "loss": 0.0287, + "step": 747 + }, + { + "epoch": 0.012459813769093666, + "grad_norm": 0.08831793069839478, + "learning_rate": 3e-06, + "loss": 0.0241, + "step": 748 + }, + { + "completion_length": 80.90625, + "epoch": 0.01247647127413256, + "grad_norm": 0.06809167563915253, + "learning_rate": 3e-06, + "loss": 0.0207, + "reward": 0.5687500238418579, + "rewards/countdown_reward_func": 0.5687500238418579, + "step": 749 + }, + { + "epoch": 0.012493128779171456, + "grad_norm": 0.04747375473380089, + "learning_rate": 3e-06, + "loss": 0.0119, + "step": 750 + }, + { + "epoch": 0.012509786284210352, + "grad_norm": 0.06507973372936249, + "learning_rate": 3e-06, + "loss": 0.0204, + "step": 751 + }, + { + "epoch": 0.012526443789249246, + "grad_norm": 0.06337691098451614, + "learning_rate": 3e-06, + "loss": 0.0117, + "step": 752 + }, + { + "completion_length": 80.09375, + "epoch": 0.012543101294288142, + "grad_norm": 0.09242469817399979, + "learning_rate": 3e-06, + "loss": -0.0057, + "reward": 0.6708333790302277, + "rewards/countdown_reward_func": 0.6708333492279053, + "step": 753 + }, + { + "epoch": 0.012559758799327038, + "grad_norm": 0.05312391370534897, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 754 + }, + { + "epoch": 0.012576416304365932, + "grad_norm": 0.11846458166837692, + "learning_rate": 3e-06, + "loss": -0.0064, + "step": 755 + }, + { + "epoch": 0.012593073809404828, + "grad_norm": 0.04948701336979866, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 756 + }, + { + "completion_length": 80.02083587646484, + "epoch": 0.012609731314443722, + "grad_norm": 0.030748212710022926, + "learning_rate": 3e-06, + "loss": -0.0092, + "reward": 0.46562498807907104, + "rewards/countdown_reward_func": 0.46562497317790985, + "step": 757 + }, + { + "epoch": 0.012626388819482618, + "grad_norm": 0.02479449287056923, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 758 + }, + { + "epoch": 0.012643046324521514, + "grad_norm": 0.035349972546100616, + "learning_rate": 3e-06, + "loss": -0.0092, + "step": 759 + }, + { + "epoch": 0.012659703829560408, + "grad_norm": 0.024609258398413658, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 760 + }, + { + "completion_length": 79.30208587646484, + "epoch": 0.012676361334599304, + "grad_norm": 0.04653378948569298, + "learning_rate": 3e-06, + "loss": 0.0067, + "reward": 0.5593750178813934, + "rewards/countdown_reward_func": 0.5593750178813934, + "step": 761 + }, + { + "epoch": 0.0126930188396382, + "grad_norm": 0.03835076838731766, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 762 + }, + { + "epoch": 0.012709676344677094, + "grad_norm": 0.04435642063617706, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 763 + }, + { + "epoch": 0.01272633384971599, + "grad_norm": 0.036747898906469345, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 764 + }, + { + "completion_length": 80.44791793823242, + "epoch": 0.012742991354754886, + "grad_norm": 0.07825355976819992, + "learning_rate": 3e-06, + "loss": -0.0151, + "reward": 0.6156250238418579, + "rewards/countdown_reward_func": 0.6156249940395355, + "step": 765 + }, + { + "epoch": 0.01275964885979378, + "grad_norm": 0.079051174223423, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 766 + }, + { + "epoch": 0.012776306364832676, + "grad_norm": 0.09507065266370773, + "learning_rate": 3e-06, + "loss": -0.0156, + "step": 767 + }, + { + "epoch": 0.01279296386987157, + "grad_norm": 0.07938767224550247, + "learning_rate": 3e-06, + "loss": 0.0134, + "step": 768 + }, + { + "completion_length": 79.41667175292969, + "epoch": 0.012809621374910466, + "grad_norm": 0.0698011964559555, + "learning_rate": 3e-06, + "loss": 0.0136, + "reward": 0.5593750327825546, + "rewards/countdown_reward_func": 0.5593750029802322, + "step": 769 + }, + { + "epoch": 0.012826278879949362, + "grad_norm": 0.06635377556085587, + "learning_rate": 3e-06, + "loss": 0.0214, + "step": 770 + }, + { + "epoch": 0.012842936384988256, + "grad_norm": 0.07067494094371796, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 771 + }, + { + "epoch": 0.012859593890027152, + "grad_norm": 0.05914042517542839, + "learning_rate": 3e-06, + "loss": 0.0213, + "step": 772 + }, + { + "completion_length": 79.27083587646484, + "epoch": 0.012876251395066048, + "grad_norm": 0.04239983856678009, + "learning_rate": 3e-06, + "loss": 0.0189, + "reward": 0.4843750298023224, + "rewards/countdown_reward_func": 0.4843749850988388, + "step": 773 + }, + { + "epoch": 0.012892908900104942, + "grad_norm": 0.05679262802004814, + "learning_rate": 3e-06, + "loss": 0.0219, + "step": 774 + }, + { + "epoch": 0.012909566405143838, + "grad_norm": 0.04409712925553322, + "learning_rate": 3e-06, + "loss": 0.0189, + "step": 775 + }, + { + "epoch": 0.012926223910182734, + "grad_norm": 0.06001592054963112, + "learning_rate": 3e-06, + "loss": 0.0216, + "step": 776 + }, + { + "completion_length": 78.66667175292969, + "epoch": 0.012942881415221628, + "grad_norm": 0.059472233057022095, + "learning_rate": 3e-06, + "loss": -0.0176, + "reward": 0.4375, + "rewards/countdown_reward_func": 0.4375, + "step": 777 + }, + { + "epoch": 0.012959538920260524, + "grad_norm": 0.07600384205579758, + "learning_rate": 3e-06, + "loss": -0.0181, + "step": 778 + }, + { + "epoch": 0.012976196425299418, + "grad_norm": 0.060126226395368576, + "learning_rate": 3e-06, + "loss": -0.0178, + "step": 779 + }, + { + "epoch": 0.012992853930338314, + "grad_norm": 0.06755590438842773, + "learning_rate": 3e-06, + "loss": -0.0183, + "step": 780 + }, + { + "completion_length": 81.11458587646484, + "epoch": 0.01300951143537721, + "grad_norm": 0.03604620322585106, + "learning_rate": 3e-06, + "loss": 0.0063, + "reward": 0.6708333194255829, + "rewards/countdown_reward_func": 0.6708333194255829, + "step": 781 + }, + { + "epoch": 0.013026168940416104, + "grad_norm": 0.042886558920145035, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 782 + }, + { + "epoch": 0.013042826445455, + "grad_norm": 0.03288163244724274, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 783 + }, + { + "epoch": 0.013059483950493896, + "grad_norm": 0.04485667869448662, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 784 + }, + { + "completion_length": 78.45833587646484, + "epoch": 0.01307614145553279, + "grad_norm": 0.20523227751255035, + "learning_rate": 3e-06, + "loss": -0.0014, + "reward": 0.40000002086162567, + "rewards/countdown_reward_func": 0.3999999910593033, + "step": 785 + }, + { + "epoch": 0.013092798960571686, + "grad_norm": 0.07569430768489838, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 786 + }, + { + "epoch": 0.013109456465610582, + "grad_norm": 0.10879930108785629, + "learning_rate": 3e-06, + "loss": -0.0015, + "step": 787 + }, + { + "epoch": 0.013126113970649476, + "grad_norm": 0.06688430160284042, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 788 + }, + { + "completion_length": 78.86458587646484, + "epoch": 0.013142771475688372, + "grad_norm": 0.01620897464454174, + "learning_rate": 3e-06, + "loss": 0.007, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 789 + }, + { + "epoch": 0.013159428980727266, + "grad_norm": 0.0457727387547493, + "learning_rate": 3e-06, + "loss": 0.0064, + "step": 790 + }, + { + "epoch": 0.013176086485766162, + "grad_norm": 0.01622067578136921, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 791 + }, + { + "epoch": 0.013192743990805058, + "grad_norm": 0.046804651618003845, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 792 + }, + { + "completion_length": 78.43750381469727, + "epoch": 0.013209401495843952, + "grad_norm": 0.07593139261007309, + "learning_rate": 3e-06, + "loss": -0.0167, + "reward": 0.5499999970197678, + "rewards/countdown_reward_func": 0.5499999970197678, + "step": 793 + }, + { + "epoch": 0.013226059000882848, + "grad_norm": 0.04331118240952492, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 794 + }, + { + "epoch": 0.013242716505921744, + "grad_norm": 0.0735052227973938, + "learning_rate": 3e-06, + "loss": -0.0168, + "step": 795 + }, + { + "epoch": 0.013259374010960638, + "grad_norm": 0.03814023360610008, + "learning_rate": 3e-06, + "loss": 0.0127, + "step": 796 + }, + { + "completion_length": 78.33333587646484, + "epoch": 0.013276031515999534, + "grad_norm": 0.10209158807992935, + "learning_rate": 3e-06, + "loss": 0.0061, + "reward": 0.4281250089406967, + "rewards/countdown_reward_func": 0.4281250089406967, + "step": 797 + }, + { + "epoch": 0.01329268902103843, + "grad_norm": 0.08039453625679016, + "learning_rate": 3e-06, + "loss": -0.0158, + "step": 798 + }, + { + "epoch": 0.013309346526077324, + "grad_norm": 0.07530573010444641, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 799 + }, + { + "epoch": 0.01332600403111622, + "grad_norm": 0.06310764700174332, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 800 + }, + { + "completion_length": 78.75000381469727, + "epoch": 0.013342661536155116, + "grad_norm": 0.06366696208715439, + "learning_rate": 3e-06, + "loss": 0.0178, + "reward": 0.5593750327825546, + "rewards/countdown_reward_func": 0.5593750029802322, + "step": 801 + }, + { + "epoch": 0.01335931904119401, + "grad_norm": 0.04627365246415138, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 802 + }, + { + "epoch": 0.013375976546232906, + "grad_norm": 0.05828734487295151, + "learning_rate": 3e-06, + "loss": 0.0179, + "step": 803 + }, + { + "epoch": 0.0133926340512718, + "grad_norm": 0.04469382017850876, + "learning_rate": 3e-06, + "loss": 0.0209, + "step": 804 + }, + { + "completion_length": 78.29166793823242, + "epoch": 0.013409291556310696, + "grad_norm": 0.08634136617183685, + "learning_rate": 3e-06, + "loss": -0.0059, + "reward": 0.7187500298023224, + "rewards/countdown_reward_func": 0.71875, + "step": 805 + }, + { + "epoch": 0.013425949061349592, + "grad_norm": 0.07501038908958435, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 806 + }, + { + "epoch": 0.013442606566388486, + "grad_norm": 0.09129597246646881, + "learning_rate": 3e-06, + "loss": -0.0065, + "step": 807 + }, + { + "epoch": 0.013459264071427382, + "grad_norm": 0.07218378037214279, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 808 + }, + { + "completion_length": 79.27083587646484, + "epoch": 0.013475921576466278, + "grad_norm": 0.0433824248611927, + "learning_rate": 3e-06, + "loss": 0.0124, + "reward": 0.793749988079071, + "rewards/countdown_reward_func": 0.793749988079071, + "step": 809 + }, + { + "epoch": 0.013492579081505172, + "grad_norm": 0.04705730453133583, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 810 + }, + { + "epoch": 0.013509236586544068, + "grad_norm": 0.04610871151089668, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 811 + }, + { + "epoch": 0.013525894091582964, + "grad_norm": 0.049312613904476166, + "learning_rate": 3e-06, + "loss": 0.0102, + "step": 812 + }, + { + "completion_length": 78.80208587646484, + "epoch": 0.013542551596621858, + "grad_norm": 0.10267847031354904, + "learning_rate": 3e-06, + "loss": -0.0122, + "reward": 0.5312500149011612, + "rewards/countdown_reward_func": 0.5312500149011612, + "step": 813 + }, + { + "epoch": 0.013559209101660754, + "grad_norm": 0.08535538613796234, + "learning_rate": 3e-06, + "loss": -0.0078, + "step": 814 + }, + { + "epoch": 0.013575866606699648, + "grad_norm": 0.07861588150262833, + "learning_rate": 3e-06, + "loss": -0.0128, + "step": 815 + }, + { + "epoch": 0.013592524111738544, + "grad_norm": 0.0840579941868782, + "learning_rate": 3e-06, + "loss": -0.0084, + "step": 816 + }, + { + "completion_length": 79.70833587646484, + "epoch": 0.01360918161677744, + "grad_norm": 0.038241952657699585, + "learning_rate": 3e-06, + "loss": 0.0138, + "reward": 0.5125000476837158, + "rewards/countdown_reward_func": 0.512499988079071, + "step": 817 + }, + { + "epoch": 0.013625839121816334, + "grad_norm": 0.037799984216690063, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 818 + }, + { + "epoch": 0.01364249662685523, + "grad_norm": 0.03673562407493591, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 819 + }, + { + "epoch": 0.013659154131894126, + "grad_norm": 0.03907566890120506, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 820 + }, + { + "completion_length": 78.65625381469727, + "epoch": 0.01367581163693302, + "grad_norm": 0.055468518286943436, + "learning_rate": 3e-06, + "loss": 0.0274, + "reward": 0.5208334028720856, + "rewards/countdown_reward_func": 0.520833358168602, + "step": 821 + }, + { + "epoch": 0.013692469141971916, + "grad_norm": 0.06687070429325104, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 822 + }, + { + "epoch": 0.013709126647010812, + "grad_norm": 0.05737793445587158, + "learning_rate": 3e-06, + "loss": 0.0274, + "step": 823 + }, + { + "epoch": 0.013725784152049706, + "grad_norm": 0.06675849854946136, + "learning_rate": 3e-06, + "loss": -0.0073, + "step": 824 + }, + { + "completion_length": 78.82292175292969, + "epoch": 0.013742441657088602, + "grad_norm": 0.08598799258470535, + "learning_rate": 3e-06, + "loss": -0.0187, + "reward": 0.3437500149011612, + "rewards/countdown_reward_func": 0.34375, + "step": 825 + }, + { + "epoch": 0.013759099162127496, + "grad_norm": 0.07694842666387558, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 826 + }, + { + "epoch": 0.013775756667166392, + "grad_norm": 0.0851878747344017, + "learning_rate": 3e-06, + "loss": -0.0194, + "step": 827 + }, + { + "epoch": 0.013792414172205288, + "grad_norm": 0.07542301714420319, + "learning_rate": 3e-06, + "loss": -0.0029, + "step": 828 + }, + { + "completion_length": 79.07291793823242, + "epoch": 0.013809071677244182, + "grad_norm": 0.05098611116409302, + "learning_rate": 3e-06, + "loss": -0.0019, + "reward": 0.737500011920929, + "rewards/countdown_reward_func": 0.7374999821186066, + "step": 829 + }, + { + "epoch": 0.013825729182283078, + "grad_norm": 0.0766080841422081, + "learning_rate": 3e-06, + "loss": -0.0089, + "step": 830 + }, + { + "epoch": 0.013842386687321974, + "grad_norm": 0.05998547747731209, + "learning_rate": 3e-06, + "loss": -0.0018, + "step": 831 + }, + { + "epoch": 0.013859044192360868, + "grad_norm": 0.07929453998804092, + "learning_rate": 3e-06, + "loss": -0.0094, + "step": 832 + }, + { + "completion_length": 78.84375381469727, + "epoch": 0.013875701697399764, + "grad_norm": 0.04318351298570633, + "learning_rate": 3e-06, + "loss": 0.0068, + "reward": 0.6625000536441803, + "rewards/countdown_reward_func": 0.6625000238418579, + "step": 833 + }, + { + "epoch": 0.01389235920243866, + "grad_norm": 0.03792160004377365, + "learning_rate": 3e-06, + "loss": -0.0069, + "step": 834 + }, + { + "epoch": 0.013909016707477554, + "grad_norm": 0.04440497234463692, + "learning_rate": 3e-06, + "loss": 0.0067, + "step": 835 + }, + { + "epoch": 0.01392567421251645, + "grad_norm": 0.03722609207034111, + "learning_rate": 3e-06, + "loss": -0.007, + "step": 836 + }, + { + "completion_length": 79.72917175292969, + "epoch": 0.013942331717555344, + "grad_norm": 0.07144748419523239, + "learning_rate": 3e-06, + "loss": 0.0088, + "reward": 0.3875000327825546, + "rewards/countdown_reward_func": 0.38750001788139343, + "step": 837 + }, + { + "epoch": 0.01395898922259424, + "grad_norm": 0.07204285264015198, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 838 + }, + { + "epoch": 0.013975646727633136, + "grad_norm": 0.09143166244029999, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 839 + }, + { + "epoch": 0.01399230423267203, + "grad_norm": 0.08665017038583755, + "learning_rate": 3e-06, + "loss": 0.0128, + "step": 840 + }, + { + "completion_length": 79.26042175292969, + "epoch": 0.014008961737710926, + "grad_norm": 0.07560448348522186, + "learning_rate": 3e-06, + "loss": 0.0164, + "reward": 0.5593750178813934, + "rewards/countdown_reward_func": 0.5593750178813934, + "step": 841 + }, + { + "epoch": 0.014025619242749822, + "grad_norm": 0.06624451279640198, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 842 + }, + { + "epoch": 0.014042276747788716, + "grad_norm": 0.07740146666765213, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 843 + }, + { + "epoch": 0.014058934252827612, + "grad_norm": 0.0636131763458252, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 844 + }, + { + "completion_length": 79.03125381469727, + "epoch": 0.014075591757866508, + "grad_norm": 0.06641527265310287, + "learning_rate": 3e-06, + "loss": 0.0323, + "reward": 0.6250000298023224, + "rewards/countdown_reward_func": 0.6250000298023224, + "step": 845 + }, + { + "epoch": 0.014092249262905402, + "grad_norm": 0.041574615985155106, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 846 + }, + { + "epoch": 0.014108906767944298, + "grad_norm": 0.06959685683250427, + "learning_rate": 3e-06, + "loss": 0.0321, + "step": 847 + }, + { + "epoch": 0.014125564272983192, + "grad_norm": 0.04341280832886696, + "learning_rate": 3e-06, + "loss": 0.0076, + "step": 848 + }, + { + "completion_length": 78.59375, + "epoch": 0.014142221778022088, + "grad_norm": 0.08582612872123718, + "learning_rate": 3e-06, + "loss": 0.0146, + "reward": 0.49270837008953094, + "rewards/countdown_reward_func": 0.49270834028720856, + "step": 849 + }, + { + "epoch": 0.014158879283060984, + "grad_norm": 0.069090336561203, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 850 + }, + { + "epoch": 0.014175536788099878, + "grad_norm": 0.0744604617357254, + "learning_rate": 3e-06, + "loss": 0.0141, + "step": 851 + }, + { + "epoch": 0.014192194293138774, + "grad_norm": 0.06849255412817001, + "learning_rate": 3e-06, + "loss": 0.0111, + "step": 852 + }, + { + "completion_length": 78.89583587646484, + "epoch": 0.01420885179817767, + "grad_norm": 0.05486054718494415, + "learning_rate": 3e-06, + "loss": 0.0026, + "reward": 0.6531250476837158, + "rewards/countdown_reward_func": 0.653124988079071, + "step": 853 + }, + { + "epoch": 0.014225509303216564, + "grad_norm": 0.05528473109006882, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 854 + }, + { + "epoch": 0.01424216680825546, + "grad_norm": 0.05691102519631386, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 855 + }, + { + "epoch": 0.014258824313294356, + "grad_norm": 0.05414103716611862, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 856 + }, + { + "completion_length": 78.21875381469727, + "epoch": 0.01427548181833325, + "grad_norm": 0.03430921956896782, + "learning_rate": 3e-06, + "loss": 0.0054, + "reward": 0.6520833671092987, + "rewards/countdown_reward_func": 0.6520833373069763, + "step": 857 + }, + { + "epoch": 0.014292139323372146, + "grad_norm": 0.05903447046875954, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 858 + }, + { + "epoch": 0.01430879682841104, + "grad_norm": 0.04273103550076485, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 859 + }, + { + "epoch": 0.014325454333449936, + "grad_norm": 0.06651394069194794, + "learning_rate": 3e-06, + "loss": -0.0011, + "step": 860 + }, + { + "completion_length": 77.91666793823242, + "epoch": 0.014342111838488832, + "grad_norm": 0.05171092227101326, + "learning_rate": 3e-06, + "loss": 0.017, + "reward": 0.4187500327825546, + "rewards/countdown_reward_func": 0.41875000298023224, + "step": 861 + }, + { + "epoch": 0.014358769343527726, + "grad_norm": 0.047871384769678116, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 862 + }, + { + "epoch": 0.014375426848566622, + "grad_norm": 0.05144357308745384, + "learning_rate": 3e-06, + "loss": 0.0167, + "step": 863 + }, + { + "epoch": 0.014392084353605518, + "grad_norm": 0.04513939097523689, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 864 + }, + { + "completion_length": 78.76041793823242, + "epoch": 0.014408741858644412, + "grad_norm": 0.056057143956422806, + "learning_rate": 3e-06, + "loss": 0.0011, + "reward": 0.4281250089406967, + "rewards/countdown_reward_func": 0.4281250089406967, + "step": 865 + }, + { + "epoch": 0.014425399363683308, + "grad_norm": 0.06279901415109634, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 866 + }, + { + "epoch": 0.014442056868722204, + "grad_norm": 0.05785118415951729, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 867 + }, + { + "epoch": 0.014458714373761098, + "grad_norm": 0.06489012390375137, + "learning_rate": 3e-06, + "loss": -0.0054, + "step": 868 + }, + { + "completion_length": 77.52083587646484, + "epoch": 0.014475371878799994, + "grad_norm": 0.10484462976455688, + "learning_rate": 3e-06, + "loss": 0.0039, + "reward": 0.6343750357627869, + "rewards/countdown_reward_func": 0.6343750059604645, + "step": 869 + }, + { + "epoch": 0.014492029383838888, + "grad_norm": 0.05355258658528328, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 870 + }, + { + "epoch": 0.014508686888877784, + "grad_norm": 0.06863229721784592, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 871 + }, + { + "epoch": 0.01452534439391668, + "grad_norm": 0.05193575844168663, + "learning_rate": 3e-06, + "loss": 0.0091, + "step": 872 + }, + { + "completion_length": 77.81250381469727, + "epoch": 0.014542001898955574, + "grad_norm": 0.0821094661951065, + "learning_rate": 3e-06, + "loss": 0.0076, + "reward": 0.4656250327825546, + "rewards/countdown_reward_func": 0.4656250327825546, + "step": 873 + }, + { + "epoch": 0.01455865940399447, + "grad_norm": 0.09611205756664276, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 874 + }, + { + "epoch": 0.014575316909033366, + "grad_norm": 0.06733272224664688, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 875 + }, + { + "epoch": 0.01459197441407226, + "grad_norm": 0.09633225947618484, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 876 + }, + { + "completion_length": 77.59375381469727, + "epoch": 0.014608631919111156, + "grad_norm": 0.05266202986240387, + "learning_rate": 3e-06, + "loss": 0.0124, + "reward": 0.44583335518836975, + "rewards/countdown_reward_func": 0.44583334028720856, + "step": 877 + }, + { + "epoch": 0.014625289424150052, + "grad_norm": 0.053205348551273346, + "learning_rate": 3e-06, + "loss": -0.0085, + "step": 878 + }, + { + "epoch": 0.014641946929188946, + "grad_norm": 0.055026356130838394, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 879 + }, + { + "epoch": 0.014658604434227842, + "grad_norm": 0.05671245977282524, + "learning_rate": 3e-06, + "loss": -0.0088, + "step": 880 + }, + { + "completion_length": 78.87500381469727, + "epoch": 0.014675261939266736, + "grad_norm": 0.07053803652524948, + "learning_rate": 3e-06, + "loss": 0.0208, + "reward": 0.5583333373069763, + "rewards/countdown_reward_func": 0.5583333373069763, + "step": 881 + }, + { + "epoch": 0.014691919444305632, + "grad_norm": 0.11113756895065308, + "learning_rate": 3e-06, + "loss": -0.0062, + "step": 882 + }, + { + "epoch": 0.014708576949344528, + "grad_norm": 0.0723627433180809, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 883 + }, + { + "epoch": 0.014725234454383422, + "grad_norm": 0.07315173000097275, + "learning_rate": 3e-06, + "loss": -0.0068, + "step": 884 + }, + { + "completion_length": 78.59375, + "epoch": 0.014741891959422318, + "grad_norm": 0.07103732228279114, + "learning_rate": 3e-06, + "loss": -0.0004, + "reward": 0.6156250536441803, + "rewards/countdown_reward_func": 0.6156250238418579, + "step": 885 + }, + { + "epoch": 0.014758549464461214, + "grad_norm": 0.11812125891447067, + "learning_rate": 3e-06, + "loss": 0.0232, + "step": 886 + }, + { + "epoch": 0.014775206969500108, + "grad_norm": 0.07260793447494507, + "learning_rate": 3e-06, + "loss": -0.0008, + "step": 887 + }, + { + "epoch": 0.014791864474539004, + "grad_norm": 0.11094340682029724, + "learning_rate": 3e-06, + "loss": 0.0225, + "step": 888 + }, + { + "completion_length": 78.31250381469727, + "epoch": 0.0148085219795779, + "grad_norm": 0.07344941794872284, + "learning_rate": 3e-06, + "loss": 0.0324, + "reward": 0.6812500059604645, + "rewards/countdown_reward_func": 0.6812500059604645, + "step": 889 + }, + { + "epoch": 0.014825179484616794, + "grad_norm": 0.0585373155772686, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 890 + }, + { + "epoch": 0.01484183698965569, + "grad_norm": 0.06403908133506775, + "learning_rate": 3e-06, + "loss": 0.032, + "step": 891 + }, + { + "epoch": 0.014858494494694586, + "grad_norm": 0.059206005185842514, + "learning_rate": 3e-06, + "loss": 0.0166, + "step": 892 + }, + { + "completion_length": 78.48958587646484, + "epoch": 0.01487515199973348, + "grad_norm": 0.040342044085264206, + "learning_rate": 3e-06, + "loss": 0.0048, + "reward": 0.390625, + "rewards/countdown_reward_func": 0.390625, + "step": 893 + }, + { + "epoch": 0.014891809504772376, + "grad_norm": 0.018915094435214996, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 894 + }, + { + "epoch": 0.01490846700981127, + "grad_norm": 0.024014100432395935, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 895 + }, + { + "epoch": 0.014925124514850166, + "grad_norm": 0.01908082142472267, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 896 + }, + { + "completion_length": 78.40625381469727, + "epoch": 0.014941782019889062, + "grad_norm": 0.07758533954620361, + "learning_rate": 3e-06, + "loss": 0.0292, + "reward": 0.5406250208616257, + "rewards/countdown_reward_func": 0.5406249910593033, + "step": 897 + }, + { + "epoch": 0.014958439524927956, + "grad_norm": 0.0758599191904068, + "learning_rate": 3e-06, + "loss": 0.0175, + "step": 898 + }, + { + "epoch": 0.014975097029966852, + "grad_norm": 0.06867958605289459, + "learning_rate": 3e-06, + "loss": 0.029, + "step": 899 + }, + { + "epoch": 0.014991754535005748, + "grad_norm": 0.07711223512887955, + "learning_rate": 3e-06, + "loss": 0.0169, + "step": 900 + }, + { + "completion_length": 79.13541793823242, + "epoch": 0.015008412040044642, + "grad_norm": 0.04773081839084625, + "learning_rate": 3e-06, + "loss": 0.0026, + "reward": 0.6531250774860382, + "rewards/countdown_reward_func": 0.6531250476837158, + "step": 901 + }, + { + "epoch": 0.015025069545083538, + "grad_norm": 0.031081542372703552, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 902 + }, + { + "epoch": 0.015041727050122434, + "grad_norm": 0.04798680916428566, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 903 + }, + { + "epoch": 0.015058384555161328, + "grad_norm": 0.02800125442445278, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 904 + }, + { + "completion_length": 78.66666793823242, + "epoch": 0.015075042060200224, + "grad_norm": 0.06695875525474548, + "learning_rate": 3e-06, + "loss": 0.0188, + "reward": 0.6343749910593033, + "rewards/countdown_reward_func": 0.6343749910593033, + "step": 905 + }, + { + "epoch": 0.015091699565239118, + "grad_norm": 0.07816401869058609, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 906 + }, + { + "epoch": 0.015108357070278014, + "grad_norm": 0.06759928911924362, + "learning_rate": 3e-06, + "loss": 0.0186, + "step": 907 + }, + { + "epoch": 0.01512501457531691, + "grad_norm": 0.07328783720731735, + "learning_rate": 3e-06, + "loss": -0.011, + "step": 908 + }, + { + "completion_length": 78.89583587646484, + "epoch": 0.015141672080355804, + "grad_norm": 0.006920889485627413, + "learning_rate": 3e-06, + "loss": 0.007, + "reward": 0.5395833849906921, + "rewards/countdown_reward_func": 0.5395833551883698, + "step": 909 + }, + { + "epoch": 0.0151583295853947, + "grad_norm": 0.016865693032741547, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 910 + }, + { + "epoch": 0.015174987090433596, + "grad_norm": 0.006796528585255146, + "learning_rate": 3e-06, + "loss": 0.007, + "step": 911 + }, + { + "epoch": 0.01519164459547249, + "grad_norm": 0.016117867082357407, + "learning_rate": 3e-06, + "loss": 0.0015, + "step": 912 + }, + { + "completion_length": 78.9375, + "epoch": 0.015208302100511386, + "grad_norm": 0.14087113738059998, + "learning_rate": 3e-06, + "loss": 0.0143, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6624999642372131, + "step": 913 + }, + { + "epoch": 0.015224959605550282, + "grad_norm": 0.11682312190532684, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 914 + }, + { + "epoch": 0.015241617110589176, + "grad_norm": 0.11751624196767807, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 915 + }, + { + "epoch": 0.015258274615628072, + "grad_norm": 0.07911940664052963, + "learning_rate": 3e-06, + "loss": 0.0244, + "step": 916 + }, + { + "completion_length": 79.41667175292969, + "epoch": 0.015274932120666966, + "grad_norm": 0.06170279160141945, + "learning_rate": 3e-06, + "loss": -0.0105, + "reward": 0.550000011920929, + "rewards/countdown_reward_func": 0.550000011920929, + "step": 917 + }, + { + "epoch": 0.015291589625705862, + "grad_norm": 0.11309674382209778, + "learning_rate": 3e-06, + "loss": -0.0024, + "step": 918 + }, + { + "epoch": 0.015308247130744758, + "grad_norm": 0.0573110356926918, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 919 + }, + { + "epoch": 0.015324904635783652, + "grad_norm": 0.12822383642196655, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 920 + }, + { + "completion_length": 79.1875, + "epoch": 0.015341562140822548, + "grad_norm": 0.04075918346643448, + "learning_rate": 3e-06, + "loss": 0.0121, + "reward": 0.4843750447034836, + "rewards/countdown_reward_func": 0.4843750149011612, + "step": 921 + }, + { + "epoch": 0.015358219645861444, + "grad_norm": 0.04625847935676575, + "learning_rate": 3e-06, + "loss": 0.0175, + "step": 922 + }, + { + "epoch": 0.015374877150900338, + "grad_norm": 0.036200232803821564, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 923 + }, + { + "epoch": 0.015391534655939234, + "grad_norm": 0.046278662979602814, + "learning_rate": 3e-06, + "loss": 0.0175, + "step": 924 + }, + { + "completion_length": 78.50000381469727, + "epoch": 0.01540819216097813, + "grad_norm": 0.06729736924171448, + "learning_rate": 3e-06, + "loss": -0.0083, + "reward": 0.625, + "rewards/countdown_reward_func": 0.625, + "step": 925 + }, + { + "epoch": 0.015424849666017024, + "grad_norm": 0.08305469155311584, + "learning_rate": 3e-06, + "loss": 0.0282, + "step": 926 + }, + { + "epoch": 0.01544150717105592, + "grad_norm": 0.06945432722568512, + "learning_rate": 3e-06, + "loss": -0.0083, + "step": 927 + }, + { + "epoch": 0.015458164676094814, + "grad_norm": 0.07825946807861328, + "learning_rate": 3e-06, + "loss": 0.0281, + "step": 928 + }, + { + "completion_length": 78.80208587646484, + "epoch": 0.01547482218113371, + "grad_norm": 0.03000076860189438, + "learning_rate": 3e-06, + "loss": -0.0058, + "reward": 0.559374988079071, + "rewards/countdown_reward_func": 0.5593749582767487, + "step": 929 + }, + { + "epoch": 0.015491479686172606, + "grad_norm": 0.11393778771162033, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 930 + }, + { + "epoch": 0.0155081371912115, + "grad_norm": 0.029605679214000702, + "learning_rate": 3e-06, + "loss": -0.0057, + "step": 931 + }, + { + "epoch": 0.015524794696250396, + "grad_norm": 0.13035736978054047, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 932 + }, + { + "completion_length": 78.83333587646484, + "epoch": 0.015541452201289292, + "grad_norm": 0.0681850016117096, + "learning_rate": 3e-06, + "loss": 0.0085, + "reward": 0.41875001788139343, + "rewards/countdown_reward_func": 0.41875000298023224, + "step": 933 + }, + { + "epoch": 0.015558109706328186, + "grad_norm": 0.05465130880475044, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 934 + }, + { + "epoch": 0.015574767211367082, + "grad_norm": 0.07529845088720322, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 935 + }, + { + "epoch": 0.015591424716405978, + "grad_norm": 0.05386465787887573, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 936 + }, + { + "completion_length": 79.05208587646484, + "epoch": 0.015608082221444872, + "grad_norm": 0.05262433737516403, + "learning_rate": 3e-06, + "loss": -0.0031, + "reward": 0.6718750298023224, + "rewards/countdown_reward_func": 0.671875, + "step": 937 + }, + { + "epoch": 0.015624739726483768, + "grad_norm": 0.033995963633060455, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 938 + }, + { + "epoch": 0.015641397231522663, + "grad_norm": 0.06024275720119476, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 939 + }, + { + "epoch": 0.01565805473656156, + "grad_norm": 0.03365107253193855, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 940 + }, + { + "completion_length": 78.94791793823242, + "epoch": 0.015674712241600452, + "grad_norm": 0.06738370656967163, + "learning_rate": 3e-06, + "loss": 0.0058, + "reward": 0.5499999821186066, + "rewards/countdown_reward_func": 0.5499999672174454, + "step": 941 + }, + { + "epoch": 0.015691369746639348, + "grad_norm": 0.04971608147025108, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 942 + }, + { + "epoch": 0.015708027251678244, + "grad_norm": 0.06710503250360489, + "learning_rate": 3e-06, + "loss": 0.0055, + "step": 943 + }, + { + "epoch": 0.01572468475671714, + "grad_norm": 0.04930960759520531, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 944 + }, + { + "completion_length": 79.64583587646484, + "epoch": 0.015741342261756035, + "grad_norm": 0.0715128630399704, + "learning_rate": 3e-06, + "loss": 0.004, + "reward": 0.7562500536441803, + "rewards/countdown_reward_func": 0.7562500238418579, + "step": 945 + }, + { + "epoch": 0.015757999766794928, + "grad_norm": 0.058484550565481186, + "learning_rate": 3e-06, + "loss": 0.0005, + "step": 946 + }, + { + "epoch": 0.015774657271833824, + "grad_norm": 0.07569784671068192, + "learning_rate": 3e-06, + "loss": 0.004, + "step": 947 + }, + { + "epoch": 0.01579131477687272, + "grad_norm": 0.06754549592733383, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 948 + }, + { + "completion_length": 79.03125, + "epoch": 0.015807972281911616, + "grad_norm": 0.05504095181822777, + "learning_rate": 3e-06, + "loss": 0.024, + "reward": 0.71875, + "rewards/countdown_reward_func": 0.7187499850988388, + "step": 949 + }, + { + "epoch": 0.01582462978695051, + "grad_norm": 0.059572555124759674, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 950 + }, + { + "epoch": 0.015841287291989407, + "grad_norm": 0.05944107845425606, + "learning_rate": 3e-06, + "loss": 0.0238, + "step": 951 + }, + { + "epoch": 0.0158579447970283, + "grad_norm": 0.0591764934360981, + "learning_rate": 3e-06, + "loss": 0.016, + "step": 952 + }, + { + "completion_length": 78.30208587646484, + "epoch": 0.015874602302067196, + "grad_norm": 0.05985407158732414, + "learning_rate": 3e-06, + "loss": 0.0026, + "reward": 0.6062500178813934, + "rewards/countdown_reward_func": 0.6062500178813934, + "step": 953 + }, + { + "epoch": 0.01589125980710609, + "grad_norm": 0.03819705918431282, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 954 + }, + { + "epoch": 0.015907917312144988, + "grad_norm": 0.08360816538333893, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 955 + }, + { + "epoch": 0.015924574817183883, + "grad_norm": 0.03618149086833, + "learning_rate": 3e-06, + "loss": -0.0043, + "step": 956 + }, + { + "completion_length": 78.58333587646484, + "epoch": 0.015941232322222776, + "grad_norm": 0.023036271333694458, + "learning_rate": 3e-06, + "loss": 0.0018, + "reward": 0.484375, + "rewards/countdown_reward_func": 0.484375, + "step": 957 + }, + { + "epoch": 0.015957889827261672, + "grad_norm": 0.0250247772783041, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 958 + }, + { + "epoch": 0.015974547332300568, + "grad_norm": 0.022346217185258865, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 959 + }, + { + "epoch": 0.015991204837339464, + "grad_norm": 0.025184890255331993, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 960 + }, + { + "completion_length": 77.98958587646484, + "epoch": 0.01600786234237836, + "grad_norm": 0.05932793766260147, + "learning_rate": 3e-06, + "loss": 0.0079, + "reward": 0.7468750476837158, + "rewards/countdown_reward_func": 0.746874988079071, + "step": 961 + }, + { + "epoch": 0.016024519847417255, + "grad_norm": 0.0678398460149765, + "learning_rate": 3e-06, + "loss": -0.0063, + "step": 962 + }, + { + "epoch": 0.016041177352456148, + "grad_norm": 0.06092070788145065, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 963 + }, + { + "epoch": 0.016057834857495044, + "grad_norm": 0.0690838098526001, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 964 + }, + { + "completion_length": 78.01042175292969, + "epoch": 0.01607449236253394, + "grad_norm": 0.0544421561062336, + "learning_rate": 3e-06, + "loss": 0.006, + "reward": 0.5218749940395355, + "rewards/countdown_reward_func": 0.5218749940395355, + "step": 965 + }, + { + "epoch": 0.016091149867572836, + "grad_norm": 0.03812885284423828, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 966 + }, + { + "epoch": 0.01610780737261173, + "grad_norm": 0.05326592177152634, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 967 + }, + { + "epoch": 0.016124464877650624, + "grad_norm": 0.03886517509818077, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 968 + }, + { + "completion_length": 78.0625, + "epoch": 0.01614112238268952, + "grad_norm": 0.06144983321428299, + "learning_rate": 3e-06, + "loss": -0.0012, + "reward": 0.5781250596046448, + "rewards/countdown_reward_func": 0.578125, + "step": 969 + }, + { + "epoch": 0.016157779887728416, + "grad_norm": 0.05182495340704918, + "learning_rate": 3e-06, + "loss": 0.0085, + "step": 970 + }, + { + "epoch": 0.01617443739276731, + "grad_norm": 0.05916476622223854, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 971 + }, + { + "epoch": 0.016191094897806207, + "grad_norm": 0.05411950498819351, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 972 + }, + { + "completion_length": 77.56250381469727, + "epoch": 0.016207752402845103, + "grad_norm": 0.08917998522520065, + "learning_rate": 3e-06, + "loss": 0.0015, + "reward": 0.6427083611488342, + "rewards/countdown_reward_func": 0.6427083313465118, + "step": 973 + }, + { + "epoch": 0.016224409907883996, + "grad_norm": 0.05249038338661194, + "learning_rate": 3e-06, + "loss": 0.0016, + "step": 974 + }, + { + "epoch": 0.01624106741292289, + "grad_norm": 0.09241148829460144, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 975 + }, + { + "epoch": 0.016257724917961788, + "grad_norm": 0.0534595362842083, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 976 + }, + { + "completion_length": 78.52083587646484, + "epoch": 0.016274382423000684, + "grad_norm": 0.043872151523828506, + "learning_rate": 3e-06, + "loss": -0.0194, + "reward": 0.7552083730697632, + "rewards/countdown_reward_func": 0.7552083134651184, + "step": 977 + }, + { + "epoch": 0.01629103992803958, + "grad_norm": 0.048060253262519836, + "learning_rate": 3e-06, + "loss": 0.0176, + "step": 978 + }, + { + "epoch": 0.016307697433078472, + "grad_norm": 0.04736912250518799, + "learning_rate": 3e-06, + "loss": -0.0193, + "step": 979 + }, + { + "epoch": 0.016324354938117368, + "grad_norm": 0.052148379385471344, + "learning_rate": 3e-06, + "loss": 0.0173, + "step": 980 + }, + { + "completion_length": 78.66666793823242, + "epoch": 0.016341012443156264, + "grad_norm": 0.03696002811193466, + "learning_rate": 3e-06, + "loss": -0.006, + "reward": 0.5593750178813934, + "rewards/countdown_reward_func": 0.5593750178813934, + "step": 981 + }, + { + "epoch": 0.01635766994819516, + "grad_norm": 0.03732531890273094, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 982 + }, + { + "epoch": 0.016374327453234055, + "grad_norm": 0.038251619786024094, + "learning_rate": 3e-06, + "loss": -0.0061, + "step": 983 + }, + { + "epoch": 0.01639098495827295, + "grad_norm": 0.04090643301606178, + "learning_rate": 3e-06, + "loss": 0.0072, + "step": 984 + }, + { + "completion_length": 79.00000381469727, + "epoch": 0.016407642463311844, + "grad_norm": 0.07788969576358795, + "learning_rate": 3e-06, + "loss": 0.0101, + "reward": 0.4375000447034836, + "rewards/countdown_reward_func": 0.4375000149011612, + "step": 985 + }, + { + "epoch": 0.01642429996835074, + "grad_norm": 0.11451190710067749, + "learning_rate": 3e-06, + "loss": 0.0063, + "step": 986 + }, + { + "epoch": 0.016440957473389636, + "grad_norm": 0.08021572232246399, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 987 + }, + { + "epoch": 0.01645761497842853, + "grad_norm": 0.10452651232481003, + "learning_rate": 3e-06, + "loss": 0.0056, + "step": 988 + }, + { + "completion_length": 78.01041793823242, + "epoch": 0.016474272483467427, + "grad_norm": 0.06619682908058167, + "learning_rate": 3e-06, + "loss": -0.0036, + "reward": 0.7562500536441803, + "rewards/countdown_reward_func": 0.7562500238418579, + "step": 989 + }, + { + "epoch": 0.01649092998850632, + "grad_norm": 0.056727632880210876, + "learning_rate": 3e-06, + "loss": 0.022, + "step": 990 + }, + { + "epoch": 0.016507587493545216, + "grad_norm": 0.060290560126304626, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 991 + }, + { + "epoch": 0.01652424499858411, + "grad_norm": 0.06136344000697136, + "learning_rate": 3e-06, + "loss": 0.0219, + "step": 992 + }, + { + "completion_length": 78.78125381469727, + "epoch": 0.016540902503623008, + "grad_norm": 0.08433445543050766, + "learning_rate": 3e-06, + "loss": -0.0153, + "reward": 0.5125000178813934, + "rewards/countdown_reward_func": 0.5125000029802322, + "step": 993 + }, + { + "epoch": 0.016557560008661903, + "grad_norm": 0.06190748140215874, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 994 + }, + { + "epoch": 0.0165742175137008, + "grad_norm": 0.0875028595328331, + "learning_rate": 3e-06, + "loss": -0.0154, + "step": 995 + }, + { + "epoch": 0.016590875018739692, + "grad_norm": 0.0656026229262352, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 996 + }, + { + "completion_length": 78.00000381469727, + "epoch": 0.016607532523778588, + "grad_norm": 0.06006379798054695, + "learning_rate": 3e-06, + "loss": -0.0029, + "reward": 0.40000002086162567, + "rewards/countdown_reward_func": 0.40000002086162567, + "step": 997 + }, + { + "epoch": 0.016624190028817484, + "grad_norm": 0.06516308337450027, + "learning_rate": 3e-06, + "loss": -0.0078, + "step": 998 + }, + { + "epoch": 0.01664084753385638, + "grad_norm": 0.061335306614637375, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 999 + }, + { + "epoch": 0.016657505038895275, + "grad_norm": 0.06947959214448929, + "learning_rate": 3e-06, + "loss": -0.0082, + "step": 1000 + }, + { + "completion_length": 77.98958587646484, + "epoch": 0.016674162543934168, + "grad_norm": 0.07094883918762207, + "learning_rate": 3e-06, + "loss": -0.0006, + "reward": 0.49375002086162567, + "rewards/countdown_reward_func": 0.4937499910593033, + "step": 1001 + }, + { + "epoch": 0.016690820048973064, + "grad_norm": 0.08499892801046371, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 1002 + }, + { + "epoch": 0.01670747755401196, + "grad_norm": 0.06431587785482407, + "learning_rate": 3e-06, + "loss": -0.0005, + "step": 1003 + }, + { + "epoch": 0.016724135059050856, + "grad_norm": 0.0890759751200676, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 1004 + }, + { + "completion_length": 77.56250381469727, + "epoch": 0.01674079256408975, + "grad_norm": 0.02649698778986931, + "learning_rate": 3e-06, + "loss": 0.0081, + "reward": 0.6624999940395355, + "rewards/countdown_reward_func": 0.6624999940395355, + "step": 1005 + }, + { + "epoch": 0.016757450069128647, + "grad_norm": 0.027886051684617996, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1006 + }, + { + "epoch": 0.01677410757416754, + "grad_norm": 0.027308441698551178, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1007 + }, + { + "epoch": 0.016790765079206436, + "grad_norm": 0.04039669409394264, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1008 + }, + { + "completion_length": 77.875, + "epoch": 0.01680742258424533, + "grad_norm": 0.06507500261068344, + "learning_rate": 3e-06, + "loss": 0.0147, + "reward": 0.6145833432674408, + "rewards/countdown_reward_func": 0.6145833432674408, + "step": 1009 + }, + { + "epoch": 0.016824080089284228, + "grad_norm": 0.07641483843326569, + "learning_rate": 3e-06, + "loss": 0.0133, + "step": 1010 + }, + { + "epoch": 0.016840737594323123, + "grad_norm": 0.06355273723602295, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1011 + }, + { + "epoch": 0.016857395099362016, + "grad_norm": 0.07882444560527802, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1012 + }, + { + "completion_length": 77.57291793823242, + "epoch": 0.016874052604400912, + "grad_norm": 0.07242882251739502, + "learning_rate": 3e-06, + "loss": 0.0038, + "reward": 0.5218750536441803, + "rewards/countdown_reward_func": 0.5218750089406967, + "step": 1013 + }, + { + "epoch": 0.016890710109439808, + "grad_norm": 0.08544197678565979, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1014 + }, + { + "epoch": 0.016907367614478704, + "grad_norm": 0.07647145539522171, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1015 + }, + { + "epoch": 0.0169240251195176, + "grad_norm": 0.08418236672878265, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1016 + }, + { + "completion_length": 78.04167175292969, + "epoch": 0.016940682624556495, + "grad_norm": 0.04315047711133957, + "learning_rate": 3e-06, + "loss": 0.0123, + "reward": 0.5406249761581421, + "rewards/countdown_reward_func": 0.5406249761581421, + "step": 1017 + }, + { + "epoch": 0.016957340129595388, + "grad_norm": 0.05901029333472252, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1018 + }, + { + "epoch": 0.016973997634634284, + "grad_norm": 0.045158904045820236, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 1019 + }, + { + "epoch": 0.01699065513967318, + "grad_norm": 0.056594718247652054, + "learning_rate": 3e-06, + "loss": 0.0131, + "step": 1020 + }, + { + "completion_length": 77.72917175292969, + "epoch": 0.017007312644712076, + "grad_norm": 0.09094784408807755, + "learning_rate": 3e-06, + "loss": 0.0252, + "reward": 0.6624999940395355, + "rewards/countdown_reward_func": 0.6624999940395355, + "step": 1021 + }, + { + "epoch": 0.01702397014975097, + "grad_norm": 0.09983258694410324, + "learning_rate": 3e-06, + "loss": 0.0062, + "step": 1022 + }, + { + "epoch": 0.017040627654789864, + "grad_norm": 0.08905185759067535, + "learning_rate": 3e-06, + "loss": 0.0249, + "step": 1023 + }, + { + "epoch": 0.01705728515982876, + "grad_norm": 0.09621740132570267, + "learning_rate": 3e-06, + "loss": 0.0058, + "step": 1024 + }, + { + "completion_length": 77.78125, + "epoch": 0.017073942664867656, + "grad_norm": 0.029306670650839806, + "learning_rate": 3e-06, + "loss": 0.0045, + "reward": 0.44687502086162567, + "rewards/countdown_reward_func": 0.44687502086162567, + "step": 1025 + }, + { + "epoch": 0.01709060016990655, + "grad_norm": 0.030634427443146706, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1026 + }, + { + "epoch": 0.017107257674945447, + "grad_norm": 0.0271756574511528, + "learning_rate": 3e-06, + "loss": 0.0044, + "step": 1027 + }, + { + "epoch": 0.017123915179984343, + "grad_norm": 0.031020840629935265, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1028 + }, + { + "completion_length": 77.82291793823242, + "epoch": 0.017140572685023236, + "grad_norm": 0.06014309450984001, + "learning_rate": 3e-06, + "loss": 0.0143, + "reward": 0.5875000059604645, + "rewards/countdown_reward_func": 0.5875000059604645, + "step": 1029 + }, + { + "epoch": 0.01715723019006213, + "grad_norm": 0.06266535073518753, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1030 + }, + { + "epoch": 0.017173887695101028, + "grad_norm": 0.06231268495321274, + "learning_rate": 3e-06, + "loss": 0.0139, + "step": 1031 + }, + { + "epoch": 0.017190545200139924, + "grad_norm": 0.05885448306798935, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1032 + }, + { + "completion_length": 78.00000381469727, + "epoch": 0.01720720270517882, + "grad_norm": 0.12691909074783325, + "learning_rate": 3e-06, + "loss": 0.0102, + "reward": 0.5875000506639481, + "rewards/countdown_reward_func": 0.5875000506639481, + "step": 1033 + }, + { + "epoch": 0.017223860210217715, + "grad_norm": 0.08651195466518402, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1034 + }, + { + "epoch": 0.017240517715256608, + "grad_norm": 0.0905795767903328, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 1035 + }, + { + "epoch": 0.017257175220295504, + "grad_norm": 0.06871011853218079, + "learning_rate": 3e-06, + "loss": 0.0132, + "step": 1036 + }, + { + "completion_length": 77.75000381469727, + "epoch": 0.0172738327253344, + "grad_norm": 0.02537558227777481, + "learning_rate": 3e-06, + "loss": 0.0032, + "reward": 0.45520833134651184, + "rewards/countdown_reward_func": 0.45520833134651184, + "step": 1037 + }, + { + "epoch": 0.017290490230373295, + "grad_norm": 0.0335354246199131, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1038 + }, + { + "epoch": 0.01730714773541219, + "grad_norm": 0.02519826591014862, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1039 + }, + { + "epoch": 0.017323805240451084, + "grad_norm": 0.04759570583701134, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1040 + }, + { + "completion_length": 77.60416793823242, + "epoch": 0.01734046274548998, + "grad_norm": 0.0281699076294899, + "learning_rate": 3e-06, + "loss": -0.0027, + "reward": 0.5218750238418579, + "rewards/countdown_reward_func": 0.5218750238418579, + "step": 1041 + }, + { + "epoch": 0.017357120250528876, + "grad_norm": 0.03999556228518486, + "learning_rate": 3e-06, + "loss": 0.0088, + "step": 1042 + }, + { + "epoch": 0.01737377775556777, + "grad_norm": 0.029859347268939018, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1043 + }, + { + "epoch": 0.017390435260606667, + "grad_norm": 0.03812756389379501, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 1044 + }, + { + "completion_length": 76.39583587646484, + "epoch": 0.017407092765645563, + "grad_norm": 0.12725725769996643, + "learning_rate": 3e-06, + "loss": 0.0116, + "reward": 0.6250000149011612, + "rewards/countdown_reward_func": 0.6250000149011612, + "step": 1045 + }, + { + "epoch": 0.017423750270684456, + "grad_norm": 0.10748835653066635, + "learning_rate": 3e-06, + "loss": -0.0065, + "step": 1046 + }, + { + "epoch": 0.01744040777572335, + "grad_norm": 0.11180923134088516, + "learning_rate": 3e-06, + "loss": 0.0112, + "step": 1047 + }, + { + "epoch": 0.017457065280762248, + "grad_norm": 0.12227732688188553, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 1048 + }, + { + "completion_length": 77.05208587646484, + "epoch": 0.017473722785801143, + "grad_norm": 0.048975490033626556, + "learning_rate": 3e-06, + "loss": 0.0059, + "reward": 0.7000000476837158, + "rewards/countdown_reward_func": 0.699999988079071, + "step": 1049 + }, + { + "epoch": 0.01749038029084004, + "grad_norm": 0.09614705294370651, + "learning_rate": 3e-06, + "loss": -0.0117, + "step": 1050 + }, + { + "epoch": 0.017507037795878932, + "grad_norm": 0.050083886831998825, + "learning_rate": 3e-06, + "loss": 0.0059, + "step": 1051 + }, + { + "epoch": 0.017523695300917828, + "grad_norm": 0.09991956502199173, + "learning_rate": 3e-06, + "loss": -0.0122, + "step": 1052 + }, + { + "completion_length": 77.06250381469727, + "epoch": 0.017540352805956724, + "grad_norm": 0.11448618769645691, + "learning_rate": 3e-06, + "loss": 0.0038, + "reward": 0.44687502086162567, + "rewards/countdown_reward_func": 0.4468749910593033, + "step": 1053 + }, + { + "epoch": 0.01755701031099562, + "grad_norm": 0.09735244512557983, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1054 + }, + { + "epoch": 0.017573667816034515, + "grad_norm": 0.10511325299739838, + "learning_rate": 3e-06, + "loss": 0.0034, + "step": 1055 + }, + { + "epoch": 0.01759032532107341, + "grad_norm": 0.09386730939149857, + "learning_rate": 3e-06, + "loss": 0.0122, + "step": 1056 + }, + { + "completion_length": 77.31250381469727, + "epoch": 0.017606982826112304, + "grad_norm": 0.056626059114933014, + "learning_rate": 3e-06, + "loss": 0.0184, + "reward": 0.697916716337204, + "rewards/countdown_reward_func": 0.6979166865348816, + "step": 1057 + }, + { + "epoch": 0.0176236403311512, + "grad_norm": 0.07217638194561005, + "learning_rate": 3e-06, + "loss": 0.021, + "step": 1058 + }, + { + "epoch": 0.017640297836190096, + "grad_norm": 0.05833190679550171, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 1059 + }, + { + "epoch": 0.01765695534122899, + "grad_norm": 0.07048018276691437, + "learning_rate": 3e-06, + "loss": 0.0207, + "step": 1060 + }, + { + "completion_length": 76.51042175292969, + "epoch": 0.017673612846267887, + "grad_norm": 0.06974723190069199, + "learning_rate": 3e-06, + "loss": -0.002, + "reward": 0.6239583492279053, + "rewards/countdown_reward_func": 0.6239583492279053, + "step": 1061 + }, + { + "epoch": 0.01769027035130678, + "grad_norm": 0.06642426550388336, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1062 + }, + { + "epoch": 0.017706927856345676, + "grad_norm": 0.0737224817276001, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1063 + }, + { + "epoch": 0.01772358536138457, + "grad_norm": 0.07191863656044006, + "learning_rate": 3e-06, + "loss": -0.0016, + "step": 1064 + }, + { + "completion_length": 76.34375381469727, + "epoch": 0.017740242866423468, + "grad_norm": 0.027337780222296715, + "learning_rate": 3e-06, + "loss": 0.0002, + "reward": 0.5395833551883698, + "rewards/countdown_reward_func": 0.5395833402872086, + "step": 1065 + }, + { + "epoch": 0.017756900371462363, + "grad_norm": 0.053200941532850266, + "learning_rate": 3e-06, + "loss": -0.0097, + "step": 1066 + }, + { + "epoch": 0.01777355787650126, + "grad_norm": 0.027074690908193588, + "learning_rate": 3e-06, + "loss": 0.0002, + "step": 1067 + }, + { + "epoch": 0.017790215381540152, + "grad_norm": 0.0507885217666626, + "learning_rate": 3e-06, + "loss": -0.0099, + "step": 1068 + }, + { + "completion_length": 76.83333587646484, + "epoch": 0.017806872886579048, + "grad_norm": 0.10492333024740219, + "learning_rate": 3e-06, + "loss": -0.0129, + "reward": 0.4750000238418579, + "rewards/countdown_reward_func": 0.4749999940395355, + "step": 1069 + }, + { + "epoch": 0.017823530391617944, + "grad_norm": 0.0775398463010788, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 1070 + }, + { + "epoch": 0.01784018789665684, + "grad_norm": 0.1052914410829544, + "learning_rate": 3e-06, + "loss": -0.0133, + "step": 1071 + }, + { + "epoch": 0.017856845401695735, + "grad_norm": 0.08178280293941498, + "learning_rate": 3e-06, + "loss": 0.021, + "step": 1072 + }, + { + "completion_length": 77.15625381469727, + "epoch": 0.017873502906734628, + "grad_norm": 0.0993320420384407, + "learning_rate": 3e-06, + "loss": 0.0261, + "reward": 0.5781250298023224, + "rewards/countdown_reward_func": 0.5781250298023224, + "step": 1073 + }, + { + "epoch": 0.017890160411773524, + "grad_norm": 0.09129512310028076, + "learning_rate": 3e-06, + "loss": 0.0098, + "step": 1074 + }, + { + "epoch": 0.01790681791681242, + "grad_norm": 0.10476741194725037, + "learning_rate": 3e-06, + "loss": 0.0259, + "step": 1075 + }, + { + "epoch": 0.017923475421851316, + "grad_norm": 0.08802957832813263, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 1076 + }, + { + "completion_length": 76.63542175292969, + "epoch": 0.01794013292689021, + "grad_norm": 0.08028431981801987, + "learning_rate": 3e-06, + "loss": 0.0156, + "reward": 0.7093750536441803, + "rewards/countdown_reward_func": 0.7093750238418579, + "step": 1077 + }, + { + "epoch": 0.017956790431929107, + "grad_norm": 0.07435247302055359, + "learning_rate": 3e-06, + "loss": 0.0087, + "step": 1078 + }, + { + "epoch": 0.017973447936968, + "grad_norm": 0.07464917749166489, + "learning_rate": 3e-06, + "loss": 0.0157, + "step": 1079 + }, + { + "epoch": 0.017990105442006896, + "grad_norm": 0.07526235282421112, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1080 + }, + { + "completion_length": 76.26041793823242, + "epoch": 0.01800676294704579, + "grad_norm": 0.05551959201693535, + "learning_rate": 3e-06, + "loss": 0.002, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6624999642372131, + "step": 1081 + }, + { + "epoch": 0.018023420452084687, + "grad_norm": 0.04644828289747238, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1082 + }, + { + "epoch": 0.018040077957123583, + "grad_norm": 0.05169878154993057, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1083 + }, + { + "epoch": 0.018056735462162476, + "grad_norm": 0.044488560408353806, + "learning_rate": 3e-06, + "loss": 0.0135, + "step": 1084 + }, + { + "completion_length": 76.72916793823242, + "epoch": 0.01807339296720137, + "grad_norm": 4.488245330946938e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6625000238418579, + "step": 1085 + }, + { + "epoch": 0.018090050472240268, + "grad_norm": 2.843547397546331e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1086 + }, + { + "epoch": 0.018106707977279164, + "grad_norm": 4.3927523840636695e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1087 + }, + { + "epoch": 0.01812336548231806, + "grad_norm": 2.705804247327137e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1088 + }, + { + "completion_length": 76.69791793823242, + "epoch": 0.018140022987356955, + "grad_norm": 0.10377940535545349, + "learning_rate": 3e-06, + "loss": -0.0246, + "reward": 0.5687500089406967, + "rewards/countdown_reward_func": 0.5687499791383743, + "step": 1089 + }, + { + "epoch": 0.018156680492395848, + "grad_norm": 0.08611810207366943, + "learning_rate": 3e-06, + "loss": 0.0266, + "step": 1090 + }, + { + "epoch": 0.018173337997434744, + "grad_norm": 0.0820135772228241, + "learning_rate": 3e-06, + "loss": -0.0247, + "step": 1091 + }, + { + "epoch": 0.01818999550247364, + "grad_norm": 0.07469993084669113, + "learning_rate": 3e-06, + "loss": 0.0262, + "step": 1092 + }, + { + "completion_length": 76.88541793823242, + "epoch": 0.018206653007512535, + "grad_norm": 0.14840012788772583, + "learning_rate": 3e-06, + "loss": 0.0038, + "reward": 0.5874999910593033, + "rewards/countdown_reward_func": 0.5874999910593033, + "step": 1093 + }, + { + "epoch": 0.01822331051255143, + "grad_norm": 0.13590377569198608, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 1094 + }, + { + "epoch": 0.018239968017590324, + "grad_norm": 0.13451842963695526, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1095 + }, + { + "epoch": 0.01825662552262922, + "grad_norm": 0.15413163602352142, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1096 + }, + { + "completion_length": 77.69792175292969, + "epoch": 0.018273283027668116, + "grad_norm": 0.0337769091129303, + "learning_rate": 3e-06, + "loss": 0.0021, + "reward": 0.6156250238418579, + "rewards/countdown_reward_func": 0.6156250089406967, + "step": 1097 + }, + { + "epoch": 0.01828994053270701, + "grad_norm": 0.06738349795341492, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 1098 + }, + { + "epoch": 0.018306598037745907, + "grad_norm": 0.037964098155498505, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 1099 + }, + { + "epoch": 0.018323255542784803, + "grad_norm": 0.07724802941083908, + "learning_rate": 3e-06, + "loss": 0.0092, + "step": 1100 + }, + { + "completion_length": 76.86458587646484, + "epoch": 0.018339913047823696, + "grad_norm": 0.09037227928638458, + "learning_rate": 3e-06, + "loss": 0.0263, + "reward": 0.6322916746139526, + "rewards/countdown_reward_func": 0.6322916746139526, + "step": 1101 + }, + { + "epoch": 0.01835657055286259, + "grad_norm": 0.09328408539295197, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 1102 + }, + { + "epoch": 0.018373228057901488, + "grad_norm": 0.0855465903878212, + "learning_rate": 3e-06, + "loss": 0.0256, + "step": 1103 + }, + { + "epoch": 0.018389885562940383, + "grad_norm": 0.10210173577070236, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1104 + }, + { + "completion_length": 77.86458587646484, + "epoch": 0.01840654306797928, + "grad_norm": 0.09195931255817413, + "learning_rate": 3e-06, + "loss": 0.0042, + "reward": 0.503125011920929, + "rewards/countdown_reward_func": 0.5031249970197678, + "step": 1105 + }, + { + "epoch": 0.018423200573018172, + "grad_norm": 0.09613701701164246, + "learning_rate": 3e-06, + "loss": 0.0116, + "step": 1106 + }, + { + "epoch": 0.018439858078057068, + "grad_norm": 0.09089526534080505, + "learning_rate": 3e-06, + "loss": 0.0037, + "step": 1107 + }, + { + "epoch": 0.018456515583095964, + "grad_norm": 0.0981704518198967, + "learning_rate": 3e-06, + "loss": 0.0106, + "step": 1108 + }, + { + "completion_length": 77.70833587646484, + "epoch": 0.01847317308813486, + "grad_norm": 0.09190593659877777, + "learning_rate": 3e-06, + "loss": -0.0098, + "reward": 0.4843750447034836, + "rewards/countdown_reward_func": 0.4843750149011612, + "step": 1109 + }, + { + "epoch": 0.018489830593173755, + "grad_norm": 0.07517038285732269, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1110 + }, + { + "epoch": 0.01850648809821265, + "grad_norm": 0.08398468792438507, + "learning_rate": 3e-06, + "loss": -0.0103, + "step": 1111 + }, + { + "epoch": 0.018523145603251544, + "grad_norm": 0.07767587900161743, + "learning_rate": 3e-06, + "loss": -0.0013, + "step": 1112 + }, + { + "completion_length": 77.78125, + "epoch": 0.01853980310829044, + "grad_norm": 0.08936383575201035, + "learning_rate": 3e-06, + "loss": 0.0155, + "reward": 0.5781250298023224, + "rewards/countdown_reward_func": 0.578125, + "step": 1113 + }, + { + "epoch": 0.018556460613329336, + "grad_norm": 0.08371175080537796, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1114 + }, + { + "epoch": 0.01857311811836823, + "grad_norm": 0.0906725749373436, + "learning_rate": 3e-06, + "loss": 0.0154, + "step": 1115 + }, + { + "epoch": 0.018589775623407127, + "grad_norm": 0.07422931492328644, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1116 + }, + { + "completion_length": 78.13542175292969, + "epoch": 0.01860643312844602, + "grad_norm": 0.03390752524137497, + "learning_rate": 3e-06, + "loss": 0.0026, + "reward": 0.6812500357627869, + "rewards/countdown_reward_func": 0.6812500059604645, + "step": 1117 + }, + { + "epoch": 0.018623090633484916, + "grad_norm": 0.053761448711156845, + "learning_rate": 3e-06, + "loss": -0.0069, + "step": 1118 + }, + { + "epoch": 0.01863974813852381, + "grad_norm": 0.03983241692185402, + "learning_rate": 3e-06, + "loss": 0.0025, + "step": 1119 + }, + { + "epoch": 0.018656405643562708, + "grad_norm": 0.05388634279370308, + "learning_rate": 3e-06, + "loss": -0.0071, + "step": 1120 + }, + { + "completion_length": 77.38541793823242, + "epoch": 0.018673063148601603, + "grad_norm": 0.10176032036542892, + "learning_rate": 3e-06, + "loss": 0.021, + "reward": 0.5583333224058151, + "rewards/countdown_reward_func": 0.5583333224058151, + "step": 1121 + }, + { + "epoch": 0.0186897206536405, + "grad_norm": 0.057487647980451584, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1122 + }, + { + "epoch": 0.018706378158679392, + "grad_norm": 0.10296330600976944, + "learning_rate": 3e-06, + "loss": 0.0202, + "step": 1123 + }, + { + "epoch": 0.018723035663718288, + "grad_norm": 0.05802428722381592, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 1124 + }, + { + "completion_length": 78.13542175292969, + "epoch": 0.018739693168757184, + "grad_norm": 0.05085816979408264, + "learning_rate": 3e-06, + "loss": 0.0008, + "reward": 0.7468750476837158, + "rewards/countdown_reward_func": 0.746874988079071, + "step": 1125 + }, + { + "epoch": 0.01875635067379608, + "grad_norm": 0.07629646360874176, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 1126 + }, + { + "epoch": 0.018773008178834975, + "grad_norm": 0.050090886652469635, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1127 + }, + { + "epoch": 0.018789665683873868, + "grad_norm": 0.07648593187332153, + "learning_rate": 3e-06, + "loss": -0.0016, + "step": 1128 + }, + { + "completion_length": 77.21875381469727, + "epoch": 0.018806323188912764, + "grad_norm": 0.1128632053732872, + "learning_rate": 3e-06, + "loss": 0.0146, + "reward": 0.4833333492279053, + "rewards/countdown_reward_func": 0.4833333492279053, + "step": 1129 + }, + { + "epoch": 0.01882298069395166, + "grad_norm": 0.09674850851297379, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1130 + }, + { + "epoch": 0.018839638198990556, + "grad_norm": 0.11442406475543976, + "learning_rate": 3e-06, + "loss": 0.0146, + "step": 1131 + }, + { + "epoch": 0.01885629570402945, + "grad_norm": 0.0966297835111618, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1132 + }, + { + "completion_length": 77.07291793823242, + "epoch": 0.018872953209068347, + "grad_norm": 0.021995024755597115, + "learning_rate": 3e-06, + "loss": 0.0027, + "reward": 0.5875000357627869, + "rewards/countdown_reward_func": 0.5875000208616257, + "step": 1133 + }, + { + "epoch": 0.01888961071410724, + "grad_norm": 0.04276172071695328, + "learning_rate": 3e-06, + "loss": -0.0031, + "step": 1134 + }, + { + "epoch": 0.018906268219146136, + "grad_norm": 0.023226909339427948, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1135 + }, + { + "epoch": 0.01892292572418503, + "grad_norm": 0.0440530851483345, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1136 + }, + { + "completion_length": 77.38541793823242, + "epoch": 0.018939583229223927, + "grad_norm": 0.12742026150226593, + "learning_rate": 3e-06, + "loss": 0.0219, + "reward": 0.49375003576278687, + "rewards/countdown_reward_func": 0.49375002086162567, + "step": 1137 + }, + { + "epoch": 0.018956240734262823, + "grad_norm": 0.08153384178876877, + "learning_rate": 3e-06, + "loss": 0.0097, + "step": 1138 + }, + { + "epoch": 0.018972898239301716, + "grad_norm": 0.09739159047603607, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 1139 + }, + { + "epoch": 0.01898955574434061, + "grad_norm": 0.07907038182020187, + "learning_rate": 3e-06, + "loss": 0.0093, + "step": 1140 + }, + { + "completion_length": 78.17708587646484, + "epoch": 0.019006213249379508, + "grad_norm": 0.07968182861804962, + "learning_rate": 3e-06, + "loss": 0.013, + "reward": 0.5395833849906921, + "rewards/countdown_reward_func": 0.5395833253860474, + "step": 1141 + }, + { + "epoch": 0.019022870754418403, + "grad_norm": 0.07061457633972168, + "learning_rate": 3e-06, + "loss": 0.0113, + "step": 1142 + }, + { + "epoch": 0.0190395282594573, + "grad_norm": 0.0831107348203659, + "learning_rate": 3e-06, + "loss": 0.0129, + "step": 1143 + }, + { + "epoch": 0.019056185764496195, + "grad_norm": 0.07127412408590317, + "learning_rate": 3e-06, + "loss": 0.011, + "step": 1144 + }, + { + "completion_length": 77.02083587646484, + "epoch": 0.019072843269535088, + "grad_norm": 0.07784897089004517, + "learning_rate": 3e-06, + "loss": 0.0002, + "reward": 0.6062500178813934, + "rewards/countdown_reward_func": 0.606249988079071, + "step": 1145 + }, + { + "epoch": 0.019089500774573984, + "grad_norm": 0.046890512108802795, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1146 + }, + { + "epoch": 0.01910615827961288, + "grad_norm": 0.06485949456691742, + "learning_rate": 3e-06, + "loss": -0.0002, + "step": 1147 + }, + { + "epoch": 0.019122815784651775, + "grad_norm": 0.0456463024020195, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 1148 + }, + { + "completion_length": 77.61458587646484, + "epoch": 0.01913947328969067, + "grad_norm": 0.07196388393640518, + "learning_rate": 3e-06, + "loss": 0.0043, + "reward": 0.7656250298023224, + "rewards/countdown_reward_func": 0.765625, + "step": 1149 + }, + { + "epoch": 0.019156130794729564, + "grad_norm": 0.026296168565750122, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 1150 + }, + { + "epoch": 0.01917278829976846, + "grad_norm": 0.0736239105463028, + "learning_rate": 3e-06, + "loss": 0.0041, + "step": 1151 + }, + { + "epoch": 0.019189445804807356, + "grad_norm": 0.0247329194098711, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 1152 + }, + { + "completion_length": 77.54166793823242, + "epoch": 0.01920610330984625, + "grad_norm": 0.13147065043449402, + "learning_rate": 3e-06, + "loss": -0.0033, + "reward": 0.6895833313465118, + "rewards/countdown_reward_func": 0.6895833015441895, + "step": 1153 + }, + { + "epoch": 0.019222760814885147, + "grad_norm": 0.1420595645904541, + "learning_rate": 3e-06, + "loss": 0.052, + "step": 1154 + }, + { + "epoch": 0.019239418319924043, + "grad_norm": 0.12570101022720337, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1155 + }, + { + "epoch": 0.019256075824962936, + "grad_norm": 0.1428789347410202, + "learning_rate": 3e-06, + "loss": 0.0509, + "step": 1156 + }, + { + "completion_length": 77.97916793823242, + "epoch": 0.01927273333000183, + "grad_norm": 0.07949557900428772, + "learning_rate": 3e-06, + "loss": 0.0148, + "reward": 0.4375, + "rewards/countdown_reward_func": 0.4375, + "step": 1157 + }, + { + "epoch": 0.019289390835040728, + "grad_norm": 0.0775655061006546, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1158 + }, + { + "epoch": 0.019306048340079623, + "grad_norm": 0.07711786031723022, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1159 + }, + { + "epoch": 0.01932270584511852, + "grad_norm": 0.07381214946508408, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1160 + }, + { + "completion_length": 77.65625, + "epoch": 0.019339363350157412, + "grad_norm": 0.13475027680397034, + "learning_rate": 3e-06, + "loss": 0.0149, + "reward": 0.5031250268220901, + "rewards/countdown_reward_func": 0.5031250268220901, + "step": 1161 + }, + { + "epoch": 0.019356020855196308, + "grad_norm": 0.09009546041488647, + "learning_rate": 3e-06, + "loss": -0.005, + "step": 1162 + }, + { + "epoch": 0.019372678360235204, + "grad_norm": 0.12726618349552155, + "learning_rate": 3e-06, + "loss": 0.0144, + "step": 1163 + }, + { + "epoch": 0.0193893358652741, + "grad_norm": 0.08986508846282959, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 1164 + }, + { + "completion_length": 77.41667175292969, + "epoch": 0.019405993370312995, + "grad_norm": 0.11597729474306107, + "learning_rate": 3e-06, + "loss": -0.0164, + "reward": 0.6052083373069763, + "rewards/countdown_reward_func": 0.6052083075046539, + "step": 1165 + }, + { + "epoch": 0.01942265087535189, + "grad_norm": 0.0711822658777237, + "learning_rate": 3e-06, + "loss": -0.0114, + "step": 1166 + }, + { + "epoch": 0.019439308380390784, + "grad_norm": 0.1113245040178299, + "learning_rate": 3e-06, + "loss": -0.0171, + "step": 1167 + }, + { + "epoch": 0.01945596588542968, + "grad_norm": 0.07089671492576599, + "learning_rate": 3e-06, + "loss": -0.0117, + "step": 1168 + }, + { + "completion_length": 76.95833587646484, + "epoch": 0.019472623390468576, + "grad_norm": 0.021820804104208946, + "learning_rate": 3e-06, + "loss": 0.0003, + "reward": 0.5406250208616257, + "rewards/countdown_reward_func": 0.5406249910593033, + "step": 1169 + }, + { + "epoch": 0.01948928089550747, + "grad_norm": 0.016696326434612274, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 1170 + }, + { + "epoch": 0.019505938400546367, + "grad_norm": 0.02234882116317749, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 1171 + }, + { + "epoch": 0.01952259590558526, + "grad_norm": 0.017860665917396545, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 1172 + }, + { + "completion_length": 77.30208587646484, + "epoch": 0.019539253410624156, + "grad_norm": 0.030554965138435364, + "learning_rate": 3e-06, + "loss": 0.0018, + "reward": 0.7468750178813934, + "rewards/countdown_reward_func": 0.746874988079071, + "step": 1173 + }, + { + "epoch": 0.01955591091566305, + "grad_norm": 0.0319824293255806, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 1174 + }, + { + "epoch": 0.019572568420701947, + "grad_norm": 0.03362557291984558, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1175 + }, + { + "epoch": 0.019589225925740843, + "grad_norm": 0.03169107437133789, + "learning_rate": 3e-06, + "loss": 0.0123, + "step": 1176 + }, + { + "completion_length": 77.52083587646484, + "epoch": 0.01960588343077974, + "grad_norm": 0.07357499748468399, + "learning_rate": 3e-06, + "loss": 0.0185, + "reward": 0.5020833313465118, + "rewards/countdown_reward_func": 0.5020833313465118, + "step": 1177 + }, + { + "epoch": 0.019622540935818632, + "grad_norm": 0.09687095135450363, + "learning_rate": 3e-06, + "loss": -0.003, + "step": 1178 + }, + { + "epoch": 0.019639198440857528, + "grad_norm": 0.07436784356832504, + "learning_rate": 3e-06, + "loss": 0.0183, + "step": 1179 + }, + { + "epoch": 0.019655855945896424, + "grad_norm": 0.08235058188438416, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1180 + }, + { + "completion_length": 77.46875381469727, + "epoch": 0.01967251345093532, + "grad_norm": 0.05219286307692528, + "learning_rate": 3e-06, + "loss": 0.0002, + "reward": 0.6812500059604645, + "rewards/countdown_reward_func": 0.6812499761581421, + "step": 1181 + }, + { + "epoch": 0.019689170955974215, + "grad_norm": 0.02998141013085842, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1182 + }, + { + "epoch": 0.019705828461013108, + "grad_norm": 0.046992551535367966, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1183 + }, + { + "epoch": 0.019722485966052004, + "grad_norm": 0.029123343527317047, + "learning_rate": 3e-06, + "loss": -0.0021, + "step": 1184 + }, + { + "completion_length": 77.54166793823242, + "epoch": 0.0197391434710909, + "grad_norm": 0.04384394735097885, + "learning_rate": 3e-06, + "loss": -0.003, + "reward": 0.6343750357627869, + "rewards/countdown_reward_func": 0.6343749761581421, + "step": 1185 + }, + { + "epoch": 0.019755800976129795, + "grad_norm": 0.13559691607952118, + "learning_rate": 3e-06, + "loss": -0.0046, + "step": 1186 + }, + { + "epoch": 0.01977245848116869, + "grad_norm": 0.04415891692042351, + "learning_rate": 3e-06, + "loss": -0.0033, + "step": 1187 + }, + { + "epoch": 0.019789115986207587, + "grad_norm": 0.13404782116413116, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 1188 + }, + { + "completion_length": 77.03125381469727, + "epoch": 0.01980577349124648, + "grad_norm": 0.0698050931096077, + "learning_rate": 3e-06, + "loss": 0.0051, + "reward": 0.7468750476837158, + "rewards/countdown_reward_func": 0.746874988079071, + "step": 1189 + }, + { + "epoch": 0.019822430996285376, + "grad_norm": 0.05211123079061508, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1190 + }, + { + "epoch": 0.01983908850132427, + "grad_norm": 0.10686541348695755, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1191 + }, + { + "epoch": 0.019855746006363167, + "grad_norm": 0.05185304582118988, + "learning_rate": 3e-06, + "loss": 0.0125, + "step": 1192 + }, + { + "completion_length": 77.72916793823242, + "epoch": 0.019872403511402063, + "grad_norm": 0.05780575051903725, + "learning_rate": 3e-06, + "loss": 0.0039, + "reward": 0.4739583730697632, + "rewards/countdown_reward_func": 0.473958358168602, + "step": 1193 + }, + { + "epoch": 0.019889061016440956, + "grad_norm": 0.03962497040629387, + "learning_rate": 3e-06, + "loss": 0.0013, + "step": 1194 + }, + { + "epoch": 0.01990571852147985, + "grad_norm": 0.057405728846788406, + "learning_rate": 3e-06, + "loss": 0.0038, + "step": 1195 + }, + { + "epoch": 0.019922376026518748, + "grad_norm": 0.03901403769850731, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1196 + }, + { + "completion_length": 77.08333587646484, + "epoch": 0.019939033531557643, + "grad_norm": 0.04806802421808243, + "learning_rate": 3e-06, + "loss": 0.0034, + "reward": 0.6437500417232513, + "rewards/countdown_reward_func": 0.643750011920929, + "step": 1197 + }, + { + "epoch": 0.01995569103659654, + "grad_norm": 0.03269192948937416, + "learning_rate": 3e-06, + "loss": -0.0038, + "step": 1198 + }, + { + "epoch": 0.019972348541635435, + "grad_norm": 0.06389960646629333, + "learning_rate": 3e-06, + "loss": 0.0033, + "step": 1199 + }, + { + "epoch": 0.019989006046674328, + "grad_norm": 0.03255591541528702, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1200 + }, + { + "completion_length": 77.38542175292969, + "epoch": 0.020005663551713224, + "grad_norm": 0.09769749641418457, + "learning_rate": 3e-06, + "loss": 0.0099, + "reward": 0.6156250238418579, + "rewards/countdown_reward_func": 0.6156249940395355, + "step": 1201 + }, + { + "epoch": 0.02002232105675212, + "grad_norm": 0.06701067835092545, + "learning_rate": 3e-06, + "loss": -0.002, + "step": 1202 + }, + { + "epoch": 0.020038978561791015, + "grad_norm": 0.08442507684230804, + "learning_rate": 3e-06, + "loss": 0.0093, + "step": 1203 + }, + { + "epoch": 0.02005563606682991, + "grad_norm": 0.0673530176281929, + "learning_rate": 3e-06, + "loss": -0.0022, + "step": 1204 + }, + { + "completion_length": 77.48958587646484, + "epoch": 0.020072293571868804, + "grad_norm": 0.10409381985664368, + "learning_rate": 3e-06, + "loss": 0.0021, + "reward": 0.6812500357627869, + "rewards/countdown_reward_func": 0.6812500059604645, + "step": 1205 + }, + { + "epoch": 0.0200889510769077, + "grad_norm": 0.11577782779932022, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 1206 + }, + { + "epoch": 0.020105608581946596, + "grad_norm": 0.10105322301387787, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 1207 + }, + { + "epoch": 0.02012226608698549, + "grad_norm": 0.1235220730304718, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1208 + }, + { + "completion_length": 77.69791793823242, + "epoch": 0.020138923592024387, + "grad_norm": 0.06561663001775742, + "learning_rate": 3e-06, + "loss": 0.0158, + "reward": 0.6625000536441803, + "rewards/countdown_reward_func": 0.6625000536441803, + "step": 1209 + }, + { + "epoch": 0.020155581097063283, + "grad_norm": 0.0588885098695755, + "learning_rate": 3e-06, + "loss": -0.0182, + "step": 1210 + }, + { + "epoch": 0.020172238602102176, + "grad_norm": 0.0672120675444603, + "learning_rate": 3e-06, + "loss": 0.0156, + "step": 1211 + }, + { + "epoch": 0.02018889610714107, + "grad_norm": 0.058216314762830734, + "learning_rate": 3e-06, + "loss": -0.0185, + "step": 1212 + }, + { + "completion_length": 77.02083587646484, + "epoch": 0.020205553612179968, + "grad_norm": 0.06464819610118866, + "learning_rate": 3e-06, + "loss": 0.0056, + "reward": 0.7937500476837158, + "rewards/countdown_reward_func": 0.793749988079071, + "step": 1213 + }, + { + "epoch": 0.020222211117218863, + "grad_norm": 0.08003026992082596, + "learning_rate": 3e-06, + "loss": 0.0181, + "step": 1214 + }, + { + "epoch": 0.02023886862225776, + "grad_norm": 0.056547366082668304, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1215 + }, + { + "epoch": 0.020255526127296652, + "grad_norm": 0.079323910176754, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 1216 + }, + { + "completion_length": 77.61458587646484, + "epoch": 0.020272183632335548, + "grad_norm": 0.01872013881802559, + "learning_rate": 3e-06, + "loss": 0.0121, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 1217 + }, + { + "epoch": 0.020288841137374444, + "grad_norm": 0.018633369356393814, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 1218 + }, + { + "epoch": 0.02030549864241334, + "grad_norm": 0.015200788155198097, + "learning_rate": 3e-06, + "loss": 0.0121, + "step": 1219 + }, + { + "epoch": 0.020322156147452235, + "grad_norm": 0.01955927163362503, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 1220 + }, + { + "completion_length": 77.85416793823242, + "epoch": 0.02033881365249113, + "grad_norm": 0.059634968638420105, + "learning_rate": 3e-06, + "loss": 0.0081, + "reward": 0.503125011920929, + "rewards/countdown_reward_func": 0.503125011920929, + "step": 1221 + }, + { + "epoch": 0.020355471157530024, + "grad_norm": 0.04693957790732384, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 1222 + }, + { + "epoch": 0.02037212866256892, + "grad_norm": 0.061445076018571854, + "learning_rate": 3e-06, + "loss": 0.008, + "step": 1223 + }, + { + "epoch": 0.020388786167607816, + "grad_norm": 0.04703660309314728, + "learning_rate": 3e-06, + "loss": 0.0001, + "step": 1224 + }, + { + "completion_length": 77.51041793823242, + "epoch": 0.02040544367264671, + "grad_norm": 0.08981825411319733, + "learning_rate": 3e-06, + "loss": 0.006, + "reward": 0.4750000536441803, + "rewards/countdown_reward_func": 0.4750000089406967, + "step": 1225 + }, + { + "epoch": 0.020422101177685607, + "grad_norm": 0.05219965800642967, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 1226 + }, + { + "epoch": 0.020438758682724503, + "grad_norm": 0.09326458722352982, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1227 + }, + { + "epoch": 0.020455416187763396, + "grad_norm": 0.04930628836154938, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1228 + }, + { + "completion_length": 77.80208587646484, + "epoch": 0.02047207369280229, + "grad_norm": 0.05841166153550148, + "learning_rate": 3e-06, + "loss": 0.0172, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.596875011920929, + "step": 1229 + }, + { + "epoch": 0.020488731197841187, + "grad_norm": 0.04643399268388748, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 1230 + }, + { + "epoch": 0.020505388702880083, + "grad_norm": 0.058714866638183594, + "learning_rate": 3e-06, + "loss": 0.017, + "step": 1231 + }, + { + "epoch": 0.02052204620791898, + "grad_norm": 0.04737931862473488, + "learning_rate": 3e-06, + "loss": 0.0065, + "step": 1232 + }, + { + "completion_length": 77.51041793823242, + "epoch": 0.02053870371295787, + "grad_norm": 0.02828388847410679, + "learning_rate": 3e-06, + "loss": -0.004, + "reward": 0.7000000178813934, + "rewards/countdown_reward_func": 0.699999988079071, + "step": 1233 + }, + { + "epoch": 0.020555361217996768, + "grad_norm": 0.0453546978533268, + "learning_rate": 3e-06, + "loss": -0.0052, + "step": 1234 + }, + { + "epoch": 0.020572018723035664, + "grad_norm": 0.027774035930633545, + "learning_rate": 3e-06, + "loss": -0.0041, + "step": 1235 + }, + { + "epoch": 0.02058867622807456, + "grad_norm": 0.04618718847632408, + "learning_rate": 3e-06, + "loss": -0.0051, + "step": 1236 + }, + { + "completion_length": 77.92708587646484, + "epoch": 0.020605333733113455, + "grad_norm": 0.05377303436398506, + "learning_rate": 3e-06, + "loss": 0.0085, + "reward": 0.5312500298023224, + "rewards/countdown_reward_func": 0.5312500298023224, + "step": 1237 + }, + { + "epoch": 0.02062199123815235, + "grad_norm": 0.11201160401105881, + "learning_rate": 3e-06, + "loss": 0.0054, + "step": 1238 + }, + { + "epoch": 0.020638648743191244, + "grad_norm": 0.05213050916790962, + "learning_rate": 3e-06, + "loss": 0.0083, + "step": 1239 + }, + { + "epoch": 0.02065530624823014, + "grad_norm": 0.12415870279073715, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1240 + }, + { + "completion_length": 77.76041793823242, + "epoch": 0.020671963753269035, + "grad_norm": 0.029086435213685036, + "learning_rate": 3e-06, + "loss": -0.0042, + "reward": 0.6531250178813934, + "rewards/countdown_reward_func": 0.6531250178813934, + "step": 1241 + }, + { + "epoch": 0.02068862125830793, + "grad_norm": 0.060368917882442474, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1242 + }, + { + "epoch": 0.020705278763346827, + "grad_norm": 0.029162611812353134, + "learning_rate": 3e-06, + "loss": -0.0042, + "step": 1243 + }, + { + "epoch": 0.02072193626838572, + "grad_norm": 0.05868222191929817, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1244 + }, + { + "completion_length": 77.85417175292969, + "epoch": 0.020738593773424616, + "grad_norm": 0.04332929104566574, + "learning_rate": 3e-06, + "loss": 0.0054, + "reward": 0.4656250327825546, + "rewards/countdown_reward_func": 0.46562500298023224, + "step": 1245 + }, + { + "epoch": 0.02075525127846351, + "grad_norm": 0.0251142755150795, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1246 + }, + { + "epoch": 0.020771908783502407, + "grad_norm": 0.04833066463470459, + "learning_rate": 3e-06, + "loss": 0.0053, + "step": 1247 + }, + { + "epoch": 0.020788566288541303, + "grad_norm": 0.022098351269960403, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1248 + }, + { + "completion_length": 77.38541793823242, + "epoch": 0.0208052237935802, + "grad_norm": 0.12820112705230713, + "learning_rate": 3e-06, + "loss": 0.0118, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 1249 + }, + { + "epoch": 0.02082188129861909, + "grad_norm": 0.12056494504213333, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1250 + }, + { + "epoch": 0.020838538803657988, + "grad_norm": 0.12334901839494705, + "learning_rate": 3e-06, + "loss": 0.0107, + "step": 1251 + }, + { + "epoch": 0.020855196308696883, + "grad_norm": 0.11485328525304794, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 1252 + }, + { + "completion_length": 77.06250381469727, + "epoch": 0.02087185381373578, + "grad_norm": 0.0848863422870636, + "learning_rate": 3e-06, + "loss": 0.0024, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 1253 + }, + { + "epoch": 0.020888511318774675, + "grad_norm": 0.09343966841697693, + "learning_rate": 3e-06, + "loss": -0.0158, + "step": 1254 + }, + { + "epoch": 0.020905168823813568, + "grad_norm": 0.08392602205276489, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1255 + }, + { + "epoch": 0.020921826328852464, + "grad_norm": 0.09431532025337219, + "learning_rate": 3e-06, + "loss": -0.0159, + "step": 1256 + }, + { + "completion_length": 77.45833587646484, + "epoch": 0.02093848383389136, + "grad_norm": 0.030673282220959663, + "learning_rate": 3e-06, + "loss": 0.0001, + "reward": 0.578125, + "rewards/countdown_reward_func": 0.578125, + "step": 1257 + }, + { + "epoch": 0.020955141338930255, + "grad_norm": 0.038777973502874374, + "learning_rate": 3e-06, + "loss": 0.0137, + "step": 1258 + }, + { + "epoch": 0.02097179884396915, + "grad_norm": 0.03246590495109558, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1259 + }, + { + "epoch": 0.020988456349008047, + "grad_norm": 0.03822234645485878, + "learning_rate": 3e-06, + "loss": 0.0138, + "step": 1260 + }, + { + "completion_length": 77.25000381469727, + "epoch": 0.02100511385404694, + "grad_norm": 0.08232469111680984, + "learning_rate": 3e-06, + "loss": 0.0158, + "reward": 0.5781250596046448, + "rewards/countdown_reward_func": 0.5781250298023224, + "step": 1261 + }, + { + "epoch": 0.021021771359085836, + "grad_norm": 0.06742919236421585, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1262 + }, + { + "epoch": 0.02103842886412473, + "grad_norm": 0.0765688419342041, + "learning_rate": 3e-06, + "loss": 0.0155, + "step": 1263 + }, + { + "epoch": 0.021055086369163627, + "grad_norm": 0.06923536211252213, + "learning_rate": 3e-06, + "loss": 0.0075, + "step": 1264 + }, + { + "completion_length": 77.58333587646484, + "epoch": 0.021071743874202523, + "grad_norm": 0.034220464527606964, + "learning_rate": 3e-06, + "loss": 0.0031, + "reward": 0.6156250238418579, + "rewards/countdown_reward_func": 0.6156249940395355, + "step": 1265 + }, + { + "epoch": 0.021088401379241416, + "grad_norm": 0.03089277818799019, + "learning_rate": 3e-06, + "loss": -0.0015, + "step": 1266 + }, + { + "epoch": 0.02110505888428031, + "grad_norm": 0.03297910466790199, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1267 + }, + { + "epoch": 0.021121716389319208, + "grad_norm": 0.03010323829948902, + "learning_rate": 3e-06, + "loss": -0.0015, + "step": 1268 + }, + { + "completion_length": 77.67708587646484, + "epoch": 0.021138373894358103, + "grad_norm": 0.0724530965089798, + "learning_rate": 3e-06, + "loss": 0.0051, + "reward": 0.45625000447034836, + "rewards/countdown_reward_func": 0.45625000447034836, + "step": 1269 + }, + { + "epoch": 0.021155031399397, + "grad_norm": 0.07043136656284332, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1270 + }, + { + "epoch": 0.021171688904435895, + "grad_norm": 0.07882285863161087, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1271 + }, + { + "epoch": 0.021188346409474788, + "grad_norm": 0.07043449580669403, + "learning_rate": 3e-06, + "loss": 0.0112, + "step": 1272 + }, + { + "completion_length": 77.13542175292969, + "epoch": 0.021205003914513684, + "grad_norm": 0.07695278525352478, + "learning_rate": 3e-06, + "loss": 0.0145, + "reward": 0.6624999940395355, + "rewards/countdown_reward_func": 0.6624999940395355, + "step": 1273 + }, + { + "epoch": 0.02122166141955258, + "grad_norm": 0.05775652453303337, + "learning_rate": 3e-06, + "loss": -0.0035, + "step": 1274 + }, + { + "epoch": 0.021238318924591475, + "grad_norm": 0.08633749186992645, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 1275 + }, + { + "epoch": 0.02125497642963037, + "grad_norm": 0.05807287618517876, + "learning_rate": 3e-06, + "loss": -0.0039, + "step": 1276 + }, + { + "completion_length": 77.91667175292969, + "epoch": 0.021271633934669264, + "grad_norm": 0.04743451997637749, + "learning_rate": 3e-06, + "loss": -0.0035, + "reward": 0.8125000298023224, + "rewards/countdown_reward_func": 0.8125, + "step": 1277 + }, + { + "epoch": 0.02128829143970816, + "grad_norm": 0.03519059345126152, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 1278 + }, + { + "epoch": 0.021304948944747056, + "grad_norm": 0.047692183405160904, + "learning_rate": 3e-06, + "loss": -0.0037, + "step": 1279 + }, + { + "epoch": 0.02132160644978595, + "grad_norm": 0.04458218067884445, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 1280 + }, + { + "completion_length": 77.63542175292969, + "epoch": 0.021338263954824847, + "grad_norm": 2.9618438812661907e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6624999642372131, + "step": 1281 + }, + { + "epoch": 0.021354921459863743, + "grad_norm": 3.4139109317266048e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1282 + }, + { + "epoch": 0.021371578964902636, + "grad_norm": 2.9028535131203625e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1283 + }, + { + "epoch": 0.02138823646994153, + "grad_norm": 3.484261990038817e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1284 + }, + { + "completion_length": 77.40625381469727, + "epoch": 0.021404893974980427, + "grad_norm": 0.02203640341758728, + "learning_rate": 3e-06, + "loss": 0.0072, + "reward": 0.8218750357627869, + "rewards/countdown_reward_func": 0.8218750059604645, + "step": 1285 + }, + { + "epoch": 0.021421551480019323, + "grad_norm": 0.04227157309651375, + "learning_rate": 3e-06, + "loss": -0.0007, + "step": 1286 + }, + { + "epoch": 0.02143820898505822, + "grad_norm": 0.023128312081098557, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1287 + }, + { + "epoch": 0.02145486649009711, + "grad_norm": 0.044070448726415634, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1288 + }, + { + "completion_length": 77.79167175292969, + "epoch": 0.021471523995136008, + "grad_norm": 0.07362130284309387, + "learning_rate": 3e-06, + "loss": 0.0053, + "reward": 0.6156250089406967, + "rewards/countdown_reward_func": 0.6156249791383743, + "step": 1289 + }, + { + "epoch": 0.021488181500174904, + "grad_norm": 0.06816733628511429, + "learning_rate": 3e-06, + "loss": 0.0011, + "step": 1290 + }, + { + "epoch": 0.0215048390052138, + "grad_norm": 0.06870274990797043, + "learning_rate": 3e-06, + "loss": 0.0051, + "step": 1291 + }, + { + "epoch": 0.021521496510252695, + "grad_norm": 0.06937043368816376, + "learning_rate": 3e-06, + "loss": 0.0009, + "step": 1292 + }, + { + "completion_length": 77.84375, + "epoch": 0.02153815401529159, + "grad_norm": 0.06988618522882462, + "learning_rate": 3e-06, + "loss": -0.002, + "reward": 0.6062500178813934, + "rewards/countdown_reward_func": 0.6062500029802322, + "step": 1293 + }, + { + "epoch": 0.021554811520330484, + "grad_norm": 0.08443444222211838, + "learning_rate": 3e-06, + "loss": -0.0007, + "step": 1294 + }, + { + "epoch": 0.02157146902536938, + "grad_norm": 0.07036253809928894, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1295 + }, + { + "epoch": 0.021588126530408275, + "grad_norm": 0.0853412076830864, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 1296 + }, + { + "completion_length": 78.07292175292969, + "epoch": 0.02160478403544717, + "grad_norm": 0.031074481084942818, + "learning_rate": 3e-06, + "loss": 0.0114, + "reward": 0.41875001788139343, + "rewards/countdown_reward_func": 0.41875000298023224, + "step": 1297 + }, + { + "epoch": 0.021621441540486067, + "grad_norm": 0.04179736599326134, + "learning_rate": 3e-06, + "loss": -0.0108, + "step": 1298 + }, + { + "epoch": 0.02163809904552496, + "grad_norm": 0.030724365264177322, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1299 + }, + { + "epoch": 0.021654756550563856, + "grad_norm": 0.03869551420211792, + "learning_rate": 3e-06, + "loss": -0.0109, + "step": 1300 + }, + { + "completion_length": 77.60416793823242, + "epoch": 0.02167141405560275, + "grad_norm": 0.04001285880804062, + "learning_rate": 3e-06, + "loss": -0.0032, + "reward": 0.6250000596046448, + "rewards/countdown_reward_func": 0.6250000298023224, + "step": 1301 + }, + { + "epoch": 0.021688071560641647, + "grad_norm": 0.06957279145717621, + "learning_rate": 3e-06, + "loss": -0.0009, + "step": 1302 + }, + { + "epoch": 0.021704729065680543, + "grad_norm": 0.04956434667110443, + "learning_rate": 3e-06, + "loss": -0.0034, + "step": 1303 + }, + { + "epoch": 0.02172138657071944, + "grad_norm": 0.06957556307315826, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1304 + }, + { + "completion_length": 78.19791793823242, + "epoch": 0.02173804407575833, + "grad_norm": 0.0486566424369812, + "learning_rate": 3e-06, + "loss": 0.0036, + "reward": 0.5968749970197678, + "rewards/countdown_reward_func": 0.5968749672174454, + "step": 1305 + }, + { + "epoch": 0.021754701580797228, + "grad_norm": 0.04246492683887482, + "learning_rate": 3e-06, + "loss": 0.0071, + "step": 1306 + }, + { + "epoch": 0.021771359085836123, + "grad_norm": 0.053005609661340714, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1307 + }, + { + "epoch": 0.02178801659087502, + "grad_norm": 0.04268142208456993, + "learning_rate": 3e-06, + "loss": 0.0069, + "step": 1308 + }, + { + "completion_length": 77.875, + "epoch": 0.021804674095913915, + "grad_norm": 0.05287226289510727, + "learning_rate": 3e-06, + "loss": 0.0115, + "reward": 0.5687500238418579, + "rewards/countdown_reward_func": 0.5687500089406967, + "step": 1309 + }, + { + "epoch": 0.021821331600952808, + "grad_norm": 0.03528960421681404, + "learning_rate": 3e-06, + "loss": 0.0027, + "step": 1310 + }, + { + "epoch": 0.021837989105991704, + "grad_norm": 0.055722806602716446, + "learning_rate": 3e-06, + "loss": 0.0114, + "step": 1311 + }, + { + "epoch": 0.0218546466110306, + "grad_norm": 0.0382276326417923, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1312 + }, + { + "completion_length": 77.80208587646484, + "epoch": 0.021871304116069495, + "grad_norm": 0.05332973226904869, + "learning_rate": 3e-06, + "loss": 0.0003, + "reward": 0.4937500059604645, + "rewards/countdown_reward_func": 0.4937500059604645, + "step": 1313 + }, + { + "epoch": 0.02188796162110839, + "grad_norm": 0.05508018285036087, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1314 + }, + { + "epoch": 0.021904619126147287, + "grad_norm": 0.0572441890835762, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 1315 + }, + { + "epoch": 0.02192127663118618, + "grad_norm": 0.049032632261514664, + "learning_rate": 3e-06, + "loss": 0.0068, + "step": 1316 + }, + { + "completion_length": 77.69791793823242, + "epoch": 0.021937934136225076, + "grad_norm": 0.12304896116256714, + "learning_rate": 3e-06, + "loss": 0.0045, + "reward": 0.596875011920929, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 1317 + }, + { + "epoch": 0.02195459164126397, + "grad_norm": 0.07988425344228745, + "learning_rate": 3e-06, + "loss": 0.0032, + "step": 1318 + }, + { + "epoch": 0.021971249146302867, + "grad_norm": 0.11930066347122192, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1319 + }, + { + "epoch": 0.021987906651341763, + "grad_norm": 0.07015310227870941, + "learning_rate": 3e-06, + "loss": 0.0028, + "step": 1320 + }, + { + "completion_length": 77.80208587646484, + "epoch": 0.022004564156380656, + "grad_norm": 0.09011714905500412, + "learning_rate": 3e-06, + "loss": 0.0163, + "reward": 0.625, + "rewards/countdown_reward_func": 0.625, + "step": 1321 + }, + { + "epoch": 0.02202122166141955, + "grad_norm": 0.0585167333483696, + "learning_rate": 3e-06, + "loss": 0.0084, + "step": 1322 + }, + { + "epoch": 0.022037879166458448, + "grad_norm": 0.09310527890920639, + "learning_rate": 3e-06, + "loss": 0.0158, + "step": 1323 + }, + { + "epoch": 0.022054536671497343, + "grad_norm": 0.0617208294570446, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1324 + }, + { + "completion_length": 77.48958587646484, + "epoch": 0.02207119417653624, + "grad_norm": 3.3768712270898504e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.550000011920929, + "rewards/countdown_reward_func": 0.550000011920929, + "step": 1325 + }, + { + "epoch": 0.022087851681575135, + "grad_norm": 3.114847713092672e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1326 + }, + { + "epoch": 0.022104509186614028, + "grad_norm": 3.191032771354685e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1327 + }, + { + "epoch": 0.022121166691652924, + "grad_norm": 3.0168005871189507e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1328 + }, + { + "completion_length": 78.16667175292969, + "epoch": 0.02213782419669182, + "grad_norm": 0.05989151448011398, + "learning_rate": 3e-06, + "loss": 0.0024, + "reward": 0.7739583551883698, + "rewards/countdown_reward_func": 0.7739583253860474, + "step": 1329 + }, + { + "epoch": 0.022154481701730715, + "grad_norm": 0.06069450080394745, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1330 + }, + { + "epoch": 0.02217113920676961, + "grad_norm": 0.07376185804605484, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 1331 + }, + { + "epoch": 0.022187796711808504, + "grad_norm": 0.05904784053564072, + "learning_rate": 3e-06, + "loss": 0.002, + "step": 1332 + }, + { + "completion_length": 77.32291793823242, + "epoch": 0.0222044542168474, + "grad_norm": 0.07198242098093033, + "learning_rate": 3e-06, + "loss": 0.0135, + "reward": 0.5687500238418579, + "rewards/countdown_reward_func": 0.5687500238418579, + "step": 1333 + }, + { + "epoch": 0.022221111721886296, + "grad_norm": 0.05194592848420143, + "learning_rate": 3e-06, + "loss": -0.0029, + "step": 1334 + }, + { + "epoch": 0.02223776922692519, + "grad_norm": 0.07979171723127365, + "learning_rate": 3e-06, + "loss": 0.0134, + "step": 1335 + }, + { + "epoch": 0.022254426731964087, + "grad_norm": 0.05527674779295921, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1336 + }, + { + "completion_length": 77.73958587646484, + "epoch": 0.022271084237002983, + "grad_norm": 0.09083087742328644, + "learning_rate": 3e-06, + "loss": 0.0215, + "reward": 0.5406250208616257, + "rewards/countdown_reward_func": 0.5406249910593033, + "step": 1337 + }, + { + "epoch": 0.022287741742041876, + "grad_norm": 0.08941887319087982, + "learning_rate": 3e-06, + "loss": 0.0277, + "step": 1338 + }, + { + "epoch": 0.02230439924708077, + "grad_norm": 0.09128575026988983, + "learning_rate": 3e-06, + "loss": 0.0211, + "step": 1339 + }, + { + "epoch": 0.022321056752119667, + "grad_norm": 0.08947112411260605, + "learning_rate": 3e-06, + "loss": 0.0272, + "step": 1340 + }, + { + "completion_length": 77.40625381469727, + "epoch": 0.022337714257158563, + "grad_norm": 0.042494915425777435, + "learning_rate": 3e-06, + "loss": -0.0048, + "reward": 0.7093749940395355, + "rewards/countdown_reward_func": 0.7093749940395355, + "step": 1341 + }, + { + "epoch": 0.02235437176219746, + "grad_norm": 0.013886269181966782, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1342 + }, + { + "epoch": 0.02237102926723635, + "grad_norm": 0.04492849111557007, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 1343 + }, + { + "epoch": 0.022387686772275248, + "grad_norm": 0.013192708604037762, + "learning_rate": 3e-06, + "loss": 0.0049, + "step": 1344 + }, + { + "completion_length": 77.19791793823242, + "epoch": 0.022404344277314144, + "grad_norm": 0.05899173766374588, + "learning_rate": 3e-06, + "loss": 0.005, + "reward": 0.7281250357627869, + "rewards/countdown_reward_func": 0.7281250357627869, + "step": 1345 + }, + { + "epoch": 0.02242100178235304, + "grad_norm": 0.08368319272994995, + "learning_rate": 3e-06, + "loss": 0.0148, + "step": 1346 + }, + { + "epoch": 0.022437659287391935, + "grad_norm": 0.05564848706126213, + "learning_rate": 3e-06, + "loss": 0.0047, + "step": 1347 + }, + { + "epoch": 0.02245431679243083, + "grad_norm": 0.0816107764840126, + "learning_rate": 3e-06, + "loss": 0.0142, + "step": 1348 + }, + { + "completion_length": 78.06250381469727, + "epoch": 0.022470974297469724, + "grad_norm": 0.04288644716143608, + "learning_rate": 3e-06, + "loss": 0.0014, + "reward": 0.7093749940395355, + "rewards/countdown_reward_func": 0.7093749642372131, + "step": 1349 + }, + { + "epoch": 0.02248763180250862, + "grad_norm": 0.02451229654252529, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1350 + }, + { + "epoch": 0.022504289307547515, + "grad_norm": 0.044914934784173965, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1351 + }, + { + "epoch": 0.02252094681258641, + "grad_norm": 0.023603985086083412, + "learning_rate": 3e-06, + "loss": 0.0035, + "step": 1352 + }, + { + "completion_length": 77.84375381469727, + "epoch": 0.022537604317625307, + "grad_norm": 0.05233975127339363, + "learning_rate": 3e-06, + "loss": 0.0107, + "reward": 0.6062500476837158, + "rewards/countdown_reward_func": 0.606249988079071, + "step": 1353 + }, + { + "epoch": 0.0225542618226642, + "grad_norm": 0.06866282224655151, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1354 + }, + { + "epoch": 0.022570919327703096, + "grad_norm": 0.057136110961437225, + "learning_rate": 3e-06, + "loss": 0.0103, + "step": 1355 + }, + { + "epoch": 0.02258757683274199, + "grad_norm": 0.04932220280170441, + "learning_rate": 3e-06, + "loss": 0.0073, + "step": 1356 + }, + { + "completion_length": 77.4375, + "epoch": 0.022604234337780887, + "grad_norm": 0.03950830176472664, + "learning_rate": 3e-06, + "loss": -0.0046, + "reward": 0.7000000178813934, + "rewards/countdown_reward_func": 0.699999988079071, + "step": 1357 + }, + { + "epoch": 0.022620891842819783, + "grad_norm": 0.034459300339221954, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1358 + }, + { + "epoch": 0.02263754934785868, + "grad_norm": 0.03856908157467842, + "learning_rate": 3e-06, + "loss": -0.0049, + "step": 1359 + }, + { + "epoch": 0.02265420685289757, + "grad_norm": 0.03429027646780014, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1360 + }, + { + "completion_length": 77.85417175292969, + "epoch": 0.022670864357936468, + "grad_norm": 0.08360004425048828, + "learning_rate": 3e-06, + "loss": 0.0023, + "reward": 0.625, + "rewards/countdown_reward_func": 0.625, + "step": 1361 + }, + { + "epoch": 0.022687521862975363, + "grad_norm": 0.09209456294775009, + "learning_rate": 3e-06, + "loss": -0.0006, + "step": 1362 + }, + { + "epoch": 0.02270417936801426, + "grad_norm": 0.07783455401659012, + "learning_rate": 3e-06, + "loss": 0.0023, + "step": 1363 + }, + { + "epoch": 0.022720836873053155, + "grad_norm": 0.08715023845434189, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 1364 + }, + { + "completion_length": 77.64583587646484, + "epoch": 0.022737494378092048, + "grad_norm": 0.07262544333934784, + "learning_rate": 3e-06, + "loss": 0.0124, + "reward": 0.5875000357627869, + "rewards/countdown_reward_func": 0.5875000059604645, + "step": 1365 + }, + { + "epoch": 0.022754151883130944, + "grad_norm": 0.09045146405696869, + "learning_rate": 3e-06, + "loss": -0.0036, + "step": 1366 + }, + { + "epoch": 0.02277080938816984, + "grad_norm": 0.07410691678524017, + "learning_rate": 3e-06, + "loss": 0.0121, + "step": 1367 + }, + { + "epoch": 0.022787466893208735, + "grad_norm": 0.07934346050024033, + "learning_rate": 3e-06, + "loss": -0.004, + "step": 1368 + }, + { + "completion_length": 77.23958587646484, + "epoch": 0.02280412439824763, + "grad_norm": 0.054384805262088776, + "learning_rate": 3e-06, + "loss": -0.0054, + "reward": 0.7468750178813934, + "rewards/countdown_reward_func": 0.7468750178813934, + "step": 1369 + }, + { + "epoch": 0.022820781903286527, + "grad_norm": 0.06647000461816788, + "learning_rate": 3e-06, + "loss": 0.0094, + "step": 1370 + }, + { + "epoch": 0.02283743940832542, + "grad_norm": 0.054883357137441635, + "learning_rate": 3e-06, + "loss": -0.0055, + "step": 1371 + }, + { + "epoch": 0.022854096913364316, + "grad_norm": 0.06919855624437332, + "learning_rate": 3e-06, + "loss": 0.0093, + "step": 1372 + }, + { + "completion_length": 77.29166793823242, + "epoch": 0.02287075441840321, + "grad_norm": 0.05632363259792328, + "learning_rate": 3e-06, + "loss": 0.0028, + "reward": 0.7468750476837158, + "rewards/countdown_reward_func": 0.746874988079071, + "step": 1373 + }, + { + "epoch": 0.022887411923442107, + "grad_norm": 0.0659336969256401, + "learning_rate": 3e-06, + "loss": 0.0012, + "step": 1374 + }, + { + "epoch": 0.022904069428481003, + "grad_norm": 0.06821155548095703, + "learning_rate": 3e-06, + "loss": 0.0026, + "step": 1375 + }, + { + "epoch": 0.022920726933519896, + "grad_norm": 0.06393817812204361, + "learning_rate": 3e-06, + "loss": 0.001, + "step": 1376 + }, + { + "completion_length": 77.59375, + "epoch": 0.02293738443855879, + "grad_norm": 0.09811081737279892, + "learning_rate": 3e-06, + "loss": 0.0071, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6625000238418579, + "step": 1377 + }, + { + "epoch": 0.022954041943597688, + "grad_norm": 0.09132862091064453, + "learning_rate": 3e-06, + "loss": -0.001, + "step": 1378 + }, + { + "epoch": 0.022970699448636583, + "grad_norm": 0.10226461291313171, + "learning_rate": 3e-06, + "loss": 0.0066, + "step": 1379 + }, + { + "epoch": 0.02298735695367548, + "grad_norm": 0.12431399524211884, + "learning_rate": 3e-06, + "loss": -0.0012, + "step": 1380 + }, + { + "completion_length": 77.56250381469727, + "epoch": 0.023004014458714375, + "grad_norm": 0.10309239476919174, + "learning_rate": 3e-06, + "loss": 0.0127, + "reward": 0.6250000298023224, + "rewards/countdown_reward_func": 0.6250000298023224, + "step": 1381 + }, + { + "epoch": 0.023020671963753268, + "grad_norm": 0.14624060690402985, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1382 + }, + { + "epoch": 0.023037329468792164, + "grad_norm": 0.10823620855808258, + "learning_rate": 3e-06, + "loss": 0.0124, + "step": 1383 + }, + { + "epoch": 0.02305398697383106, + "grad_norm": 0.10773634910583496, + "learning_rate": 3e-06, + "loss": 0.0052, + "step": 1384 + }, + { + "completion_length": 76.61458587646484, + "epoch": 0.023070644478869955, + "grad_norm": 0.06305456906557083, + "learning_rate": 3e-06, + "loss": 0.008, + "reward": 0.5499999970197678, + "rewards/countdown_reward_func": 0.5499999672174454, + "step": 1385 + }, + { + "epoch": 0.02308730198390885, + "grad_norm": 0.08236860483884811, + "learning_rate": 3e-06, + "loss": -0.0148, + "step": 1386 + }, + { + "epoch": 0.023103959488947744, + "grad_norm": 0.06321364641189575, + "learning_rate": 3e-06, + "loss": 0.0077, + "step": 1387 + }, + { + "epoch": 0.02312061699398664, + "grad_norm": 0.0842658206820488, + "learning_rate": 3e-06, + "loss": -0.0152, + "step": 1388 + }, + { + "completion_length": 76.95833587646484, + "epoch": 0.023137274499025536, + "grad_norm": 0.050507914274930954, + "learning_rate": 3e-06, + "loss": 0.0103, + "reward": 0.6343749910593033, + "rewards/countdown_reward_func": 0.6343749910593033, + "step": 1389 + }, + { + "epoch": 0.02315393200406443, + "grad_norm": 0.02278854139149189, + "learning_rate": 3e-06, + "loss": 0.0046, + "step": 1390 + }, + { + "epoch": 0.023170589509103327, + "grad_norm": 0.051197927445173264, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1391 + }, + { + "epoch": 0.023187247014142223, + "grad_norm": 0.02220262587070465, + "learning_rate": 3e-06, + "loss": 0.0045, + "step": 1392 + }, + { + "completion_length": 77.73958587646484, + "epoch": 0.023203904519181116, + "grad_norm": 3.705719064939217e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.6062500178813934, + "rewards/countdown_reward_func": 0.606249988079071, + "step": 1393 + }, + { + "epoch": 0.02322056202422001, + "grad_norm": 3.073959531363357e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1394 + }, + { + "epoch": 0.023237219529258907, + "grad_norm": 3.611813292891952e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1395 + }, + { + "epoch": 0.023253877034297803, + "grad_norm": 3.0612765655746443e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1396 + }, + { + "completion_length": 76.84375, + "epoch": 0.0232705345393367, + "grad_norm": 0.13253922760486603, + "learning_rate": 3e-06, + "loss": 0.0109, + "reward": 0.4843750447034836, + "rewards/countdown_reward_func": 0.4843750149011612, + "step": 1397 + }, + { + "epoch": 0.02328719204437559, + "grad_norm": 0.07008399069309235, + "learning_rate": 3e-06, + "loss": 0.0022, + "step": 1398 + }, + { + "epoch": 0.023303849549414488, + "grad_norm": 0.1310146152973175, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1399 + }, + { + "epoch": 0.023320507054453384, + "grad_norm": 0.06885950267314911, + "learning_rate": 3e-06, + "loss": 0.0018, + "step": 1400 + }, + { + "completion_length": 77.69791793823242, + "epoch": 0.02333716455949228, + "grad_norm": 0.07762031257152557, + "learning_rate": 3e-06, + "loss": -0.0056, + "reward": 0.41875001788139343, + "rewards/countdown_reward_func": 0.41875001788139343, + "step": 1401 + }, + { + "epoch": 0.023353822064531175, + "grad_norm": 0.10060802102088928, + "learning_rate": 3e-06, + "loss": 0.0134, + "step": 1402 + }, + { + "epoch": 0.02337047956957007, + "grad_norm": 0.07979115098714828, + "learning_rate": 3e-06, + "loss": -0.0059, + "step": 1403 + }, + { + "epoch": 0.023387137074608964, + "grad_norm": 0.10377225279808044, + "learning_rate": 3e-06, + "loss": 0.0132, + "step": 1404 + }, + { + "completion_length": 77.34375381469727, + "epoch": 0.02340379457964786, + "grad_norm": 0.058312948793172836, + "learning_rate": 3e-06, + "loss": 0.001, + "reward": 0.6250000298023224, + "rewards/countdown_reward_func": 0.6250000298023224, + "step": 1405 + }, + { + "epoch": 0.023420452084686755, + "grad_norm": 0.05046786367893219, + "learning_rate": 3e-06, + "loss": 0.0081, + "step": 1406 + }, + { + "epoch": 0.02343710958972565, + "grad_norm": 0.058003056794404984, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 1407 + }, + { + "epoch": 0.023453767094764547, + "grad_norm": 0.05249866470694542, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 1408 + }, + { + "completion_length": 76.87500381469727, + "epoch": 0.023470424599803443, + "grad_norm": 0.09185436367988586, + "learning_rate": 3e-06, + "loss": 0.018, + "reward": 0.5968749821186066, + "rewards/countdown_reward_func": 0.5968749821186066, + "step": 1409 + }, + { + "epoch": 0.023487082104842336, + "grad_norm": 0.09062211215496063, + "learning_rate": 3e-06, + "loss": -0.0164, + "step": 1410 + }, + { + "epoch": 0.02350373960988123, + "grad_norm": 0.07713036239147186, + "learning_rate": 3e-06, + "loss": 0.018, + "step": 1411 + }, + { + "epoch": 0.023520397114920127, + "grad_norm": 0.09732808917760849, + "learning_rate": 3e-06, + "loss": -0.0165, + "step": 1412 + }, + { + "completion_length": 77.37500381469727, + "epoch": 0.023537054619959023, + "grad_norm": 0.071933813393116, + "learning_rate": 3e-06, + "loss": 0.0064, + "reward": 0.6062500476837158, + "rewards/countdown_reward_func": 0.6062500476837158, + "step": 1413 + }, + { + "epoch": 0.02355371212499792, + "grad_norm": 0.07997026294469833, + "learning_rate": 3e-06, + "loss": 0.0008, + "step": 1414 + }, + { + "epoch": 0.02357036963003681, + "grad_norm": 0.08994881063699722, + "learning_rate": 3e-06, + "loss": 0.006, + "step": 1415 + }, + { + "epoch": 0.023587027135075708, + "grad_norm": 0.07706700265407562, + "learning_rate": 3e-06, + "loss": 0.0004, + "step": 1416 + }, + { + "completion_length": 77.33333587646484, + "epoch": 0.023603684640114603, + "grad_norm": 0.06306517869234085, + "learning_rate": 3e-06, + "loss": 0.0129, + "reward": 0.8031250536441803, + "rewards/countdown_reward_func": 0.8031250238418579, + "step": 1417 + }, + { + "epoch": 0.0236203421451535, + "grad_norm": 0.11881524324417114, + "learning_rate": 3e-06, + "loss": -0.0078, + "step": 1418 + }, + { + "epoch": 0.023636999650192395, + "grad_norm": 0.07218842953443527, + "learning_rate": 3e-06, + "loss": 0.0126, + "step": 1419 + }, + { + "epoch": 0.02365365715523129, + "grad_norm": 0.12303237617015839, + "learning_rate": 3e-06, + "loss": -0.0083, + "step": 1420 + }, + { + "completion_length": 76.66667175292969, + "epoch": 0.023670314660270184, + "grad_norm": 0.07066743075847626, + "learning_rate": 3e-06, + "loss": 0.0084, + "reward": 0.793749988079071, + "rewards/countdown_reward_func": 0.7937499582767487, + "step": 1421 + }, + { + "epoch": 0.02368697216530908, + "grad_norm": 0.04281092807650566, + "learning_rate": 3e-06, + "loss": 0.0147, + "step": 1422 + }, + { + "epoch": 0.023703629670347975, + "grad_norm": 0.07535140216350555, + "learning_rate": 3e-06, + "loss": 0.0082, + "step": 1423 + }, + { + "epoch": 0.02372028717538687, + "grad_norm": 0.04259409010410309, + "learning_rate": 3e-06, + "loss": 0.0145, + "step": 1424 + }, + { + "completion_length": 77.18750381469727, + "epoch": 0.023736944680425767, + "grad_norm": 4.4463703829933365e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.4937500059604645, + "rewards/countdown_reward_func": 0.4937500059604645, + "step": 1425 + }, + { + "epoch": 0.02375360218546466, + "grad_norm": 3.865797459923215e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1426 + }, + { + "epoch": 0.023770259690503556, + "grad_norm": 4.519241425526843e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1427 + }, + { + "epoch": 0.02378691719554245, + "grad_norm": 3.9147960428920214e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1428 + }, + { + "completion_length": 76.70833587646484, + "epoch": 0.023803574700581347, + "grad_norm": 0.20493681728839874, + "learning_rate": 3e-06, + "loss": 0.0142, + "reward": 0.5125000178813934, + "rewards/countdown_reward_func": 0.5125000029802322, + "step": 1429 + }, + { + "epoch": 0.023820232205620243, + "grad_norm": 0.07043073326349258, + "learning_rate": 3e-06, + "loss": 0.0036, + "step": 1430 + }, + { + "epoch": 0.02383688971065914, + "grad_norm": 0.06236785650253296, + "learning_rate": 3e-06, + "loss": 0.014, + "step": 1431 + }, + { + "epoch": 0.02385354721569803, + "grad_norm": 0.07170114666223526, + "learning_rate": 3e-06, + "loss": 0.0031, + "step": 1432 + }, + { + "completion_length": 76.71875, + "epoch": 0.023870204720736928, + "grad_norm": 0.022193752229213715, + "learning_rate": 3e-06, + "loss": 0.0003, + "reward": 0.6531250476837158, + "rewards/countdown_reward_func": 0.653124988079071, + "step": 1433 + }, + { + "epoch": 0.023886862225775823, + "grad_norm": 0.031683772802352905, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 1434 + }, + { + "epoch": 0.02390351973081472, + "grad_norm": 0.02229478769004345, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 1435 + }, + { + "epoch": 0.023920177235853615, + "grad_norm": 0.02944616600871086, + "learning_rate": 3e-06, + "loss": 0.0019, + "step": 1436 + }, + { + "completion_length": 77.3125, + "epoch": 0.023936834740892508, + "grad_norm": 0.07184036821126938, + "learning_rate": 3e-06, + "loss": 0.0143, + "reward": 0.4937500059604645, + "rewards/countdown_reward_func": 0.4937500059604645, + "step": 1437 + }, + { + "epoch": 0.023953492245931404, + "grad_norm": 0.0702061727643013, + "learning_rate": 3e-06, + "loss": -0.0053, + "step": 1438 + }, + { + "epoch": 0.0239701497509703, + "grad_norm": 0.07558220624923706, + "learning_rate": 3e-06, + "loss": 0.014, + "step": 1439 + }, + { + "epoch": 0.023986807256009195, + "grad_norm": 0.07767302542924881, + "learning_rate": 3e-06, + "loss": -0.0056, + "step": 1440 + }, + { + "completion_length": 77.10416793823242, + "epoch": 0.02400346476104809, + "grad_norm": 0.039056446403265, + "learning_rate": 3e-06, + "loss": 0.0031, + "reward": 0.671875, + "rewards/countdown_reward_func": 0.671875, + "step": 1441 + }, + { + "epoch": 0.024020122266086987, + "grad_norm": 0.03566322475671768, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1442 + }, + { + "epoch": 0.02403677977112588, + "grad_norm": 0.037761762738227844, + "learning_rate": 3e-06, + "loss": 0.003, + "step": 1443 + }, + { + "epoch": 0.024053437276164776, + "grad_norm": 0.03775296360254288, + "learning_rate": 3e-06, + "loss": 0.0042, + "step": 1444 + }, + { + "completion_length": 77.73958587646484, + "epoch": 0.02407009478120367, + "grad_norm": 0.08504849672317505, + "learning_rate": 3e-06, + "loss": 0.006, + "reward": 0.6156250536441803, + "rewards/countdown_reward_func": 0.6156250238418579, + "step": 1445 + }, + { + "epoch": 0.024086752286242567, + "grad_norm": 0.10846791416406631, + "learning_rate": 3e-06, + "loss": 0.01, + "step": 1446 + }, + { + "epoch": 0.024103409791281463, + "grad_norm": 0.08546160161495209, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1447 + }, + { + "epoch": 0.024120067296320356, + "grad_norm": 0.1139037013053894, + "learning_rate": 3e-06, + "loss": 0.0095, + "step": 1448 + }, + { + "completion_length": 77.43750381469727, + "epoch": 0.02413672480135925, + "grad_norm": 2.3413193606103277e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.831250011920929, + "rewards/countdown_reward_func": 0.8312499821186066, + "step": 1449 + }, + { + "epoch": 0.024153382306398147, + "grad_norm": 2.2755617390401994e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1450 + }, + { + "epoch": 0.024170039811437043, + "grad_norm": 2.180402525198133e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1451 + }, + { + "epoch": 0.02418669731647594, + "grad_norm": 2.3729518350279477e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1452 + }, + { + "completion_length": 77.28125381469727, + "epoch": 0.024203354821514835, + "grad_norm": 0.06985422968864441, + "learning_rate": 3e-06, + "loss": -0.0023, + "reward": 0.6531250178813934, + "rewards/countdown_reward_func": 0.653124988079071, + "step": 1453 + }, + { + "epoch": 0.024220012326553728, + "grad_norm": 0.04612107574939728, + "learning_rate": 3e-06, + "loss": -0.0065, + "step": 1454 + }, + { + "epoch": 0.024236669831592624, + "grad_norm": 0.06246389076113701, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 1455 + }, + { + "epoch": 0.02425332733663152, + "grad_norm": 0.05120018869638443, + "learning_rate": 3e-06, + "loss": -0.0066, + "step": 1456 + }, + { + "completion_length": 76.76041793823242, + "epoch": 0.024269984841670415, + "grad_norm": 0.0391736701130867, + "learning_rate": 3e-06, + "loss": 0.004, + "reward": 0.6531250178813934, + "rewards/countdown_reward_func": 0.6531250178813934, + "step": 1457 + }, + { + "epoch": 0.02428664234670931, + "grad_norm": 0.034990228712558746, + "learning_rate": 3e-06, + "loss": -0.0027, + "step": 1458 + }, + { + "epoch": 0.024303299851748204, + "grad_norm": 0.038876038044691086, + "learning_rate": 3e-06, + "loss": 0.0039, + "step": 1459 + }, + { + "epoch": 0.0243199573567871, + "grad_norm": 0.03430899605154991, + "learning_rate": 3e-06, + "loss": -0.0028, + "step": 1460 + }, + { + "completion_length": 76.69791793823242, + "epoch": 0.024336614861825995, + "grad_norm": 0.027321478351950645, + "learning_rate": 3e-06, + "loss": -0.003, + "reward": 0.5406250357627869, + "rewards/countdown_reward_func": 0.5406250059604645, + "step": 1461 + }, + { + "epoch": 0.02435327236686489, + "grad_norm": 0.01594717614352703, + "learning_rate": 3e-06, + "loss": 0.0007, + "step": 1462 + }, + { + "epoch": 0.024369929871903787, + "grad_norm": 0.026387548074126244, + "learning_rate": 3e-06, + "loss": -0.0032, + "step": 1463 + }, + { + "epoch": 0.024386587376942683, + "grad_norm": 0.015395723283290863, + "learning_rate": 3e-06, + "loss": 0.0006, + "step": 1464 + }, + { + "completion_length": 77.29166793823242, + "epoch": 0.024403244881981576, + "grad_norm": 0.0691758543252945, + "learning_rate": 3e-06, + "loss": -0.0014, + "reward": 0.6531250178813934, + "rewards/countdown_reward_func": 0.6531250178813934, + "step": 1465 + }, + { + "epoch": 0.02441990238702047, + "grad_norm": 0.08399573713541031, + "learning_rate": 3e-06, + "loss": 0.0164, + "step": 1466 + }, + { + "epoch": 0.024436559892059367, + "grad_norm": 0.0764850601553917, + "learning_rate": 3e-06, + "loss": -0.0014, + "step": 1467 + }, + { + "epoch": 0.024453217397098263, + "grad_norm": 0.08421406149864197, + "learning_rate": 3e-06, + "loss": 0.0163, + "step": 1468 + }, + { + "completion_length": 77.37500381469727, + "epoch": 0.02446987490213716, + "grad_norm": 5.113168111137156e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6624999642372131, + "step": 1469 + }, + { + "epoch": 0.02448653240717605, + "grad_norm": 5.6317248642301365e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1470 + }, + { + "epoch": 0.024503189912214948, + "grad_norm": 5.122125390499832e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1471 + }, + { + "epoch": 0.024519847417253843, + "grad_norm": 5.552292847710305e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1472 + }, + { + "completion_length": 77.72916793823242, + "epoch": 0.02453650492229274, + "grad_norm": 7.276875102490976e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.49375002086162567, + "rewards/countdown_reward_func": 0.4937499910593033, + "step": 1473 + }, + { + "epoch": 0.024553162427331635, + "grad_norm": 9.50188194792645e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1474 + }, + { + "epoch": 0.02456981993237053, + "grad_norm": 7.253067479950914e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1475 + }, + { + "epoch": 0.024586477437409424, + "grad_norm": 9.373644971333306e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1476 + }, + { + "completion_length": 76.70833587646484, + "epoch": 0.02460313494244832, + "grad_norm": 6.9692642767904545e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "reward": 0.4937500059604645, + "rewards/countdown_reward_func": 0.4937500059604645, + "step": 1477 + }, + { + "epoch": 0.024619792447487215, + "grad_norm": 6.245739481158807e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1478 + }, + { + "epoch": 0.02463644995252611, + "grad_norm": 6.883124292755838e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1479 + }, + { + "epoch": 0.024653107457565007, + "grad_norm": 6.009997832734371e-09, + "learning_rate": 3e-06, + "loss": -0.0, + "step": 1480 + }, + { + "completion_length": 77.35417175292969, + "epoch": 0.0246697649626039, + "grad_norm": 0.06656426191329956, + "learning_rate": 3e-06, + "loss": -0.0057, + "reward": 0.6812500059604645, + "rewards/countdown_reward_func": 0.6812499761581421, + "step": 1481 + }, + { + "epoch": 0.024686422467642796, + "grad_norm": 0.10670114308595657, + "learning_rate": 3e-06, + "loss": 0.0253, + "step": 1482 + }, + { + "epoch": 0.02470307997268169, + "grad_norm": 0.06477946788072586, + "learning_rate": 3e-06, + "loss": -0.0058, + "step": 1483 + }, + { + "epoch": 0.024719737477720587, + "grad_norm": 0.10548369586467743, + "learning_rate": 3e-06, + "loss": 0.0247, + "step": 1484 + }, + { + "completion_length": 77.32291793823242, + "epoch": 0.024736394982759483, + "grad_norm": 0.0646500289440155, + "learning_rate": 3e-06, + "loss": 0.0004, + "reward": 0.3531250059604645, + "rewards/countdown_reward_func": 0.3531250059604645, + "step": 1485 + }, + { + "epoch": 0.02475305248779838, + "grad_norm": 0.07308440655469894, + "learning_rate": 3e-06, + "loss": 0.012, + "step": 1486 + }, + { + "epoch": 0.02476970999283727, + "grad_norm": 0.061079997569322586, + "learning_rate": 3e-06, + "loss": 0.0003, + "step": 1487 + }, + { + "epoch": 0.024786367497876167, + "grad_norm": 0.0690690204501152, + "learning_rate": 3e-06, + "loss": 0.0118, + "step": 1488 + }, + { + "completion_length": 77.5, + "epoch": 0.024803025002915063, + "grad_norm": 0.028555873781442642, + "learning_rate": 3e-06, + "loss": 0.0029, + "reward": 0.5958333313465118, + "rewards/countdown_reward_func": 0.5958333313465118, + "step": 1489 + }, + { + "epoch": 0.02481968250795396, + "grad_norm": 0.028642740100622177, + "learning_rate": 3e-06, + "loss": 0.0079, + "step": 1490 + }, + { + "epoch": 0.024836340012992855, + "grad_norm": 0.026428161188960075, + "learning_rate": 3e-06, + "loss": 0.0029, + "step": 1491 + }, + { + "epoch": 0.024852997518031748, + "grad_norm": 0.02766229584813118, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 1492 + }, + { + "completion_length": 77.02083587646484, + "epoch": 0.024869655023070644, + "grad_norm": 0.07397337257862091, + "learning_rate": 3e-06, + "loss": 0.013, + "reward": 0.6625000238418579, + "rewards/countdown_reward_func": 0.6624999642372131, + "step": 1493 + }, + { + "epoch": 0.02488631252810954, + "grad_norm": 0.05898324027657509, + "learning_rate": 3e-06, + "loss": -0.0019, + "step": 1494 + }, + { + "epoch": 0.024902970033148435, + "grad_norm": 0.08079120516777039, + "learning_rate": 3e-06, + "loss": 0.0128, + "step": 1495 + }, + { + "epoch": 0.02491962753818733, + "grad_norm": 0.05847210809588432, + "learning_rate": 3e-06, + "loss": -0.0023, + "step": 1496 + }, + { + "completion_length": 76.58333587646484, + "epoch": 0.024936285043226227, + "grad_norm": 0.039157431572675705, + "learning_rate": 3e-06, + "loss": 0.0059, + "reward": 0.643750011920929, + "rewards/countdown_reward_func": 0.643750011920929, + "step": 1497 + }, + { + "epoch": 0.02495294254826512, + "grad_norm": 0.04516930505633354, + "learning_rate": 3e-06, + "loss": 0.0024, + "step": 1498 + }, + { + "epoch": 0.024969600053304015, + "grad_norm": 0.03887012600898743, + "learning_rate": 3e-06, + "loss": 0.0057, + "step": 1499 + }, + { + "epoch": 0.02498625755834291, + "grad_norm": 0.04461653530597687, + "learning_rate": 3e-06, + "loss": 0.0021, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 180099, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}