diff --git "a/contentstyle_modeling/posttrained/trainer_state.json" "b/contentstyle_modeling/posttrained/trainer_state.json" new file mode 100644--- /dev/null +++ "b/contentstyle_modeling/posttrained/trainer_state.json" @@ -0,0 +1,18933 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.769172932330827, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantages": 2.118448499288661e-07, + "advantages_std": 1.497294020652771, + "clip_ratio": 0.0, + "completion_length": 86.44166946411133, + "epoch": 0.007518796992481203, + "grad_norm": 6.75, + "kl": 0.02395845539867878, + "learning_rate": 4.996240601503759e-06, + "loss": 0.0064, + "num_tokens": 303965.0, + "reward": -2.5252880096435546, + "reward_std": 7.708663702011108, + "rewards/get_chromagram_reward": 0.6135738074779511, + "rewards/get_chromagram_reward_std": 0.12009836137294769, + "rewards/get_intelligibility_reward": -8.167745923995971, + "rewards/get_intelligibility_reward_std": 11.17201633453369, + "rewards/get_target_len_reward": -0.021691455319523812, + "rewards/get_target_len_reward_std": 0.06159961894154549, + "step": 10 + }, + { + "advantages": -7.053215966834614e-08, + "advantages_std": 1.5645875334739685, + "clip_ratio": 0.0, + "completion_length": 88.32143020629883, + "epoch": 0.015037593984962405, + "grad_norm": 7.09375, + "kl": 0.08431399557739497, + "learning_rate": 4.992481203007519e-06, + "loss": 0.0118, + "num_tokens": 614856.0, + "reward": -1.9682563066482544, + "reward_std": 7.121526718139648, + "rewards/get_chromagram_reward": 0.6209080219268799, + "rewards/get_chromagram_reward_std": 0.11176137179136277, + "rewards/get_intelligibility_reward": -6.507082128524781, + "rewards/get_intelligibility_reward_std": 10.901963329315185, + "rewards/get_target_len_reward": -0.018594418559223412, + "rewards/get_target_len_reward_std": 0.05279657319188118, + "step": 20 + }, + { + "advantages": -1.517434867537304e-07, + "advantages_std": 1.6699465274810792, + "clip_ratio": 0.0, + "completion_length": 87.67381134033204, + "epoch": 0.022556390977443608, + "grad_norm": 11.1875, + "kl": 0.05091430209577084, + "learning_rate": 4.9887218045112785e-06, + "loss": 0.0063, + "num_tokens": 924413.0, + "reward": -1.7607180513441563, + "reward_std": 6.793271017074585, + "rewards/get_chromagram_reward": 0.6260063648223877, + "rewards/get_chromagram_reward_std": 0.1159092791378498, + "rewards/get_intelligibility_reward": -5.883437991142273, + "rewards/get_intelligibility_reward_std": 10.413993167877198, + "rewards/get_target_len_reward": -0.024722003471106292, + "rewards/get_target_len_reward_std": 0.06130978129804134, + "step": 30 + }, + { + "advantages": 4.1971603508272893e-07, + "advantages_std": 1.6840260982513429, + "clip_ratio": 0.0, + "completion_length": 86.18273849487305, + "epoch": 0.03007518796992481, + "grad_norm": 9.0625, + "kl": 0.08079373240470886, + "learning_rate": 4.984962406015038e-06, + "loss": 0.0128, + "num_tokens": 1229476.0, + "reward": -1.8878801107406615, + "reward_std": 7.23775429725647, + "rewards/get_chromagram_reward": 0.6211916923522949, + "rewards/get_chromagram_reward_std": 0.11408754736185074, + "rewards/get_intelligibility_reward": -6.262418246269226, + "rewards/get_intelligibility_reward_std": 11.161653137207031, + "rewards/get_target_len_reward": -0.022413293924182655, + "rewards/get_target_len_reward_std": 0.06784052737057208, + "step": 40 + }, + { + "advantages": -1.251697586468481e-07, + "advantages_std": 1.4488242149353028, + "clip_ratio": 0.0, + "completion_length": 90.10297927856445, + "epoch": 0.03759398496240601, + "grad_norm": 6.875, + "kl": 0.09215598180890083, + "learning_rate": 4.981203007518797e-06, + "loss": 0.0095, + "num_tokens": 1544823.0, + "reward": -1.7895142793655396, + "reward_std": 6.443612480163575, + "rewards/get_chromagram_reward": 0.6228613257408142, + "rewards/get_chromagram_reward_std": 0.10370689854025841, + "rewards/get_intelligibility_reward": -5.974599933624267, + "rewards/get_intelligibility_reward_std": 9.874517250061036, + "rewards/get_target_len_reward": -0.016803824808448553, + "rewards/get_target_len_reward_std": 0.045621754601597786, + "step": 50 + }, + { + "advantages": 5.180637305812752e-07, + "advantages_std": 1.553832471370697, + "clip_ratio": 0.0, + "completion_length": 87.78928756713867, + "epoch": 0.045112781954887216, + "grad_norm": 6.5, + "kl": 0.12433034032583237, + "learning_rate": 4.977443609022557e-06, + "loss": 0.0151, + "num_tokens": 1853259.0, + "reward": -1.7618976533412933, + "reward_std": 7.031143283843994, + "rewards/get_chromagram_reward": 0.6234481394290924, + "rewards/get_chromagram_reward_std": 0.12234157994389534, + "rewards/get_intelligibility_reward": -5.885248851776123, + "rewards/get_intelligibility_reward_std": 11.020007610321045, + "rewards/get_target_len_reward": -0.023891913425177335, + "rewards/get_target_len_reward_std": 0.06750189382582902, + "step": 60 + }, + { + "advantages": -1.1424223771427932e-07, + "advantages_std": 1.6184780716896057, + "clip_ratio": 0.0, + "completion_length": 87.79166793823242, + "epoch": 0.05263157894736842, + "grad_norm": 7.96875, + "kl": 0.18404756337404252, + "learning_rate": 4.973684210526316e-06, + "loss": 0.0199, + "num_tokens": 2161781.0, + "reward": -1.8432800233364106, + "reward_std": 7.268847322463989, + "rewards/get_chromagram_reward": 0.6163238942623138, + "rewards/get_chromagram_reward_std": 0.11489329114556313, + "rewards/get_intelligibility_reward": -6.128581380844116, + "rewards/get_intelligibility_reward_std": 11.405856323242187, + "rewards/get_target_len_reward": -0.017582453973591327, + "rewards/get_target_len_reward_std": 0.046123570390045644, + "step": 70 + }, + { + "advantages": 4.713734142569592e-07, + "advantages_std": 1.726417601108551, + "clip_ratio": 0.0, + "completion_length": 87.27023849487304, + "epoch": 0.06015037593984962, + "grad_norm": 16.25, + "kl": 0.41561629995703697, + "learning_rate": 4.969924812030076e-06, + "loss": 0.0458, + "num_tokens": 2469378.0, + "reward": -2.222189700603485, + "reward_std": 8.024344348907471, + "rewards/get_chromagram_reward": 0.6164293169975281, + "rewards/get_chromagram_reward_std": 0.11355845630168915, + "rewards/get_intelligibility_reward": -7.259270143508911, + "rewards/get_intelligibility_reward_std": 12.41039228439331, + "rewards/get_target_len_reward": -0.02372810449451208, + "rewards/get_target_len_reward_std": 0.06544858254492283, + "step": 80 + }, + { + "advantages": -3.9115550976021043e-08, + "advantages_std": 1.5857127904891968, + "clip_ratio": 0.0, + "completion_length": 83.47380981445312, + "epoch": 0.06766917293233082, + "grad_norm": 5.9375, + "kl": 0.38582088649272916, + "learning_rate": 4.966165413533835e-06, + "loss": 0.0418, + "num_tokens": 2767074.0, + "reward": -2.1240372538566588, + "reward_std": 7.091709327697754, + "rewards/get_chromagram_reward": 0.6215401530265808, + "rewards/get_chromagram_reward_std": 0.1213473655283451, + "rewards/get_intelligibility_reward": -6.968455457687378, + "rewards/get_intelligibility_reward_std": 10.730904293060302, + "rewards/get_target_len_reward": -0.025196005031466483, + "rewards/get_target_len_reward_std": 0.06093333307653666, + "step": 90 + }, + { + "advantages": -1.0828177181565479e-07, + "advantages_std": 1.5523891210556031, + "clip_ratio": 0.0, + "completion_length": 86.3976203918457, + "epoch": 0.07518796992481203, + "grad_norm": 7.84375, + "kl": 0.1602418266236782, + "learning_rate": 4.962406015037594e-06, + "loss": 0.0229, + "num_tokens": 3072540.0, + "reward": -2.1356523513793944, + "reward_std": 7.14356107711792, + "rewards/get_chromagram_reward": 0.6146622836589813, + "rewards/get_chromagram_reward_std": 0.11833304241299629, + "rewards/get_intelligibility_reward": -6.999609637260437, + "rewards/get_intelligibility_reward_std": 10.735732078552246, + "rewards/get_target_len_reward": -0.02200925387442112, + "rewards/get_target_len_reward_std": 0.06754342392086983, + "step": 100 + }, + { + "advantages": -1.3721486098461354e-07, + "advantages_std": 1.5168035984039308, + "clip_ratio": 0.0, + "completion_length": 87.88928680419922, + "epoch": 0.08270676691729323, + "grad_norm": 8.0625, + "kl": 0.21410228833556175, + "learning_rate": 4.958646616541354e-06, + "loss": 0.0238, + "num_tokens": 3381491.0, + "reward": -1.6413340508937835, + "reward_std": 6.78236951828003, + "rewards/get_chromagram_reward": 0.618117380142212, + "rewards/get_chromagram_reward_std": 0.11681264266371727, + "rewards/get_intelligibility_reward": -5.526298701763153, + "rewards/get_intelligibility_reward_std": 10.602624130249023, + "rewards/get_target_len_reward": -0.015820635203272104, + "rewards/get_target_len_reward_std": 0.045049076154828074, + "step": 110 + }, + { + "advantages": -2.128382519117622e-07, + "advantages_std": 1.590258038043976, + "clip_ratio": 0.0, + "completion_length": 88.00714492797852, + "epoch": 0.09022556390977443, + "grad_norm": 5.34375, + "kl": 0.1765545964241028, + "learning_rate": 4.954887218045113e-06, + "loss": 0.0232, + "num_tokens": 3691706.0, + "reward": -1.7672514021396637, + "reward_std": 6.887362623214722, + "rewards/get_chromagram_reward": 0.6296150028705597, + "rewards/get_chromagram_reward_std": 0.10867820680141449, + "rewards/get_intelligibility_reward": -5.908388066291809, + "rewards/get_intelligibility_reward_std": 10.689667415618896, + "rewards/get_target_len_reward": -0.022980824299156665, + "rewards/get_target_len_reward_std": 0.06642410941421986, + "step": 120 + }, + { + "advantages": -1.5621384186204068e-07, + "advantages_std": 1.4537077307701112, + "clip_ratio": 0.0, + "completion_length": 87.18214492797851, + "epoch": 0.09774436090225563, + "grad_norm": 7.40625, + "kl": 0.22367219924926757, + "learning_rate": 4.951127819548872e-06, + "loss": 0.0251, + "num_tokens": 3998700.0, + "reward": -2.014804148674011, + "reward_std": 6.904703187942505, + "rewards/get_chromagram_reward": 0.616754686832428, + "rewards/get_chromagram_reward_std": 0.11644136309623718, + "rewards/get_intelligibility_reward": -6.640150189399719, + "rewards/get_intelligibility_reward_std": 10.418735027313232, + "rewards/get_target_len_reward": -0.021016582287847994, + "rewards/get_target_len_reward_std": 0.061346288211643694, + "step": 130 + }, + { + "advantages": -8.443991836415421e-07, + "advantages_std": 1.5730493187904357, + "clip_ratio": 0.0, + "completion_length": 84.52380981445313, + "epoch": 0.10526315789473684, + "grad_norm": 8.125, + "kl": 0.3281571701169014, + "learning_rate": 4.947368421052632e-06, + "loss": 0.0385, + "num_tokens": 4298262.0, + "reward": -1.8539611220359802, + "reward_std": 6.75795578956604, + "rewards/get_chromagram_reward": 0.6141342937946319, + "rewards/get_chromagram_reward_std": 0.11726542636752128, + "rewards/get_intelligibility_reward": -6.155655121803283, + "rewards/get_intelligibility_reward_std": 10.387967538833617, + "rewards/get_target_len_reward": -0.02036202410236001, + "rewards/get_target_len_reward_std": 0.05991331338882446, + "step": 140 + }, + { + "advantages": 8.443992953743873e-08, + "advantages_std": 1.4925345063209534, + "clip_ratio": 0.0, + "completion_length": 89.01964416503907, + "epoch": 0.11278195488721804, + "grad_norm": 16.5, + "kl": 0.2456020161509514, + "learning_rate": 4.943609022556392e-06, + "loss": 0.0272, + "num_tokens": 4611748.0, + "reward": -1.597898268699646, + "reward_std": 6.498913764953613, + "rewards/get_chromagram_reward": 0.6427377462387085, + "rewards/get_chromagram_reward_std": 0.11222934648394585, + "rewards/get_intelligibility_reward": -5.41141984462738, + "rewards/get_intelligibility_reward_std": 10.233173274993897, + "rewards/get_target_len_reward": -0.025012334156781436, + "rewards/get_target_len_reward_std": 0.05679422654211521, + "step": 150 + }, + { + "advantages": -1.0927518871994834e-07, + "advantages_std": 1.5491583943367004, + "clip_ratio": 0.0, + "completion_length": 86.72500152587891, + "epoch": 0.12030075187969924, + "grad_norm": 7.3125, + "kl": 0.19152849316596984, + "learning_rate": 4.9398496240601505e-06, + "loss": 0.0232, + "num_tokens": 4917996.0, + "reward": -2.06285679936409, + "reward_std": 7.055027866363526, + "rewards/get_chromagram_reward": 0.6176262974739075, + "rewards/get_chromagram_reward_std": 0.11738575920462609, + "rewards/get_intelligibility_reward": -6.784622621536255, + "rewards/get_intelligibility_reward_std": 10.655733203887939, + "rewards/get_target_len_reward": -0.021573868487030266, + "rewards/get_target_len_reward_std": 0.061045969277620314, + "step": 160 + }, + { + "advantages": 1.5075007411269326e-07, + "advantages_std": 1.6391386032104491, + "clip_ratio": 0.0, + "completion_length": 86.58154907226563, + "epoch": 0.12781954887218044, + "grad_norm": 8.1875, + "kl": 0.216806098818779, + "learning_rate": 4.93609022556391e-06, + "loss": 0.0204, + "num_tokens": 5223687.0, + "reward": -1.6740633368492126, + "reward_std": 6.215735626220703, + "rewards/get_chromagram_reward": 0.6173975050449372, + "rewards/get_chromagram_reward_std": 0.11676538810133934, + "rewards/get_intelligibility_reward": -5.620945620536804, + "rewards/get_intelligibility_reward_std": 9.571168136596679, + "rewards/get_target_len_reward": -0.018641630932688714, + "rewards/get_target_len_reward_std": 0.04243884533643723, + "step": 170 + }, + { + "advantages": 3.563860104804917e-08, + "advantages_std": 1.605065941810608, + "clip_ratio": 0.0, + "completion_length": 85.777978515625, + "epoch": 0.13533834586466165, + "grad_norm": 7.1875, + "kl": 0.22296204417943954, + "learning_rate": 4.93233082706767e-06, + "loss": 0.0279, + "num_tokens": 5526129.0, + "reward": -1.9188735783100128, + "reward_std": 7.194452619552612, + "rewards/get_chromagram_reward": 0.6230684220790863, + "rewards/get_chromagram_reward_std": 0.10260485261678695, + "rewards/get_intelligibility_reward": -6.356639158725739, + "rewards/get_intelligibility_reward_std": 11.165844345092774, + "rewards/get_target_len_reward": -0.0230495841242373, + "rewards/get_target_len_reward_std": 0.06901184841990471, + "step": 180 + }, + { + "advantages": 5.2899112823467934e-08, + "advantages_std": 1.6138009309768677, + "clip_ratio": 0.0, + "completion_length": 84.4803596496582, + "epoch": 0.14285714285714285, + "grad_norm": 5.40625, + "kl": 0.4960071489214897, + "learning_rate": 4.928571428571429e-06, + "loss": 0.0515, + "num_tokens": 5826103.0, + "reward": -1.8541862666606903, + "reward_std": 6.528654289245606, + "rewards/get_chromagram_reward": 0.6299790501594543, + "rewards/get_chromagram_reward_std": 0.12313227728009224, + "rewards/get_intelligibility_reward": -6.169015526771545, + "rewards/get_intelligibility_reward_std": 9.88207130432129, + "rewards/get_target_len_reward": -0.02352191610261798, + "rewards/get_target_len_reward_std": 0.05509545002132654, + "step": 190 + }, + { + "advantages": -3.269563261909525e-07, + "advantages_std": 1.5400643467903137, + "clip_ratio": 0.0, + "completion_length": 87.34881057739258, + "epoch": 0.15037593984962405, + "grad_norm": 6.53125, + "kl": 0.23356708884239197, + "learning_rate": 4.924812030075188e-06, + "loss": 0.0275, + "num_tokens": 6134473.0, + "reward": -1.7080044865608215, + "reward_std": 6.700563287734985, + "rewards/get_chromagram_reward": 0.6138969004154206, + "rewards/get_chromagram_reward_std": 0.1239325612783432, + "rewards/get_intelligibility_reward": -5.716689348220825, + "rewards/get_intelligibility_reward_std": 10.459880065917968, + "rewards/get_target_len_reward": -0.021220722515136004, + "rewards/get_target_len_reward_std": 0.05877160653471947, + "step": 200 + }, + { + "advantages": -4.68889902549563e-07, + "advantages_std": 1.5320081472396851, + "clip_ratio": 0.0, + "completion_length": 87.94166793823243, + "epoch": 0.15789473684210525, + "grad_norm": 6.375, + "kl": 0.20200251489877702, + "learning_rate": 4.921052631578948e-06, + "loss": 0.0273, + "num_tokens": 6444424.0, + "reward": -1.5870545089244843, + "reward_std": 6.537412786483765, + "rewards/get_chromagram_reward": 0.6228235900402069, + "rewards/get_chromagram_reward_std": 0.11699960082769394, + "rewards/get_intelligibility_reward": -5.36222653388977, + "rewards/get_intelligibility_reward_std": 10.262313938140869, + "rewards/get_target_len_reward": -0.02176028909161687, + "rewards/get_target_len_reward_std": 0.06819032784551382, + "step": 210 + }, + { + "advantages": 1.400709237486808e-07, + "advantages_std": 1.5696740984916686, + "clip_ratio": 0.0, + "completion_length": 86.96012115478516, + "epoch": 0.16541353383458646, + "grad_norm": 7.84375, + "kl": 0.20887088924646377, + "learning_rate": 4.9172932330827075e-06, + "loss": 0.0224, + "num_tokens": 6751704.0, + "reward": -2.110741305351257, + "reward_std": 7.709843921661377, + "rewards/get_chromagram_reward": 0.6346747756004334, + "rewards/get_chromagram_reward_std": 0.1108613669872284, + "rewards/get_intelligibility_reward": -6.943783760070801, + "rewards/get_intelligibility_reward_std": 11.854671812057495, + "rewards/get_target_len_reward": -0.023114563897252083, + "rewards/get_target_len_reward_std": 0.06313695535063743, + "step": 220 + }, + { + "advantages": -1.0306637108215e-07, + "advantages_std": 1.4813214898109437, + "clip_ratio": 0.0, + "completion_length": 85.79404830932617, + "epoch": 0.17293233082706766, + "grad_norm": 6.03125, + "kl": 0.2533964037895203, + "learning_rate": 4.913533834586466e-06, + "loss": 0.0297, + "num_tokens": 7055744.0, + "reward": -1.7548074908554554, + "reward_std": 7.328680038452148, + "rewards/get_chromagram_reward": 0.6251808404922485, + "rewards/get_chromagram_reward_std": 0.11388902738690376, + "rewards/get_intelligibility_reward": -5.8636813282966616, + "rewards/get_intelligibility_reward_std": 11.45258846282959, + "rewards/get_target_len_reward": -0.02592161502689123, + "rewards/get_target_len_reward_std": 0.08212394453585148, + "step": 230 + }, + { + "advantages": -1.2392800385896408e-07, + "advantages_std": 1.5091304302215576, + "clip_ratio": 0.0, + "completion_length": 88.00178833007813, + "epoch": 0.18045112781954886, + "grad_norm": 5.375, + "kl": 0.2697433799505234, + "learning_rate": 4.909774436090226e-06, + "loss": 0.0281, + "num_tokens": 7365315.0, + "reward": -1.947976952791214, + "reward_std": 7.370342445373535, + "rewards/get_chromagram_reward": 0.6169031858444214, + "rewards/get_chromagram_reward_std": 0.10551710426807404, + "rewards/get_intelligibility_reward": -6.44406920671463, + "rewards/get_intelligibility_reward_std": 11.373730850219726, + "rewards/get_target_len_reward": -0.016764528863132, + "rewards/get_target_len_reward_std": 0.05088087841868401, + "step": 240 + }, + { + "advantages": 1.3088186108234367e-07, + "advantages_std": 1.4498517632484436, + "clip_ratio": 0.0, + "completion_length": 87.54881134033204, + "epoch": 0.18796992481203006, + "grad_norm": 15.0625, + "kl": 0.22136173099279405, + "learning_rate": 4.906015037593986e-06, + "loss": 0.033, + "num_tokens": 7672896.0, + "reward": -2.0891721487045287, + "reward_std": 7.358000087738037, + "rewards/get_chromagram_reward": 0.6265693724155426, + "rewards/get_chromagram_reward_std": 0.11023145914077759, + "rewards/get_intelligibility_reward": -6.870501017570495, + "rewards/get_intelligibility_reward_std": 11.197034549713134, + "rewards/get_target_len_reward": -0.02358451336622238, + "rewards/get_target_len_reward_std": 0.07190894670784473, + "step": 250 + }, + { + "advantages": 1.5969078219768563e-07, + "advantages_std": 1.628383994102478, + "clip_ratio": 0.0, + "completion_length": 86.22857284545898, + "epoch": 0.19548872180451127, + "grad_norm": 6.5625, + "kl": 0.24279214888811113, + "learning_rate": 4.902255639097745e-06, + "loss": 0.0225, + "num_tokens": 7978116.0, + "reward": -1.7696778357028962, + "reward_std": 7.20601077079773, + "rewards/get_chromagram_reward": 0.6273474216461181, + "rewards/get_chromagram_reward_std": 0.11232817322015762, + "rewards/get_intelligibility_reward": -5.913970136642456, + "rewards/get_intelligibility_reward_std": 11.329379558563232, + "rewards/get_target_len_reward": -0.02241056999191642, + "rewards/get_target_len_reward_std": 0.060182999819517136, + "step": 260 + }, + { + "advantages": 2.610186704998796e-07, + "advantages_std": 1.6137927174568176, + "clip_ratio": 0.0, + "completion_length": 86.38928756713867, + "epoch": 0.20300751879699247, + "grad_norm": 6.5, + "kl": 0.36207431107759475, + "learning_rate": 4.898496240601504e-06, + "loss": 0.038, + "num_tokens": 8283066.0, + "reward": -1.8501620173454285, + "reward_std": 6.904186582565307, + "rewards/get_chromagram_reward": 0.6266819715499878, + "rewards/get_chromagram_reward_std": 0.11233701780438424, + "rewards/get_intelligibility_reward": -6.15733824968338, + "rewards/get_intelligibility_reward_std": 10.550333213806152, + "rewards/get_target_len_reward": -0.019829432107508184, + "rewards/get_target_len_reward_std": 0.05274516306817532, + "step": 270 + }, + { + "advantages": 3.568828223166065e-07, + "advantages_std": 1.4528794765472413, + "clip_ratio": 0.0, + "completion_length": 84.30774002075195, + "epoch": 0.21052631578947367, + "grad_norm": 6.0625, + "kl": 0.23077704459428788, + "learning_rate": 4.894736842105264e-06, + "loss": 0.027, + "num_tokens": 8581555.0, + "reward": -2.233861434459686, + "reward_std": 7.116700315475464, + "rewards/get_chromagram_reward": 0.6151746988296509, + "rewards/get_chromagram_reward_std": 0.1124894380569458, + "rewards/get_intelligibility_reward": -7.2921109914779665, + "rewards/get_intelligibility_reward_std": 10.465060329437256, + "rewards/get_target_len_reward": -0.024647843185812236, + "rewards/get_target_len_reward_std": 0.06898632310330868, + "step": 280 + }, + { + "advantages": -2.6598572944180887e-07, + "advantages_std": 1.6519518733024596, + "clip_ratio": 0.0, + "completion_length": 85.73809585571288, + "epoch": 0.21804511278195488, + "grad_norm": 5.25, + "kl": 0.2224901869893074, + "learning_rate": 4.890977443609023e-06, + "loss": 0.0233, + "num_tokens": 8885105.0, + "reward": -1.7631924510002137, + "reward_std": 7.07942156791687, + "rewards/get_chromagram_reward": 0.6191496014595032, + "rewards/get_chromagram_reward_std": 0.10151097774505616, + "rewards/get_intelligibility_reward": -5.888655805587769, + "rewards/get_intelligibility_reward_std": 11.147787237167359, + "rewards/get_target_len_reward": -0.02007094845175743, + "rewards/get_target_len_reward_std": 0.054488342627882956, + "step": 290 + }, + { + "advantages": 4.023313522338867e-07, + "advantages_std": 1.5905011296272278, + "clip_ratio": 0.0, + "completion_length": 88.19524002075195, + "epoch": 0.22556390977443608, + "grad_norm": 13.75, + "kl": 0.25747594237327576, + "learning_rate": 4.887218045112782e-06, + "loss": 0.0286, + "num_tokens": 9194808.0, + "reward": -1.5700021982192993, + "reward_std": 6.991564178466797, + "rewards/get_chromagram_reward": 0.6341972947120667, + "rewards/get_chromagram_reward_std": 0.11429327800869941, + "rewards/get_intelligibility_reward": -5.320208358764648, + "rewards/get_intelligibility_reward_std": 11.084502124786377, + "rewards/get_target_len_reward": -0.023995132092386483, + "rewards/get_target_len_reward_std": 0.06535121817141772, + "step": 300 + }, + { + "advantages": -1.0132790144723458e-07, + "advantages_std": 1.4907500624656678, + "clip_ratio": 0.0, + "completion_length": 82.57321624755859, + "epoch": 0.23308270676691728, + "grad_norm": 6.84375, + "kl": 0.2589078933000565, + "learning_rate": 4.883458646616542e-06, + "loss": 0.0285, + "num_tokens": 9489624.0, + "reward": -2.0285483241081237, + "reward_std": 7.1084287643432615, + "rewards/get_chromagram_reward": 0.6235000729560852, + "rewards/get_chromagram_reward_std": 0.11943832337856293, + "rewards/get_intelligibility_reward": -6.6851557970046995, + "rewards/get_intelligibility_reward_std": 10.78179931640625, + "rewards/get_target_len_reward": -0.023988985922187567, + "rewards/get_target_len_reward_std": 0.06262606605887414, + "step": 310 + }, + { + "advantages": 2.3469328738201512e-07, + "advantages_std": 1.6342792153358459, + "clip_ratio": 0.0, + "completion_length": 88.48750228881836, + "epoch": 0.24060150375939848, + "grad_norm": 9.0, + "kl": 0.2543763667345047, + "learning_rate": 4.8796992481203006e-06, + "loss": 0.0294, + "num_tokens": 9800418.0, + "reward": -1.7057337164878845, + "reward_std": 6.9817795753479, + "rewards/get_chromagram_reward": 0.6268013715744019, + "rewards/get_chromagram_reward_std": 0.11317485049366952, + "rewards/get_intelligibility_reward": -5.723447632789612, + "rewards/get_intelligibility_reward_std": 11.025568771362305, + "rewards/get_target_len_reward": -0.02055465867742896, + "rewards/get_target_len_reward_std": 0.05412652175873518, + "step": 320 + }, + { + "advantages": 1.4081598394000138e-07, + "advantages_std": 1.5237973570823669, + "clip_ratio": 0.0, + "completion_length": 87.94702529907227, + "epoch": 0.24812030075187969, + "grad_norm": 6.28125, + "kl": 0.20852650851011276, + "learning_rate": 4.875939849624061e-06, + "loss": 0.0254, + "num_tokens": 10109643.0, + "reward": -1.5429876923561097, + "reward_std": 6.826085138320923, + "rewards/get_chromagram_reward": 0.6277043044567108, + "rewards/get_chromagram_reward_std": 0.12131512090563774, + "rewards/get_intelligibility_reward": -5.235681021213532, + "rewards/get_intelligibility_reward_std": 10.872391033172608, + "rewards/get_target_len_reward": -0.020986052136868237, + "rewards/get_target_len_reward_std": 0.06118348352611065, + "step": 330 + }, + { + "advantages": -4.728635323303365e-07, + "advantages_std": 1.60645192861557, + "clip_ratio": 0.0, + "completion_length": 81.48571548461913, + "epoch": 0.2556390977443609, + "grad_norm": 25.5, + "kl": 0.2564759775996208, + "learning_rate": 4.87218045112782e-06, + "loss": 0.029, + "num_tokens": 10401241.0, + "reward": -1.7328977763652802, + "reward_std": 6.528843545913697, + "rewards/get_chromagram_reward": 0.616835993528366, + "rewards/get_chromagram_reward_std": 0.10688713267445564, + "rewards/get_intelligibility_reward": -5.793001580238342, + "rewards/get_intelligibility_reward_std": 10.110911083221435, + "rewards/get_target_len_reward": -0.022527353093028068, + "rewards/get_target_len_reward_std": 0.06940292119979859, + "step": 340 + }, + { + "advantages": 1.7856558258699807e-07, + "advantages_std": 1.4808288097381592, + "clip_ratio": 0.0, + "completion_length": 87.45893096923828, + "epoch": 0.2631578947368421, + "grad_norm": 8.375, + "kl": 0.29431896060705187, + "learning_rate": 4.8684210526315795e-06, + "loss": 0.0392, + "num_tokens": 10708869.0, + "reward": -1.878147792816162, + "reward_std": 6.971077013015747, + "rewards/get_chromagram_reward": 0.6234214305877686, + "rewards/get_chromagram_reward_std": 0.10851850062608719, + "rewards/get_intelligibility_reward": -6.233998012542725, + "rewards/get_intelligibility_reward_std": 10.780960750579833, + "rewards/get_target_len_reward": -0.023866467643529177, + "rewards/get_target_len_reward_std": 0.06917856726795435, + "step": 350 + }, + { + "advantages": 2.4487575629450474e-07, + "advantages_std": 1.381740403175354, + "clip_ratio": 0.0, + "completion_length": 89.93631134033203, + "epoch": 0.2706766917293233, + "grad_norm": 7.375, + "kl": 0.32826483249664307, + "learning_rate": 4.864661654135338e-06, + "loss": 0.0364, + "num_tokens": 11023929.0, + "reward": -1.3972072571516037, + "reward_std": 7.187353134155273, + "rewards/get_chromagram_reward": 0.6259436666965484, + "rewards/get_chromagram_reward_std": 0.10643556043505668, + "rewards/get_intelligibility_reward": -4.795684731006622, + "rewards/get_intelligibility_reward_std": 11.673350143432618, + "rewards/get_target_len_reward": -0.021880417317152023, + "rewards/get_target_len_reward_std": 0.06496078819036484, + "step": 360 + }, + { + "advantages": 9.31322603037188e-08, + "advantages_std": 1.4747216343879699, + "clip_ratio": 0.0, + "completion_length": 86.92321548461913, + "epoch": 0.2781954887218045, + "grad_norm": 6.71875, + "kl": 0.26946457624435427, + "learning_rate": 4.860902255639098e-06, + "loss": 0.0315, + "num_tokens": 11330383.0, + "reward": -1.709691733121872, + "reward_std": 7.236200475692749, + "rewards/get_chromagram_reward": 0.6254827082157135, + "rewards/get_chromagram_reward_std": 0.11119709685444831, + "rewards/get_intelligibility_reward": -5.735943913459778, + "rewards/get_intelligibility_reward_std": 11.42770071029663, + "rewards/get_target_len_reward": -0.018613758590072395, + "rewards/get_target_len_reward_std": 0.05823171120136976, + "step": 370 + }, + { + "advantages": 2.741813730722242e-07, + "advantages_std": 1.5728921175003052, + "clip_ratio": 0.0, + "completion_length": 87.16845245361328, + "epoch": 0.2857142857142857, + "grad_norm": 8.6875, + "kl": 0.32719208896160124, + "learning_rate": 4.857142857142858e-06, + "loss": 0.0363, + "num_tokens": 11637826.0, + "reward": -1.6900139684788882, + "reward_std": 7.238736009597778, + "rewards/get_chromagram_reward": 0.6199962019920349, + "rewards/get_chromagram_reward_std": 0.12772160023450851, + "rewards/get_intelligibility_reward": -5.663939923048019, + "rewards/get_intelligibility_reward_std": 11.40099401473999, + "rewards/get_target_len_reward": -0.026098042167723177, + "rewards/get_target_len_reward_std": 0.0803416196256876, + "step": 380 + }, + { + "advantages": -4.66903049556322e-08, + "advantages_std": 1.6298266768455505, + "clip_ratio": 0.0, + "completion_length": 88.75774002075195, + "epoch": 0.2932330827067669, + "grad_norm": 4.78125, + "kl": 0.22528714388608934, + "learning_rate": 4.853383458646617e-06, + "loss": 0.0239, + "num_tokens": 11949639.0, + "reward": -1.7683505415916443, + "reward_std": 6.958176136016846, + "rewards/get_chromagram_reward": 0.627706092596054, + "rewards/get_chromagram_reward_std": 0.11281427592039109, + "rewards/get_intelligibility_reward": -5.915233945846557, + "rewards/get_intelligibility_reward_std": 10.804437732696533, + "rewards/get_target_len_reward": -0.017523423489183187, + "rewards/get_target_len_reward_std": 0.046101400069892405, + "step": 390 + }, + { + "advantages": 3.0547367302347084e-08, + "advantages_std": 1.5157369017601012, + "clip_ratio": 0.0, + "completion_length": 86.27678833007812, + "epoch": 0.3007518796992481, + "grad_norm": 6.125, + "kl": 0.2560649961233139, + "learning_rate": 4.849624060150376e-06, + "loss": 0.0237, + "num_tokens": 12255149.0, + "reward": -1.8036470532417297, + "reward_std": 7.095457553863525, + "rewards/get_chromagram_reward": 0.6217517971992492, + "rewards/get_chromagram_reward_std": 0.12013033628463746, + "rewards/get_intelligibility_reward": -6.012101840972901, + "rewards/get_intelligibility_reward_std": 10.966209888458252, + "rewards/get_target_len_reward": -0.02059096023440361, + "rewards/get_target_len_reward_std": 0.0436623141169548, + "step": 400 + }, + { + "advantages": 1.4603139817381816e-07, + "advantages_std": 1.5756688952445983, + "clip_ratio": 0.0, + "completion_length": 87.86666946411133, + "epoch": 0.3082706766917293, + "grad_norm": 7.71875, + "kl": 0.2444481447339058, + "learning_rate": 4.845864661654136e-06, + "loss": 0.0282, + "num_tokens": 12564269.0, + "reward": -1.677663379907608, + "reward_std": 6.837050580978394, + "rewards/get_chromagram_reward": 0.6197855114936829, + "rewards/get_chromagram_reward_std": 0.11356526985764503, + "rewards/get_intelligibility_reward": -5.63051826953888, + "rewards/get_intelligibility_reward_std": 10.768561553955077, + "rewards/get_target_len_reward": -0.022257220838218926, + "rewards/get_target_len_reward_std": 0.06741791926324367, + "step": 410 + }, + { + "advantages": 3.0659140577427023e-07, + "advantages_std": 1.4965197563171386, + "clip_ratio": 0.0, + "completion_length": 90.12321472167969, + "epoch": 0.3157894736842105, + "grad_norm": 17.75, + "kl": 0.23634643405675887, + "learning_rate": 4.842105263157895e-06, + "loss": 0.0247, + "num_tokens": 12880120.0, + "reward": -1.583085983991623, + "reward_std": 6.680713558197022, + "rewards/get_chromagram_reward": 0.6274874389171601, + "rewards/get_chromagram_reward_std": 0.11383199393749237, + "rewards/get_intelligibility_reward": -5.357515811920166, + "rewards/get_intelligibility_reward_std": 10.511133098602295, + "rewards/get_target_len_reward": -0.019229174684733154, + "rewards/get_target_len_reward_std": 0.05312262093648314, + "step": 420 + }, + { + "advantages": 1.2268624587363776e-07, + "advantages_std": 1.6010493755340576, + "clip_ratio": 0.0, + "completion_length": 88.22738265991211, + "epoch": 0.3233082706766917, + "grad_norm": 37.75, + "kl": 0.3049021452665329, + "learning_rate": 4.838345864661654e-06, + "loss": 0.0352, + "num_tokens": 13190566.0, + "reward": -1.7616869747638702, + "reward_std": 6.855478191375733, + "rewards/get_chromagram_reward": 0.6218553423881531, + "rewards/get_chromagram_reward_std": 0.11711084693670273, + "rewards/get_intelligibility_reward": -5.8811728954315186, + "rewards/get_intelligibility_reward_std": 10.717539930343628, + "rewards/get_target_len_reward": -0.02574317567050457, + "rewards/get_target_len_reward_std": 0.06924263034015894, + "step": 430 + }, + { + "advantages": -2.2165477844282578e-07, + "advantages_std": 1.6048481464385986, + "clip_ratio": 0.0, + "completion_length": 85.9375015258789, + "epoch": 0.3308270676691729, + "grad_norm": 6.90625, + "kl": 0.2963700398802757, + "learning_rate": 4.834586466165414e-06, + "loss": 0.0339, + "num_tokens": 13494215.0, + "reward": -1.6599295616149903, + "reward_std": 6.307693576812744, + "rewards/get_chromagram_reward": 0.6308047652244568, + "rewards/get_chromagram_reward_std": 0.11237408369779586, + "rewards/get_intelligibility_reward": -5.5879511594772335, + "rewards/get_intelligibility_reward_std": 9.72415108680725, + "rewards/get_target_len_reward": -0.022642039228230715, + "rewards/get_target_len_reward_std": 0.06010422967374325, + "step": 440 + }, + { + "advantages": -1.1374552357779067e-07, + "advantages_std": 1.632838749885559, + "clip_ratio": 0.0, + "completion_length": 84.73333435058593, + "epoch": 0.3383458646616541, + "grad_norm": 8.9375, + "kl": 0.350592827796936, + "learning_rate": 4.830827067669173e-06, + "loss": 0.0397, + "num_tokens": 13795055.0, + "reward": -1.6300417900085449, + "reward_std": 6.483615875244141, + "rewards/get_chromagram_reward": 0.6207803785800934, + "rewards/get_chromagram_reward_std": 0.11223144382238388, + "rewards/get_intelligibility_reward": -5.485245895385742, + "rewards/get_intelligibility_reward_std": 10.055084419250488, + "rewards/get_target_len_reward": -0.02565964898094535, + "rewards/get_target_len_reward_std": 0.07205168101936579, + "step": 450 + }, + { + "advantages": 3.3900142284437604e-07, + "advantages_std": 1.5068569421768188, + "clip_ratio": 0.0, + "completion_length": 91.00416717529296, + "epoch": 0.3458646616541353, + "grad_norm": 8.25, + "kl": 23.371902348101138, + "learning_rate": 4.827067669172933e-06, + "loss": 2.337, + "num_tokens": 14113145.0, + "reward": -1.2409809799864888, + "reward_std": 6.696157693862915, + "rewards/get_chromagram_reward": 0.6164524137973786, + "rewards/get_chromagram_reward_std": 0.10411357581615448, + "rewards/get_intelligibility_reward": -4.322750660777092, + "rewards/get_intelligibility_reward_std": 10.822736454010009, + "rewards/get_target_len_reward": -0.016644550208002328, + "rewards/get_target_len_reward_std": 0.04289772268384695, + "step": 460 + }, + { + "advantages": 2.8361877362215183e-07, + "advantages_std": 1.6239615321159362, + "clip_ratio": 0.0, + "completion_length": 86.97738342285156, + "epoch": 0.3533834586466165, + "grad_norm": 14.4375, + "kl": 0.31207115948200226, + "learning_rate": 4.823308270676692e-06, + "loss": 0.0404, + "num_tokens": 14420277.0, + "reward": -1.7174145102500915, + "reward_std": 7.22825779914856, + "rewards/get_chromagram_reward": 0.6044569492340088, + "rewards/get_chromagram_reward_std": 0.12300211787223816, + "rewards/get_intelligibility_reward": -5.731238055229187, + "rewards/get_intelligibility_reward_std": 11.418381404876708, + "rewards/get_target_len_reward": -0.025462107546627522, + "rewards/get_target_len_reward_std": 0.0857331132516265, + "step": 470 + }, + { + "advantages": 2.9876830467401303e-07, + "advantages_std": 1.5418254494667054, + "clip_ratio": 0.0, + "completion_length": 87.61726379394531, + "epoch": 0.3609022556390977, + "grad_norm": 7.78125, + "kl": 0.2414279818534851, + "learning_rate": 4.8195488721804515e-06, + "loss": 0.0237, + "num_tokens": 14729195.0, + "reward": -1.6702099859714508, + "reward_std": 6.8579872131347654, + "rewards/get_chromagram_reward": 0.632961118221283, + "rewards/get_chromagram_reward_std": 0.10743627920746804, + "rewards/get_intelligibility_reward": -5.624613666534424, + "rewards/get_intelligibility_reward_std": 10.713689804077148, + "rewards/get_target_len_reward": -0.0189772330224514, + "rewards/get_target_len_reward_std": 0.051045392826199534, + "step": 480 + }, + { + "advantages": -1.8800299841359447e-07, + "advantages_std": 1.564692234992981, + "clip_ratio": 0.0, + "completion_length": 88.64166870117188, + "epoch": 0.3684210526315789, + "grad_norm": 9.8125, + "kl": 0.26591150760650634, + "learning_rate": 4.815789473684211e-06, + "loss": 0.0295, + "num_tokens": 15041323.0, + "reward": -1.4515798807144165, + "reward_std": 6.639446020126343, + "rewards/get_chromagram_reward": 0.6145521402359009, + "rewards/get_chromagram_reward_std": 0.11701491698622704, + "rewards/get_intelligibility_reward": -4.948449277877808, + "rewards/get_intelligibility_reward_std": 10.588137817382812, + "rewards/get_target_len_reward": -0.020842281449586154, + "rewards/get_target_len_reward_std": 0.060225320421159266, + "step": 490 + }, + { + "advantages": -9.822350222066234e-08, + "advantages_std": 1.6208269357681275, + "clip_ratio": 0.0, + "completion_length": 87.24940719604493, + "epoch": 0.37593984962406013, + "grad_norm": 9.875, + "kl": 0.24478698670864105, + "learning_rate": 4.81203007518797e-06, + "loss": 0.0304, + "num_tokens": 15348400.0, + "reward": -1.7204331919550895, + "reward_std": 6.657948637008667, + "rewards/get_chromagram_reward": 0.6120970249176025, + "rewards/get_chromagram_reward_std": 0.10694977194070816, + "rewards/get_intelligibility_reward": -5.753721928596496, + "rewards/get_intelligibility_reward_std": 10.243351125717163, + "rewards/get_target_len_reward": -0.019674433302134274, + "rewards/get_target_len_reward_std": 0.07041770461946725, + "step": 500 + }, + { + "advantages": 2.849847236419123e-07, + "advantages_std": 1.589078712463379, + "clip_ratio": 0.0, + "completion_length": 83.6428596496582, + "epoch": 0.38345864661654133, + "grad_norm": 6.03125, + "kl": 0.26985139548778536, + "learning_rate": 4.80827067669173e-06, + "loss": 0.0268, + "num_tokens": 15646198.0, + "reward": -1.8068750977516175, + "reward_std": 6.750207424163818, + "rewards/get_chromagram_reward": 0.6317034482955932, + "rewards/get_chromagram_reward_std": 0.11371424272656441, + "rewards/get_intelligibility_reward": -6.03351776599884, + "rewards/get_intelligibility_reward_std": 10.352978706359863, + "rewards/get_target_len_reward": -0.018810535687953232, + "rewards/get_target_len_reward_std": 0.05019157826900482, + "step": 510 + }, + { + "advantages": -4.1474899603599624e-07, + "advantages_std": 1.5117250084877014, + "clip_ratio": 0.0, + "completion_length": 87.92976303100586, + "epoch": 0.39097744360902253, + "grad_norm": 7.15625, + "kl": 0.2585775926709175, + "learning_rate": 4.804511278195489e-06, + "loss": 0.029, + "num_tokens": 15956086.0, + "reward": -1.6853113710880279, + "reward_std": 6.823435592651367, + "rewards/get_chromagram_reward": 0.6293639838695526, + "rewards/get_chromagram_reward_std": 0.11193648576736451, + "rewards/get_intelligibility_reward": -5.663621878623962, + "rewards/get_intelligibility_reward_std": 10.723005723953246, + "rewards/get_target_len_reward": -0.021675997786223887, + "rewards/get_target_len_reward_std": 0.06440430246293545, + "step": 520 + }, + { + "advantages": -9.18905129765335e-09, + "advantages_std": 1.5427281141281128, + "clip_ratio": 0.0, + "completion_length": 87.26071624755859, + "epoch": 0.39849624060150374, + "grad_norm": 14.5, + "kl": 0.2736205294728279, + "learning_rate": 4.800751879699249e-06, + "loss": 0.0322, + "num_tokens": 16263270.0, + "reward": -1.678658276796341, + "reward_std": 6.964376831054688, + "rewards/get_chromagram_reward": 0.6212248384952546, + "rewards/get_chromagram_reward_std": 0.1208167664706707, + "rewards/get_intelligibility_reward": -5.635741448402404, + "rewards/get_intelligibility_reward_std": 10.981070423126221, + "rewards/get_target_len_reward": -0.02145816870033741, + "rewards/get_target_len_reward_std": 0.0654794754460454, + "step": 530 + }, + { + "advantages": 9.809930503479337e-08, + "advantages_std": 1.6744032025337219, + "clip_ratio": 0.0, + "completion_length": 87.52143096923828, + "epoch": 0.40601503759398494, + "grad_norm": 5.59375, + "kl": 0.29339379668235777, + "learning_rate": 4.796992481203008e-06, + "loss": 0.0315, + "num_tokens": 16571800.0, + "reward": -1.3652776062488556, + "reward_std": 6.553081464767456, + "rewards/get_chromagram_reward": 0.6332614958286286, + "rewards/get_chromagram_reward_std": 0.11830071583390236, + "rewards/get_intelligibility_reward": -4.706533789634705, + "rewards/get_intelligibility_reward_std": 10.460928821563721, + "rewards/get_target_len_reward": -0.02256029974669218, + "rewards/get_target_len_reward_std": 0.060732940770685674, + "step": 540 + }, + { + "advantages": 2.3345152975196015e-07, + "advantages_std": 1.5902326703071594, + "clip_ratio": 0.0, + "completion_length": 83.64523849487304, + "epoch": 0.41353383458646614, + "grad_norm": 39.5, + "kl": 0.252433679997921, + "learning_rate": 4.793233082706767e-06, + "loss": 0.0288, + "num_tokens": 16868510.0, + "reward": -2.3146336674690247, + "reward_std": 7.112114381790161, + "rewards/get_chromagram_reward": 0.6218239188194274, + "rewards/get_chromagram_reward_std": 0.12277880832552909, + "rewards/get_intelligibility_reward": -7.544064474105835, + "rewards/get_intelligibility_reward_std": 10.471257495880128, + "rewards/get_target_len_reward": -0.021660258620977403, + "rewards/get_target_len_reward_std": 0.06123478710651398, + "step": 550 + }, + { + "advantages": 2.9032429722519736e-07, + "advantages_std": 1.516219162940979, + "clip_ratio": 0.0, + "completion_length": 85.42381134033204, + "epoch": 0.42105263157894735, + "grad_norm": 13.4375, + "kl": 0.25826217532157897, + "learning_rate": 4.789473684210527e-06, + "loss": 0.0308, + "num_tokens": 17171120.0, + "reward": -1.5234275877475738, + "reward_std": 6.649781656265259, + "rewards/get_chromagram_reward": 0.6296819686889649, + "rewards/get_chromagram_reward_std": 0.11613814607262611, + "rewards/get_intelligibility_reward": -5.17573721408844, + "rewards/get_intelligibility_reward_std": 10.52333984375, + "rewards/get_target_len_reward": -0.024227123986929656, + "rewards/get_target_len_reward_std": 0.06795755084604024, + "step": 560 + }, + { + "advantages": -3.3006072328589655e-07, + "advantages_std": 1.5026580929756164, + "clip_ratio": 0.0, + "completion_length": 88.41131134033203, + "epoch": 0.42857142857142855, + "grad_norm": 7.15625, + "kl": 0.5135219663381576, + "learning_rate": 4.785714285714287e-06, + "loss": 0.056, + "num_tokens": 17481979.0, + "reward": -1.4849580019712447, + "reward_std": 6.6246805667877195, + "rewards/get_chromagram_reward": 0.6225826203823089, + "rewards/get_chromagram_reward_std": 0.11443077027797699, + "rewards/get_intelligibility_reward": -5.055980670452118, + "rewards/get_intelligibility_reward_std": 10.499218845367432, + "rewards/get_target_len_reward": -0.021475626993924378, + "rewards/get_target_len_reward_std": 0.06761925015598536, + "step": 570 + }, + { + "advantages": -3.099441585163731e-07, + "advantages_std": 1.5539302945137023, + "clip_ratio": 0.0, + "completion_length": 90.10595474243163, + "epoch": 0.43609022556390975, + "grad_norm": 8.6875, + "kl": 0.41011454313993456, + "learning_rate": 4.781954887218045e-06, + "loss": 0.0467, + "num_tokens": 17797588.0, + "reward": -1.4988545447587966, + "reward_std": 7.042573499679565, + "rewards/get_chromagram_reward": 0.6192649960517883, + "rewards/get_chromagram_reward_std": 0.12368768453598022, + "rewards/get_intelligibility_reward": -5.092814598977566, + "rewards/get_intelligibility_reward_std": 11.2423526763916, + "rewards/get_target_len_reward": -0.023013710416853426, + "rewards/get_target_len_reward_std": 0.06633382495492697, + "step": 580 + }, + { + "advantages": 1.0530154903598543e-07, + "advantages_std": 1.600118923187256, + "clip_ratio": 0.0, + "completion_length": 83.48333511352538, + "epoch": 0.44360902255639095, + "grad_norm": 7.875, + "kl": 0.26907303333282473, + "learning_rate": 4.778195488721805e-06, + "loss": 0.0326, + "num_tokens": 18093997.0, + "reward": -2.03454070687294, + "reward_std": 7.202506160736084, + "rewards/get_chromagram_reward": 0.6092650592327118, + "rewards/get_chromagram_reward_std": 0.10631671249866485, + "rewards/get_intelligibility_reward": -6.693427014350891, + "rewards/get_intelligibility_reward_std": 10.90904884338379, + "rewards/get_target_len_reward": -0.01945957327261567, + "rewards/get_target_len_reward_std": 0.06454089805483817, + "step": 590 + }, + { + "advantages": -4.1872264446851657e-07, + "advantages_std": 1.4982729434967041, + "clip_ratio": 0.0, + "completion_length": 84.88095397949219, + "epoch": 0.45112781954887216, + "grad_norm": 5.96875, + "kl": 0.2921657621860504, + "learning_rate": 4.774436090225565e-06, + "loss": 0.034, + "num_tokens": 18394582.0, + "reward": -1.8129005968570708, + "reward_std": 7.103170919418335, + "rewards/get_chromagram_reward": 0.6202451944351196, + "rewards/get_chromagram_reward_std": 0.11972960755228997, + "rewards/get_intelligibility_reward": -6.034282064437866, + "rewards/get_intelligibility_reward_std": 11.095508289337157, + "rewards/get_target_len_reward": -0.024664431624114514, + "rewards/get_target_len_reward_std": 0.07610471770167351, + "step": 600 + }, + { + "advantages": 5.799035989184631e-08, + "advantages_std": 1.5440994024276733, + "clip_ratio": 0.0, + "completion_length": 87.07857360839844, + "epoch": 0.45864661654135336, + "grad_norm": 6.21875, + "kl": 0.4257489159703255, + "learning_rate": 4.7706766917293235e-06, + "loss": 0.0471, + "num_tokens": 18701907.0, + "reward": -1.700702142715454, + "reward_std": 6.709496259689331, + "rewards/get_chromagram_reward": 0.6193202555179596, + "rewards/get_chromagram_reward_std": 0.12053735405206681, + "rewards/get_intelligibility_reward": -5.697566413879395, + "rewards/get_intelligibility_reward_std": 10.440969467163086, + "rewards/get_target_len_reward": -0.023859874997287988, + "rewards/get_target_len_reward_std": 0.06426951251924037, + "step": 610 + }, + { + "advantages": 5.923211645608717e-07, + "advantages_std": 1.5804563403129577, + "clip_ratio": 0.0, + "completion_length": 85.25654907226563, + "epoch": 0.46616541353383456, + "grad_norm": 8.0625, + "kl": 0.3186333954334259, + "learning_rate": 4.766917293233083e-06, + "loss": 0.0386, + "num_tokens": 19004660.0, + "reward": -1.5786213517189025, + "reward_std": 6.751915645599365, + "rewards/get_chromagram_reward": 0.6187632083892822, + "rewards/get_chromagram_reward_std": 0.10962516814470291, + "rewards/get_intelligibility_reward": -5.333805966377258, + "rewards/get_intelligibility_reward_std": 10.596128702163696, + "rewards/get_target_len_reward": -0.020821068761870266, + "rewards/get_target_len_reward_std": 0.07383420001715421, + "step": 620 + }, + { + "advantages": 2.384185933124172e-07, + "advantages_std": 1.6757961630821228, + "clip_ratio": 0.0, + "completion_length": 85.70238189697265, + "epoch": 0.47368421052631576, + "grad_norm": 1440.0, + "kl": 0.4250665083527565, + "learning_rate": 4.763157894736842e-06, + "loss": 0.0464, + "num_tokens": 19308680.0, + "reward": -1.7613612473011018, + "reward_std": 6.973189926147461, + "rewards/get_chromagram_reward": 0.6115583896636962, + "rewards/get_chromagram_reward_std": 0.11262777373194695, + "rewards/get_intelligibility_reward": -5.871619367599488, + "rewards/get_intelligibility_reward_std": 10.821762990951537, + "rewards/get_target_len_reward": -0.02402249900624156, + "rewards/get_target_len_reward_std": 0.061053736694157125, + "step": 630 + }, + { + "advantages": -4.579623663403254e-07, + "advantages_std": 1.5364292025566102, + "clip_ratio": 0.0, + "completion_length": 87.05357131958007, + "epoch": 0.48120300751879697, + "grad_norm": 7.75, + "kl": 0.30670359134674074, + "learning_rate": 4.759398496240602e-06, + "loss": 0.0329, + "num_tokens": 19616135.0, + "reward": -1.9259871065616607, + "reward_std": 7.262286853790283, + "rewards/get_chromagram_reward": 0.6155335962772369, + "rewards/get_chromagram_reward_std": 0.10379507169127464, + "rewards/get_intelligibility_reward": -6.371912264823914, + "rewards/get_intelligibility_reward_std": 11.193008613586425, + "rewards/get_target_len_reward": -0.021582304313778878, + "rewards/get_target_len_reward_std": 0.06231225673109293, + "step": 640 + }, + { + "advantages": -1.8179416159114225e-07, + "advantages_std": 1.6486274361610413, + "clip_ratio": 0.0, + "completion_length": 88.81666870117188, + "epoch": 0.48872180451127817, + "grad_norm": 30336.0, + "kl": 2.4305228680372237, + "learning_rate": 4.755639097744361e-06, + "loss": 0.2454, + "num_tokens": 19928080.0, + "reward": -1.6895100951194764, + "reward_std": 7.358239984512329, + "rewards/get_chromagram_reward": 0.6077431797981262, + "rewards/get_chromagram_reward_std": 0.11492864713072777, + "rewards/get_intelligibility_reward": -5.655220425128936, + "rewards/get_intelligibility_reward_std": 11.619172477722168, + "rewards/get_target_len_reward": -0.021052911598235368, + "rewards/get_target_len_reward_std": 0.06442425940185785, + "step": 650 + }, + { + "advantages": -1.544753786220099e-07, + "advantages_std": 1.6463606476783752, + "clip_ratio": 0.0, + "completion_length": 85.95714416503907, + "epoch": 0.49624060150375937, + "grad_norm": 5.46875, + "kl": 0.3190068453550339, + "learning_rate": 4.751879699248121e-06, + "loss": 0.0348, + "num_tokens": 20231918.0, + "reward": -2.009367752075195, + "reward_std": 7.01142954826355, + "rewards/get_chromagram_reward": 0.6336705982685089, + "rewards/get_chromagram_reward_std": 0.1101664699614048, + "rewards/get_intelligibility_reward": -6.642132258415222, + "rewards/get_intelligibility_reward_std": 10.66052188873291, + "rewards/get_target_len_reward": -0.01964133554138243, + "rewards/get_target_len_reward_std": 0.04977958481758833, + "step": 660 + }, + { + "advantages": -1.4205774192532772e-07, + "advantages_std": 1.5998609900474547, + "clip_ratio": 0.0, + "completion_length": 84.0226203918457, + "epoch": 0.5037593984962406, + "grad_norm": 5.5625, + "kl": 0.2667311102151871, + "learning_rate": 4.74812030075188e-06, + "loss": 0.0347, + "num_tokens": 20531077.0, + "reward": -2.143736845254898, + "reward_std": 7.50629448890686, + "rewards/get_chromagram_reward": 0.6176834642887116, + "rewards/get_chromagram_reward_std": 0.12896764725446702, + "rewards/get_intelligibility_reward": -7.02304618358612, + "rewards/get_intelligibility_reward_std": 11.318552112579345, + "rewards/get_target_len_reward": -0.025847438164055346, + "rewards/get_target_len_reward_std": 0.08172615952789783, + "step": 670 + }, + { + "advantages": -2.93925414673879e-07, + "advantages_std": 1.6468318104743958, + "clip_ratio": 0.0, + "completion_length": 84.94345397949219, + "epoch": 0.5112781954887218, + "grad_norm": 7.0625, + "kl": 0.3305581882596016, + "learning_rate": 4.744360902255639e-06, + "loss": 0.041, + "num_tokens": 20832587.0, + "reward": -1.2728881180286407, + "reward_std": 6.217760515213013, + "rewards/get_chromagram_reward": 0.6311124086380004, + "rewards/get_chromagram_reward_std": 0.11825463101267815, + "rewards/get_intelligibility_reward": -4.425248765945435, + "rewards/get_intelligibility_reward_std": 9.891574048995972, + "rewards/get_target_len_reward": -0.02452768972143531, + "rewards/get_target_len_reward_std": 0.07514422051608563, + "step": 680 + }, + { + "advantages": -2.4388234116656803e-07, + "advantages_std": 1.4703909873962402, + "clip_ratio": 0.0, + "completion_length": 85.74226379394531, + "epoch": 0.518796992481203, + "grad_norm": 9.625, + "kl": 0.2962498337030411, + "learning_rate": 4.740601503759399e-06, + "loss": 0.0323, + "num_tokens": 21135905.0, + "reward": -1.868764042854309, + "reward_std": 7.241525459289551, + "rewards/get_chromagram_reward": 0.6176757931709289, + "rewards/get_chromagram_reward_std": 0.12186905890703201, + "rewards/get_intelligibility_reward": -6.201520228385926, + "rewards/get_intelligibility_reward_std": 11.264957427978516, + "rewards/get_target_len_reward": -0.022447278164327143, + "rewards/get_target_len_reward_std": 0.055239592865109446, + "step": 690 + }, + { + "advantages": 5.366901660863732e-07, + "advantages_std": 1.5638061165809631, + "clip_ratio": 0.0, + "completion_length": 86.67500152587891, + "epoch": 0.5263157894736842, + "grad_norm": 5.125, + "kl": 0.28444311767816544, + "learning_rate": 4.736842105263158e-06, + "loss": 0.0319, + "num_tokens": 21442155.0, + "reward": -1.47476726770401, + "reward_std": 6.729415082931519, + "rewards/get_chromagram_reward": 0.6268014788627625, + "rewards/get_chromagram_reward_std": 0.11580024063587188, + "rewards/get_intelligibility_reward": -5.027878886461258, + "rewards/get_intelligibility_reward_std": 10.612294578552246, + "rewards/get_target_len_reward": -0.023224306292831898, + "rewards/get_target_len_reward_std": 0.07012978214770556, + "step": 700 + }, + { + "advantages": 6.116927021793117e-07, + "advantages_std": 1.6555665493011475, + "clip_ratio": 0.0, + "completion_length": 88.07381134033203, + "epoch": 0.5338345864661654, + "grad_norm": 25.875, + "kl": 0.31802781522274015, + "learning_rate": 4.733082706766917e-06, + "loss": 0.0379, + "num_tokens": 21751478.0, + "reward": -2.0176144003868104, + "reward_std": 7.255285930633545, + "rewards/get_chromagram_reward": 0.6271386921405793, + "rewards/get_chromagram_reward_std": 0.11421655938029289, + "rewards/get_intelligibility_reward": -6.658082771301269, + "rewards/get_intelligibility_reward_std": 11.103228569030762, + "rewards/get_target_len_reward": -0.021898697735741733, + "rewards/get_target_len_reward_std": 0.061879089660942556, + "step": 710 + }, + { + "advantages": -2.7765831447368326e-07, + "advantages_std": 1.6042242527008057, + "clip_ratio": 0.0, + "completion_length": 85.33214416503907, + "epoch": 0.5413533834586466, + "grad_norm": 5.78125, + "kl": 0.31839183866977694, + "learning_rate": 4.729323308270677e-06, + "loss": 0.0369, + "num_tokens": 22054086.0, + "reward": -1.5850762702524661, + "reward_std": 6.881160306930542, + "rewards/get_chromagram_reward": 0.6308808922767639, + "rewards/get_chromagram_reward_std": 0.11955713406205178, + "rewards/get_intelligibility_reward": -5.366273105144501, + "rewards/get_intelligibility_reward_std": 10.855913162231445, + "rewards/get_target_len_reward": -0.019836284592747687, + "rewards/get_target_len_reward_std": 0.05966003518551588, + "step": 720 + }, + { + "advantages": 1.5373030635146278e-07, + "advantages_std": 1.6363926649093627, + "clip_ratio": 0.0, + "completion_length": 87.30714492797851, + "epoch": 0.5488721804511278, + "grad_norm": 9.0, + "kl": 0.2782985955476761, + "learning_rate": 4.725563909774437e-06, + "loss": 0.031, + "num_tokens": 22361267.0, + "reward": -1.9439775586128234, + "reward_std": 7.178706884384155, + "rewards/get_chromagram_reward": 0.6102202415466309, + "rewards/get_chromagram_reward_std": 0.11201724261045456, + "rewards/get_intelligibility_reward": -6.42216944694519, + "rewards/get_intelligibility_reward_std": 11.05174961090088, + "rewards/get_target_len_reward": -0.019982862658798693, + "rewards/get_target_len_reward_std": 0.05483436398208141, + "step": 730 + }, + { + "advantages": -2.774099560731713e-07, + "advantages_std": 1.6907092094421388, + "clip_ratio": 0.0, + "completion_length": 85.42440567016601, + "epoch": 0.556390977443609, + "grad_norm": 83.5, + "kl": 0.30403348952531817, + "learning_rate": 4.7218045112781955e-06, + "loss": 0.0391, + "num_tokens": 22662925.0, + "reward": -1.8942086100578308, + "reward_std": 6.965086030960083, + "rewards/get_chromagram_reward": 0.6224843442440033, + "rewards/get_chromagram_reward_std": 0.1091718964278698, + "rewards/get_intelligibility_reward": -6.277362990379333, + "rewards/get_intelligibility_reward_std": 10.695509052276611, + "rewards/get_target_len_reward": -0.027746990974992513, + "rewards/get_target_len_reward_std": 0.09219296015799046, + "step": 740 + }, + { + "advantages": -1.3162691061552323e-07, + "advantages_std": 1.654877233505249, + "clip_ratio": 0.0, + "completion_length": 85.3678581237793, + "epoch": 0.5639097744360902, + "grad_norm": 12.375, + "kl": 0.8061857357621193, + "learning_rate": 4.718045112781955e-06, + "loss": 0.0843, + "num_tokens": 22965571.0, + "reward": -1.688933926820755, + "reward_std": 6.675320863723755, + "rewards/get_chromagram_reward": 0.6081489741802215, + "rewards/get_chromagram_reward_std": 0.12180505245923996, + "rewards/get_intelligibility_reward": -5.65586256980896, + "rewards/get_intelligibility_reward_std": 10.319089794158936, + "rewards/get_target_len_reward": -0.019087663665413857, + "rewards/get_target_len_reward_std": 0.058424011990427974, + "step": 750 + }, + { + "advantages": -2.2302070163959796e-07, + "advantages_std": 1.612697470188141, + "clip_ratio": 0.0, + "completion_length": 86.45774002075196, + "epoch": 0.5714285714285714, + "grad_norm": 17.375, + "kl": 0.3379309445619583, + "learning_rate": 4.714285714285715e-06, + "loss": 0.0397, + "num_tokens": 23271039.0, + "reward": -1.8412569880485534, + "reward_std": 6.6673956394195555, + "rewards/get_chromagram_reward": 0.6226194262504577, + "rewards/get_chromagram_reward_std": 0.12411452755331993, + "rewards/get_intelligibility_reward": -6.122761702537536, + "rewards/get_intelligibility_reward_std": 10.182913446426392, + "rewards/get_target_len_reward": -0.02362839113920927, + "rewards/get_target_len_reward_std": 0.06820572800934314, + "step": 760 + }, + { + "advantages": -7.972120670274308e-08, + "advantages_std": 1.614003050327301, + "clip_ratio": 0.0, + "completion_length": 84.36309661865235, + "epoch": 0.5789473684210527, + "grad_norm": 7.5, + "kl": 0.32674605399370193, + "learning_rate": 4.710526315789474e-06, + "loss": 0.0362, + "num_tokens": 23570408.0, + "reward": -1.632363921403885, + "reward_std": 6.429037570953369, + "rewards/get_chromagram_reward": 0.6164280056953431, + "rewards/get_chromagram_reward_std": 0.11995449736714363, + "rewards/get_intelligibility_reward": -5.492781853675842, + "rewards/get_intelligibility_reward_std": 9.978094005584717, + "rewards/get_target_len_reward": -0.020737541373819113, + "rewards/get_target_len_reward_std": 0.05414057523012161, + "step": 770 + }, + { + "advantages": -1.0542570532123818e-07, + "advantages_std": 1.5682517766952515, + "clip_ratio": 0.0, + "completion_length": 87.86547775268555, + "epoch": 0.5864661654135338, + "grad_norm": 47.0, + "kl": 0.4949570521712303, + "learning_rate": 4.706766917293233e-06, + "loss": 0.0534, + "num_tokens": 23879465.0, + "reward": -1.6829198122024536, + "reward_std": 6.87082347869873, + "rewards/get_chromagram_reward": 0.6280323505401612, + "rewards/get_chromagram_reward_std": 0.1086762361228466, + "rewards/get_intelligibility_reward": -5.656386470794677, + "rewards/get_intelligibility_reward_std": 10.67564115524292, + "rewards/get_target_len_reward": -0.020404842961579562, + "rewards/get_target_len_reward_std": 0.05386058986186981, + "step": 780 + }, + { + "advantages": -1.0952354614346405e-07, + "advantages_std": 1.6235635638237, + "clip_ratio": 0.0, + "completion_length": 89.78155059814453, + "epoch": 0.5939849624060151, + "grad_norm": 6.125, + "kl": 0.40520851165056226, + "learning_rate": 4.703007518796993e-06, + "loss": 0.0419, + "num_tokens": 24193456.0, + "reward": -1.6209597945213319, + "reward_std": 6.467411375045776, + "rewards/get_chromagram_reward": 0.6285930275917053, + "rewards/get_chromagram_reward_std": 0.121580471098423, + "rewards/get_intelligibility_reward": -5.469432234764099, + "rewards/get_intelligibility_reward_std": 10.131909942626953, + "rewards/get_target_len_reward": -0.022039972990751267, + "rewards/get_target_len_reward_std": 0.05911780633032322, + "step": 790 + }, + { + "advantages": -1.2144447119055713e-07, + "advantages_std": 1.7115575075149536, + "clip_ratio": 0.0, + "completion_length": 84.20952529907227, + "epoch": 0.6015037593984962, + "grad_norm": 32.5, + "kl": 0.4055857822299004, + "learning_rate": 4.6992481203007525e-06, + "loss": 0.0433, + "num_tokens": 24492762.0, + "reward": -1.5891988754272461, + "reward_std": 6.875995683670044, + "rewards/get_chromagram_reward": 0.6362646162509918, + "rewards/get_chromagram_reward_std": 0.11419221684336663, + "rewards/get_intelligibility_reward": -5.380186462402344, + "rewards/get_intelligibility_reward_std": 10.897884845733643, + "rewards/get_target_len_reward": -0.023674625623971223, + "rewards/get_target_len_reward_std": 0.06239906083792448, + "step": 800 + }, + { + "advantages": -2.900759483281945e-07, + "advantages_std": 1.5877565264701843, + "clip_ratio": 0.0, + "completion_length": 87.95714492797852, + "epoch": 0.6090225563909775, + "grad_norm": 5.78125, + "kl": 0.27903091013431547, + "learning_rate": 4.695488721804511e-06, + "loss": 0.0378, + "num_tokens": 24801534.0, + "reward": -1.5388197422027587, + "reward_std": 6.82519645690918, + "rewards/get_chromagram_reward": 0.6175784945487977, + "rewards/get_chromagram_reward_std": 0.11102894842624664, + "rewards/get_intelligibility_reward": -5.2068812370300295, + "rewards/get_intelligibility_reward_std": 10.757489204406738, + "rewards/get_target_len_reward": -0.027156245335936545, + "rewards/get_target_len_reward_std": 0.0964772343635559, + "step": 810 + }, + { + "advantages": 1.3982256774625057e-07, + "advantages_std": 1.4997711062431336, + "clip_ratio": 0.0, + "completion_length": 88.94762115478515, + "epoch": 0.6165413533834586, + "grad_norm": 7.28125, + "kl": 0.29116481095552443, + "learning_rate": 4.691729323308271e-06, + "loss": 0.0281, + "num_tokens": 25113845.0, + "reward": -1.3392221808433533, + "reward_std": 6.566954040527344, + "rewards/get_chromagram_reward": 0.6304997444152832, + "rewards/get_chromagram_reward_std": 0.1095633253455162, + "rewards/get_intelligibility_reward": -4.630185222625732, + "rewards/get_intelligibility_reward_std": 10.61223382949829, + "rewards/get_target_len_reward": -0.017980954982340334, + "rewards/get_target_len_reward_std": 0.04479764401912689, + "step": 820 + }, + { + "advantages": -6.432333492512043e-08, + "advantages_std": 1.621793019771576, + "clip_ratio": 0.0, + "completion_length": 86.63928680419922, + "epoch": 0.6240601503759399, + "grad_norm": 6.21875, + "kl": 0.2632207229733467, + "learning_rate": 4.687969924812031e-06, + "loss": 0.0303, + "num_tokens": 25419723.0, + "reward": -1.6574245631694793, + "reward_std": 6.641559648513794, + "rewards/get_chromagram_reward": 0.6183900475502014, + "rewards/get_chromagram_reward_std": 0.12058342695236206, + "rewards/get_intelligibility_reward": -5.569248151779175, + "rewards/get_intelligibility_reward_std": 10.371752309799195, + "rewards/get_target_len_reward": -0.021415283996611835, + "rewards/get_target_len_reward_std": 0.062272250093519686, + "step": 830 + }, + { + "advantages": 4.901861231587645e-07, + "advantages_std": 1.491612982749939, + "clip_ratio": 0.0, + "completion_length": 86.89464416503907, + "epoch": 0.631578947368421, + "grad_norm": 88.5, + "kl": 0.3433301538228989, + "learning_rate": 4.68421052631579e-06, + "loss": 0.0352, + "num_tokens": 25726539.0, + "reward": -1.5290105819702149, + "reward_std": 6.745681285858154, + "rewards/get_chromagram_reward": 0.614410787820816, + "rewards/get_chromagram_reward_std": 0.10712209716439247, + "rewards/get_intelligibility_reward": -5.180805230140686, + "rewards/get_intelligibility_reward_std": 10.757587146759032, + "rewards/get_target_len_reward": -0.020636959094554187, + "rewards/get_target_len_reward_std": 0.06199995744973421, + "step": 840 + }, + { + "advantages": 2.0439424215368262e-07, + "advantages_std": 1.6053584575653077, + "clip_ratio": 0.0, + "completion_length": 83.75238189697265, + "epoch": 0.6390977443609023, + "grad_norm": 7.75, + "kl": 0.32621364295482635, + "learning_rate": 4.680451127819549e-06, + "loss": 0.0354, + "num_tokens": 26025367.0, + "reward": -1.6455067068338394, + "reward_std": 6.85167384147644, + "rewards/get_chromagram_reward": 0.6253707230091095, + "rewards/get_chromagram_reward_std": 0.12833054959774018, + "rewards/get_intelligibility_reward": -5.537108218669891, + "rewards/get_intelligibility_reward_std": 10.775818157196046, + "rewards/get_target_len_reward": -0.02478252612054348, + "rewards/get_target_len_reward_std": 0.05785290952771902, + "step": 850 + }, + { + "advantages": -2.25504242123975e-07, + "advantages_std": 1.7222684264183044, + "clip_ratio": 0.0, + "completion_length": 87.38392944335938, + "epoch": 0.6466165413533834, + "grad_norm": 6.46875, + "kl": 0.5005932718515396, + "learning_rate": 4.676691729323309e-06, + "loss": 0.0556, + "num_tokens": 26333639.0, + "reward": -1.5956645965576173, + "reward_std": 7.104577875137329, + "rewards/get_chromagram_reward": 0.6158910393714905, + "rewards/get_chromagram_reward_std": 0.11812372878193855, + "rewards/get_intelligibility_reward": -5.373835563659668, + "rewards/get_intelligibility_reward_std": 11.39167242050171, + "rewards/get_target_len_reward": -0.0290489312261343, + "rewards/get_target_len_reward_std": 0.09160682074725628, + "step": 860 + }, + { + "advantages": 2.6449561438823823e-07, + "advantages_std": 1.6398038148880005, + "clip_ratio": 0.0, + "completion_length": 86.44583435058594, + "epoch": 0.6541353383458647, + "grad_norm": 7.1875, + "kl": 0.3201334476470947, + "learning_rate": 4.672932330827068e-06, + "loss": 0.039, + "num_tokens": 26639675.0, + "reward": -1.377868014574051, + "reward_std": 6.831578731536865, + "rewards/get_chromagram_reward": 0.6151859581470489, + "rewards/get_chromagram_reward_std": 0.12284478545188904, + "rewards/get_intelligibility_reward": -4.722595846652984, + "rewards/get_intelligibility_reward_std": 10.993205451965332, + "rewards/get_target_len_reward": -0.026193857286125423, + "rewards/get_target_len_reward_std": 0.0736829001456499, + "step": 870 + }, + { + "advantages": 2.769132663615892e-07, + "advantages_std": 1.663590395450592, + "clip_ratio": 0.0, + "completion_length": 84.027978515625, + "epoch": 0.6616541353383458, + "grad_norm": 14.25, + "kl": 0.29166722744703294, + "learning_rate": 4.669172932330828e-06, + "loss": 0.0297, + "num_tokens": 26939198.0, + "reward": -1.411170706152916, + "reward_std": 6.868584156036377, + "rewards/get_chromagram_reward": 0.6172568500041962, + "rewards/get_chromagram_reward_std": 0.11014089062809944, + "rewards/get_intelligibility_reward": -4.832118815183639, + "rewards/get_intelligibility_reward_std": 11.0176420211792, + "rewards/get_target_len_reward": -0.01864988887682557, + "rewards/get_target_len_reward_std": 0.0528477106243372, + "step": 880 + }, + { + "advantages": -1.1473893692937054e-07, + "advantages_std": 1.6940292239189148, + "clip_ratio": 0.0, + "completion_length": 87.0202392578125, + "epoch": 0.6691729323308271, + "grad_norm": 5.6875, + "kl": 0.39647049456834793, + "learning_rate": 4.665413533834587e-06, + "loss": 0.0423, + "num_tokens": 27246872.0, + "reward": -1.597488921880722, + "reward_std": 7.147005319595337, + "rewards/get_chromagram_reward": 0.6287810504436493, + "rewards/get_chromagram_reward_std": 0.12597450688481332, + "rewards/get_intelligibility_reward": -5.393320921063423, + "rewards/get_intelligibility_reward_std": 11.286065196990966, + "rewards/get_target_len_reward": -0.02792652351781726, + "rewards/get_target_len_reward_std": 0.06876220367848873, + "step": 890 + }, + { + "advantages": 7.887681448437433e-07, + "advantages_std": 1.6071461677551269, + "clip_ratio": 0.0, + "completion_length": 86.16785888671875, + "epoch": 0.6766917293233082, + "grad_norm": 5.9375, + "kl": 0.6452051237225532, + "learning_rate": 4.661654135338346e-06, + "loss": 0.0648, + "num_tokens": 27551525.0, + "reward": -1.593279379606247, + "reward_std": 6.675328016281128, + "rewards/get_chromagram_reward": 0.6114086985588074, + "rewards/get_chromagram_reward_std": 0.11248691827058792, + "rewards/get_intelligibility_reward": -5.372988653182984, + "rewards/get_intelligibility_reward_std": 10.505959796905518, + "rewards/get_target_len_reward": -0.018257823958992957, + "rewards/get_target_len_reward_std": 0.04569785110652447, + "step": 900 + }, + { + "advantages": -4.731118766088116e-07, + "advantages_std": 1.5220891833305359, + "clip_ratio": 0.0, + "completion_length": 87.25535888671875, + "epoch": 0.6842105263157895, + "grad_norm": 49.75, + "kl": 0.38621631264686584, + "learning_rate": 4.657894736842106e-06, + "loss": 0.0444, + "num_tokens": 27859147.0, + "reward": -1.7658088684082032, + "reward_std": 7.283221912384033, + "rewards/get_chromagram_reward": 0.6201018691062927, + "rewards/get_chromagram_reward_std": 0.1176083043217659, + "rewards/get_intelligibility_reward": -5.895946288108826, + "rewards/get_intelligibility_reward_std": 11.443974113464355, + "rewards/get_target_len_reward": -0.02158181704580784, + "rewards/get_target_len_reward_std": 0.05893752183765173, + "step": 910 + }, + { + "advantages": 2.9044847664749797e-07, + "advantages_std": 1.5768916845321654, + "clip_ratio": 0.0, + "completion_length": 86.76369247436523, + "epoch": 0.6917293233082706, + "grad_norm": 5.5625, + "kl": 0.2984780207276344, + "learning_rate": 4.654135338345865e-06, + "loss": 0.0331, + "num_tokens": 28165265.0, + "reward": -1.2902542769908905, + "reward_std": 6.306678295135498, + "rewards/get_chromagram_reward": 0.6366979598999023, + "rewards/get_chromagram_reward_std": 0.10054028406739235, + "rewards/get_intelligibility_reward": -4.488920116424561, + "rewards/get_intelligibility_reward_std": 10.173047637939453, + "rewards/get_target_len_reward": -0.018540383130311967, + "rewards/get_target_len_reward_std": 0.046944990381598474, + "step": 920 + }, + { + "advantages": 3.6557516409629897e-07, + "advantages_std": 1.625953483581543, + "clip_ratio": 0.0, + "completion_length": 84.18392944335938, + "epoch": 0.6992481203007519, + "grad_norm": 5.84375, + "kl": 0.2764819011092186, + "learning_rate": 4.6503759398496245e-06, + "loss": 0.0343, + "num_tokens": 28464361.0, + "reward": -1.5938740998506546, + "reward_std": 6.704269456863403, + "rewards/get_chromagram_reward": 0.6101159989833832, + "rewards/get_chromagram_reward_std": 0.11639057248830795, + "rewards/get_intelligibility_reward": -5.371188521385193, + "rewards/get_intelligibility_reward_std": 10.497893142700196, + "rewards/get_target_len_reward": -0.020549843832850457, + "rewards/get_target_len_reward_std": 0.06374723017215729, + "step": 930 + }, + { + "advantages": 3.159046173095703e-07, + "advantages_std": 1.573643147945404, + "clip_ratio": 0.0, + "completion_length": 90.86190643310547, + "epoch": 0.706766917293233, + "grad_norm": 5.53125, + "kl": 0.3013840883970261, + "learning_rate": 4.646616541353383e-06, + "loss": 0.0398, + "num_tokens": 28781202.0, + "reward": -1.6651936948299408, + "reward_std": 6.829349184036255, + "rewards/get_chromagram_reward": 0.646953010559082, + "rewards/get_chromagram_reward_std": 0.11124408766627311, + "rewards/get_intelligibility_reward": -5.6138955950737, + "rewards/get_intelligibility_reward_std": 10.676132678985596, + "rewards/get_target_len_reward": -0.028638134244829416, + "rewards/get_target_len_reward_std": 0.08526257313787937, + "step": 940 + }, + { + "advantages": 2.5058785411147257e-07, + "advantages_std": 1.6995879650115966, + "clip_ratio": 0.0, + "completion_length": 88.37024002075195, + "epoch": 0.7142857142857143, + "grad_norm": 6.78125, + "kl": 0.3869833633303642, + "learning_rate": 4.642857142857144e-06, + "loss": 0.0397, + "num_tokens": 29092284.0, + "reward": -1.2819394290447235, + "reward_std": 6.067027044296265, + "rewards/get_chromagram_reward": 0.6137153327465057, + "rewards/get_chromagram_reward_std": 0.10772662758827209, + "rewards/get_intelligibility_reward": -4.441144847869873, + "rewards/get_intelligibility_reward_std": 9.736154413223266, + "rewards/get_target_len_reward": -0.018388483859598636, + "rewards/get_target_len_reward_std": 0.05689036846160889, + "step": 950 + }, + { + "advantages": 1.5944243330068276e-07, + "advantages_std": 1.5586883306503296, + "clip_ratio": 0.0, + "completion_length": 85.98333511352538, + "epoch": 0.7218045112781954, + "grad_norm": 5.25, + "kl": 0.498983108997345, + "learning_rate": 4.639097744360903e-06, + "loss": 0.0583, + "num_tokens": 29396896.0, + "reward": -1.1031381070613862, + "reward_std": 6.2076152801513675, + "rewards/get_chromagram_reward": 0.6318881809711456, + "rewards/get_chromagram_reward_std": 0.11726146414875985, + "rewards/get_intelligibility_reward": -3.916832911968231, + "rewards/get_intelligibility_reward_std": 10.10825605392456, + "rewards/get_target_len_reward": -0.024469429068267344, + "rewards/get_target_len_reward_std": 0.07591898571699858, + "step": 960 + }, + { + "advantages": -4.048150259450267e-08, + "advantages_std": 1.519583487510681, + "clip_ratio": 0.0, + "completion_length": 85.97916870117187, + "epoch": 0.7293233082706767, + "grad_norm": 6.40625, + "kl": 0.3494548827409744, + "learning_rate": 4.635338345864662e-06, + "loss": 0.0376, + "num_tokens": 29701172.0, + "reward": -1.7784659802913665, + "reward_std": 6.481203222274781, + "rewards/get_chromagram_reward": 0.6209641814231872, + "rewards/get_chromagram_reward_std": 0.10738300830125809, + "rewards/get_intelligibility_reward": -5.936930441856385, + "rewards/get_intelligibility_reward_std": 9.880269956588744, + "rewards/get_target_len_reward": -0.019431459810584785, + "rewards/get_target_len_reward_std": 0.05564035829156637, + "step": 970 + }, + { + "advantages": 5.463760022195175e-09, + "advantages_std": 1.399158489704132, + "clip_ratio": 0.0, + "completion_length": 86.23690643310547, + "epoch": 0.7368421052631579, + "grad_norm": 6.90625, + "kl": 0.3064037337899208, + "learning_rate": 4.631578947368421e-06, + "loss": 0.031, + "num_tokens": 30005992.0, + "reward": -1.6206971883773804, + "reward_std": 6.9678229808807375, + "rewards/get_chromagram_reward": 0.6120537519454956, + "rewards/get_chromagram_reward_std": 0.11391936540603638, + "rewards/get_intelligibility_reward": -5.458502078056336, + "rewards/get_intelligibility_reward_std": 10.979090690612793, + "rewards/get_target_len_reward": -0.015642876317724586, + "rewards/get_target_len_reward_std": 0.03758895331993699, + "step": 980 + }, + { + "advantages": 9.114544923249923e-08, + "advantages_std": 1.6145791053771972, + "clip_ratio": 0.0, + "completion_length": 81.36369247436524, + "epoch": 0.7443609022556391, + "grad_norm": 7.34375, + "kl": 0.36196746826171877, + "learning_rate": 4.6278195488721815e-06, + "loss": 0.0395, + "num_tokens": 30297509.0, + "reward": -1.9562557220458985, + "reward_std": 6.989383935928345, + "rewards/get_chromagram_reward": 0.605682373046875, + "rewards/get_chromagram_reward_std": 0.11706684604287147, + "rewards/get_intelligibility_reward": -6.451294040679931, + "rewards/get_intelligibility_reward_std": 10.68880500793457, + "rewards/get_target_len_reward": -0.02315532071515918, + "rewards/get_target_len_reward_std": 0.0718101266771555, + "step": 990 + }, + { + "advantages": 1.899898109058995e-07, + "advantages_std": 1.6159561038017274, + "clip_ratio": 0.0, + "completion_length": 93.06369247436524, + "epoch": 0.7518796992481203, + "grad_norm": 4.96875, + "kl": 0.49673385322093966, + "learning_rate": 4.62406015037594e-06, + "loss": 0.0528, + "num_tokens": 30621053.0, + "reward": -1.3294874399900436, + "reward_std": 6.693207502365112, + "rewards/get_chromagram_reward": 0.6109622955322266, + "rewards/get_chromagram_reward_std": 0.10932595655322075, + "rewards/get_intelligibility_reward": -4.581081557273865, + "rewards/get_intelligibility_reward_std": 10.80920705795288, + "rewards/get_target_len_reward": -0.018342763558030127, + "rewards/get_target_len_reward_std": 0.04740550182759762, + "step": 1000 + }, + { + "advantages": 2.8510890643929087e-07, + "advantages_std": 1.6630101799964905, + "clip_ratio": 0.0, + "completion_length": 84.70238189697265, + "epoch": 0.7593984962406015, + "grad_norm": 5.75, + "kl": 0.31738368421792984, + "learning_rate": 4.620300751879699e-06, + "loss": 0.0351, + "num_tokens": 30922303.0, + "reward": -1.5521818846464157, + "reward_std": 6.914695501327515, + "rewards/get_chromagram_reward": 0.6350254416465759, + "rewards/get_chromagram_reward_std": 0.11678898185491562, + "rewards/get_intelligibility_reward": -5.263093185424805, + "rewards/get_intelligibility_reward_std": 10.940269947052002, + "rewards/get_target_len_reward": -0.028477614279836416, + "rewards/get_target_len_reward_std": 0.07360137198120356, + "step": 1010 + }, + { + "advantages": 1.989926062151426e-07, + "advantages_std": 1.4804226577281951, + "clip_ratio": 0.0, + "completion_length": 85.90238189697266, + "epoch": 0.7669172932330827, + "grad_norm": 18.75, + "kl": 0.2990993529558182, + "learning_rate": 4.616541353383459e-06, + "loss": 0.0338, + "num_tokens": 31225914.0, + "reward": -1.722965794801712, + "reward_std": 6.612655448913574, + "rewards/get_chromagram_reward": 0.6062596440315247, + "rewards/get_chromagram_reward_std": 0.11684568524360657, + "rewards/get_intelligibility_reward": -5.754314303398132, + "rewards/get_intelligibility_reward_std": 10.259960079193116, + "rewards/get_target_len_reward": -0.020842405408620833, + "rewards/get_target_len_reward_std": 0.05934775285422802, + "step": 1020 + }, + { + "advantages": -5.247692380194735e-07, + "advantages_std": 1.5217979907989503, + "clip_ratio": 0.0, + "completion_length": 88.8470245361328, + "epoch": 0.7744360902255639, + "grad_norm": 9.125, + "kl": 0.3498847380280495, + "learning_rate": 4.612781954887218e-06, + "loss": 0.0394, + "num_tokens": 31538857.0, + "reward": -1.323236495256424, + "reward_std": 6.522344350814819, + "rewards/get_chromagram_reward": 0.6311689078807831, + "rewards/get_chromagram_reward_std": 0.11285480856895447, + "rewards/get_intelligibility_reward": -4.577942156791687, + "rewards/get_intelligibility_reward_std": 10.531904697418213, + "rewards/get_target_len_reward": -0.022935927845537663, + "rewards/get_target_len_reward_std": 0.05773061886429787, + "step": 1030 + }, + { + "advantages": -9.474654660834858e-08, + "advantages_std": 1.600497829914093, + "clip_ratio": 0.0, + "completion_length": 84.66666717529297, + "epoch": 0.7819548872180451, + "grad_norm": 7.78125, + "kl": 0.31537162363529203, + "learning_rate": 4.609022556390978e-06, + "loss": 0.0395, + "num_tokens": 31839416.0, + "reward": -2.097861647605896, + "reward_std": 7.330296373367309, + "rewards/get_chromagram_reward": 0.6081015944480896, + "rewards/get_chromagram_reward_std": 0.11224598959088325, + "rewards/get_intelligibility_reward": -6.872477197647095, + "rewards/get_intelligibility_reward_std": 11.225436401367187, + "rewards/get_target_len_reward": -0.02920899149030447, + "rewards/get_target_len_reward_std": 0.09407919310033322, + "step": 1040 + }, + { + "advantages": -1.1250377056626348e-07, + "advantages_std": 1.580076313018799, + "clip_ratio": 0.0, + "completion_length": 87.11666717529297, + "epoch": 0.7894736842105263, + "grad_norm": 8.875, + "kl": 0.2959355965256691, + "learning_rate": 4.605263157894737e-06, + "loss": 0.0323, + "num_tokens": 32146897.0, + "reward": -1.5526989638805389, + "reward_std": 6.767483377456665, + "rewards/get_chromagram_reward": 0.6345309376716614, + "rewards/get_chromagram_reward_std": 0.1191826693713665, + "rewards/get_intelligibility_reward": -5.266578364372253, + "rewards/get_intelligibility_reward_std": 10.712108993530274, + "rewards/get_target_len_reward": -0.02604932654649019, + "rewards/get_target_len_reward_std": 0.07846166621893644, + "step": 1050 + }, + { + "advantages": 1.502533697461672e-07, + "advantages_std": 1.5794021248817445, + "clip_ratio": 0.0, + "completion_length": 86.48452606201172, + "epoch": 0.7969924812030075, + "grad_norm": 31.375, + "kl": 0.2713562995195389, + "learning_rate": 4.6015037593984965e-06, + "loss": 0.0278, + "num_tokens": 32452918.0, + "reward": -1.7753393650054932, + "reward_std": 6.538304424285888, + "rewards/get_chromagram_reward": 0.6110261261463166, + "rewards/get_chromagram_reward_std": 0.11173097193241119, + "rewards/get_intelligibility_reward": -5.921868181228637, + "rewards/get_intelligibility_reward_std": 10.043649768829345, + "rewards/get_target_len_reward": -0.015175698138773442, + "rewards/get_target_len_reward_std": 0.042067173309624194, + "step": 1060 + }, + { + "advantages": 1.1244168627300155e-07, + "advantages_std": 1.675466275215149, + "clip_ratio": 0.0, + "completion_length": 86.32500228881835, + "epoch": 0.8045112781954887, + "grad_norm": 10.5, + "kl": 0.4505449026823044, + "learning_rate": 4.597744360902256e-06, + "loss": 0.0514, + "num_tokens": 32757810.0, + "reward": -1.6142403960227967, + "reward_std": 6.906636905670166, + "rewards/get_chromagram_reward": 0.6085878312587738, + "rewards/get_chromagram_reward_std": 0.11353035345673561, + "rewards/get_intelligibility_reward": -5.427689337730408, + "rewards/get_intelligibility_reward_std": 10.920014953613281, + "rewards/get_target_len_reward": -0.02361916834488511, + "rewards/get_target_len_reward_std": 0.07267850339412689, + "step": 1070 + }, + { + "advantages": -7.525086758164434e-08, + "advantages_std": 1.6398833632469176, + "clip_ratio": 0.0, + "completion_length": 88.7452392578125, + "epoch": 0.8120300751879699, + "grad_norm": 63.25, + "kl": 0.30239309668540953, + "learning_rate": 4.593984962406016e-06, + "loss": 0.0342, + "num_tokens": 33069816.0, + "reward": -1.7610064923763276, + "reward_std": 7.087743330001831, + "rewards/get_chromagram_reward": 0.6198269784450531, + "rewards/get_chromagram_reward_std": 0.11029869988560677, + "rewards/get_intelligibility_reward": -5.880134701728821, + "rewards/get_intelligibility_reward_std": 11.074260044097901, + "rewards/get_target_len_reward": -0.022711543925106527, + "rewards/get_target_len_reward_std": 0.06780009865760803, + "step": 1080 + }, + { + "advantages": 1.7409523564992923e-07, + "advantages_std": 1.4920554280281066, + "clip_ratio": 0.0, + "completion_length": 88.64821548461914, + "epoch": 0.8195488721804511, + "grad_norm": 5.625, + "kl": 2.4893749192357064, + "learning_rate": 4.5902255639097746e-06, + "loss": 0.2518, + "num_tokens": 33380904.0, + "reward": -1.623878252506256, + "reward_std": 6.674379110336304, + "rewards/get_chromagram_reward": 0.6122495532035828, + "rewards/get_chromagram_reward_std": 0.10934195294976234, + "rewards/get_intelligibility_reward": -5.4654449939727785, + "rewards/get_intelligibility_reward_std": 10.542883014678955, + "rewards/get_target_len_reward": -0.018439139425754546, + "rewards/get_target_len_reward_std": 0.05776769071817398, + "step": 1090 + }, + { + "advantages": -6.531675573739904e-08, + "advantages_std": 1.6210807323455811, + "clip_ratio": 0.0, + "completion_length": 87.26309661865234, + "epoch": 0.8270676691729323, + "grad_norm": 9.125, + "kl": 0.3473955288529396, + "learning_rate": 4.586466165413534e-06, + "loss": 0.0368, + "num_tokens": 33688342.0, + "reward": -1.305153553187847, + "reward_std": 6.34152626991272, + "rewards/get_chromagram_reward": 0.6365042924880981, + "rewards/get_chromagram_reward_std": 0.11087472662329674, + "rewards/get_intelligibility_reward": -4.5303013920784, + "rewards/get_intelligibility_reward_std": 10.122239017486573, + "rewards/get_target_len_reward": -0.021663395036011935, + "rewards/get_target_len_reward_std": 0.059238927252590654, + "step": 1100 + }, + { + "advantages": 4.557271902072557e-07, + "advantages_std": 1.661526906490326, + "clip_ratio": 0.0, + "completion_length": 87.11726379394531, + "epoch": 0.8345864661654135, + "grad_norm": 6.75, + "kl": 0.5738143682479858, + "learning_rate": 4.582706766917294e-06, + "loss": 0.0596, + "num_tokens": 33995647.0, + "reward": -1.4435159385204315, + "reward_std": 6.458550071716308, + "rewards/get_chromagram_reward": 0.6249020397663116, + "rewards/get_chromagram_reward_std": 0.11777897700667381, + "rewards/get_intelligibility_reward": -4.9328501462936405, + "rewards/get_intelligibility_reward_std": 10.187185144424438, + "rewards/get_target_len_reward": -0.022599360160529613, + "rewards/get_target_len_reward_std": 0.058740793541073796, + "step": 1110 + }, + { + "advantages": 1.9073487269594125e-07, + "advantages_std": 1.5964321136474608, + "clip_ratio": 0.0, + "completion_length": 87.53690643310547, + "epoch": 0.8421052631578947, + "grad_norm": 8.375, + "kl": 0.27469114661216737, + "learning_rate": 4.578947368421053e-06, + "loss": 0.0335, + "num_tokens": 34304204.0, + "reward": -1.526003235578537, + "reward_std": 6.978189182281494, + "rewards/get_chromagram_reward": 0.6308319568634033, + "rewards/get_chromagram_reward_std": 0.10873896330595016, + "rewards/get_intelligibility_reward": -5.183152413368225, + "rewards/get_intelligibility_reward_std": 11.189978885650635, + "rewards/get_target_len_reward": -0.025688940472900868, + "rewards/get_target_len_reward_std": 0.0640136267989874, + "step": 1120 + }, + { + "advantages": 1.0542571917682153e-06, + "advantages_std": 1.7151224732398986, + "clip_ratio": 0.0, + "completion_length": 88.56905059814453, + "epoch": 0.849624060150376, + "grad_norm": 5.90625, + "kl": 0.31005347073078154, + "learning_rate": 4.575187969924812e-06, + "loss": 0.0312, + "num_tokens": 34616297.0, + "reward": -1.14257645085454, + "reward_std": 6.571909236907959, + "rewards/get_chromagram_reward": 0.6334330260753631, + "rewards/get_chromagram_reward_std": 0.10452088415622711, + "rewards/get_intelligibility_reward": -4.038772355020046, + "rewards/get_intelligibility_reward_std": 10.711493492126465, + "rewards/get_target_len_reward": -0.022389863990247248, + "rewards/get_target_len_reward_std": 0.05032932460308075, + "step": 1130 + }, + { + "advantages": 1.379599261497333e-07, + "advantages_std": 1.608151626586914, + "clip_ratio": 0.0, + "completion_length": 86.90714492797852, + "epoch": 0.8571428571428571, + "grad_norm": 32.25, + "kl": 0.40623040348291395, + "learning_rate": 4.571428571428572e-06, + "loss": 0.0441, + "num_tokens": 34921856.0, + "reward": -1.3641541302204132, + "reward_std": 6.569147109985352, + "rewards/get_chromagram_reward": 0.6162215530872345, + "rewards/get_chromagram_reward_std": 0.11952584758400916, + "rewards/get_intelligibility_reward": -4.683884525299073, + "rewards/get_intelligibility_reward_std": 10.552340030670166, + "rewards/get_target_len_reward": -0.02479925286024809, + "rewards/get_target_len_reward_std": 0.07111198548227549, + "step": 1140 + }, + { + "advantages": -3.4620365880755343e-07, + "advantages_std": 1.6208989381790162, + "clip_ratio": 0.0, + "completion_length": 87.70654907226563, + "epoch": 0.8646616541353384, + "grad_norm": 12.375, + "kl": 0.3247146025300026, + "learning_rate": 4.567669172932332e-06, + "loss": 0.0369, + "num_tokens": 35230640.0, + "reward": -1.501368111371994, + "reward_std": 6.6000793933868405, + "rewards/get_chromagram_reward": 0.6304889559745789, + "rewards/get_chromagram_reward_std": 0.11283566728234291, + "rewards/get_intelligibility_reward": -5.113024723529816, + "rewards/get_intelligibility_reward_std": 10.441469764709472, + "rewards/get_target_len_reward": -0.021568275708705186, + "rewards/get_target_len_reward_std": 0.05896295178681612, + "step": 1150 + }, + { + "advantages": -6.382664125226256e-07, + "advantages_std": 1.5868653297424316, + "clip_ratio": 0.0, + "completion_length": 88.5958351135254, + "epoch": 0.8721804511278195, + "grad_norm": 8.875, + "kl": 0.3063840791583061, + "learning_rate": 4.56390977443609e-06, + "loss": 0.0347, + "num_tokens": 35542296.0, + "reward": -1.5182266354560852, + "reward_std": 6.986057329177856, + "rewards/get_chromagram_reward": 0.6088717997074127, + "rewards/get_chromagram_reward_std": 0.12039782926440239, + "rewards/get_intelligibility_reward": -5.142949795722961, + "rewards/get_intelligibility_reward_std": 11.212296390533448, + "rewards/get_target_len_reward": -0.020601730328053236, + "rewards/get_target_len_reward_std": 0.0644838048145175, + "step": 1160 + }, + { + "advantages": 7.761022402519302e-08, + "advantages_std": 1.5734822034835816, + "clip_ratio": 0.0, + "completion_length": 83.76071624755859, + "epoch": 0.8796992481203008, + "grad_norm": 6.4375, + "kl": 2.6361778348684313, + "learning_rate": 4.56015037593985e-06, + "loss": 0.2637, + "num_tokens": 35840099.0, + "reward": -1.675141602754593, + "reward_std": 6.357161331176758, + "rewards/get_chromagram_reward": 0.6211533367633819, + "rewards/get_chromagram_reward_std": 0.11045403182506561, + "rewards/get_intelligibility_reward": -5.627998042106628, + "rewards/get_intelligibility_reward_std": 9.793633270263673, + "rewards/get_target_len_reward": -0.018579850811511277, + "rewards/get_target_len_reward_std": 0.04803097825497389, + "step": 1170 + }, + { + "advantages": -1.194576420004978e-07, + "advantages_std": 1.5758933544158935, + "clip_ratio": 0.0, + "completion_length": 89.42916793823242, + "epoch": 0.8872180451127819, + "grad_norm": 4.9375, + "kl": 0.39932370483875274, + "learning_rate": 4.55639097744361e-06, + "loss": 0.0422, + "num_tokens": 36153899.0, + "reward": -1.36173208206892, + "reward_std": 6.48907585144043, + "rewards/get_chromagram_reward": 0.625183516740799, + "rewards/get_chromagram_reward_std": 0.10821353197097779, + "rewards/get_intelligibility_reward": -4.691559541225433, + "rewards/get_intelligibility_reward_std": 10.373832702636719, + "rewards/get_target_len_reward": -0.01881989361718297, + "rewards/get_target_len_reward_std": 0.050335131399333474, + "step": 1180 + }, + { + "advantages": -3.0174851559650053e-07, + "advantages_std": 1.5266151547431945, + "clip_ratio": 0.0, + "completion_length": 86.88333511352539, + "epoch": 0.8947368421052632, + "grad_norm": 8.3125, + "kl": 0.2715902358293533, + "learning_rate": 4.552631578947369e-06, + "loss": 0.0298, + "num_tokens": 36461761.0, + "reward": -1.0700063236057757, + "reward_std": 6.352678012847901, + "rewards/get_chromagram_reward": 0.6274643957614898, + "rewards/get_chromagram_reward_std": 0.12471745386719704, + "rewards/get_intelligibility_reward": -3.818689227104187, + "rewards/get_intelligibility_reward_std": 10.39213514328003, + "rewards/get_target_len_reward": -0.018793891929090024, + "rewards/get_target_len_reward_std": 0.04841918870806694, + "step": 1190 + }, + { + "advantages": -2.0364921482496356e-07, + "advantages_std": 1.6584547877311706, + "clip_ratio": 0.0, + "completion_length": 85.5297637939453, + "epoch": 0.9022556390977443, + "grad_norm": 8.125, + "kl": 0.441507688164711, + "learning_rate": 4.548872180451128e-06, + "loss": 0.0474, + "num_tokens": 36763911.0, + "reward": -1.7878079771995545, + "reward_std": 7.366923475265503, + "rewards/get_chromagram_reward": 0.6185015857219696, + "rewards/get_chromagram_reward_std": 0.11396012380719185, + "rewards/get_intelligibility_reward": -5.96343092918396, + "rewards/get_intelligibility_reward_std": 11.580034923553466, + "rewards/get_target_len_reward": -0.018494250997900964, + "rewards/get_target_len_reward_std": 0.05283417291939259, + "step": 1200 + }, + { + "advantages": -6.829698726562583e-08, + "advantages_std": 1.6967475652694701, + "clip_ratio": 0.0, + "completion_length": 87.72916870117187, + "epoch": 0.9097744360902256, + "grad_norm": 6.21875, + "kl": 0.2949825465679169, + "learning_rate": 4.545112781954888e-06, + "loss": 0.0315, + "num_tokens": 37073042.0, + "reward": -1.2314562678337098, + "reward_std": 7.152263164520264, + "rewards/get_chromagram_reward": 0.630731874704361, + "rewards/get_chromagram_reward_std": 0.10836580172181129, + "rewards/get_intelligibility_reward": -4.306980383396149, + "rewards/get_intelligibility_reward_std": 11.745508003234864, + "rewards/get_target_len_reward": -0.01812002747319639, + "rewards/get_target_len_reward_std": 0.04881219994276762, + "step": 1210 + }, + { + "advantages": -4.892548526314044e-08, + "advantages_std": 1.5888686656951905, + "clip_ratio": 0.0, + "completion_length": 84.1827392578125, + "epoch": 0.9172932330827067, + "grad_norm": 6.5625, + "kl": 0.2977434679865837, + "learning_rate": 4.541353383458647e-06, + "loss": 0.0301, + "num_tokens": 37372829.0, + "reward": -1.401185193657875, + "reward_std": 6.355694580078125, + "rewards/get_chromagram_reward": 0.6275815725326538, + "rewards/get_chromagram_reward_std": 0.1175346054136753, + "rewards/get_intelligibility_reward": -4.810519421100617, + "rewards/get_intelligibility_reward_std": 10.111643552780151, + "rewards/get_target_len_reward": -0.02061743279919028, + "rewards/get_target_len_reward_std": 0.05778510309755802, + "step": 1220 + }, + { + "advantages": 5.101164489929033e-07, + "advantages_std": 1.5641892433166504, + "clip_ratio": 0.0, + "completion_length": 85.9928596496582, + "epoch": 0.924812030075188, + "grad_norm": 29.25, + "kl": 0.3739720702171326, + "learning_rate": 4.537593984962406e-06, + "loss": 0.042, + "num_tokens": 37676822.0, + "reward": -1.3860167860984802, + "reward_std": 6.516747045516968, + "rewards/get_chromagram_reward": 0.6117733120918274, + "rewards/get_chromagram_reward_std": 0.1162826582789421, + "rewards/get_intelligibility_reward": -4.744295835494995, + "rewards/get_intelligibility_reward_std": 10.425153636932373, + "rewards/get_target_len_reward": -0.025527626182883977, + "rewards/get_target_len_reward_std": 0.07178398761898279, + "step": 1230 + }, + { + "advantages": 5.960440319086047e-09, + "advantages_std": 1.5911370515823364, + "clip_ratio": 0.0, + "completion_length": 90.75774002075195, + "epoch": 0.9323308270676691, + "grad_norm": 7.4375, + "kl": 0.31194020956754687, + "learning_rate": 4.533834586466166e-06, + "loss": 0.0321, + "num_tokens": 37994117.0, + "reward": -1.3426182121038437, + "reward_std": 6.672745943069458, + "rewards/get_chromagram_reward": 0.6171811401844025, + "rewards/get_chromagram_reward_std": 0.10649881064891815, + "rewards/get_intelligibility_reward": -4.626643079519272, + "rewards/get_intelligibility_reward_std": 10.690338182449342, + "rewards/get_target_len_reward": -0.018392365891486406, + "rewards/get_target_len_reward_std": 0.049134592339396474, + "step": 1240 + }, + { + "advantages": 6.659577451273436e-07, + "advantages_std": 1.5247369408607483, + "clip_ratio": 0.0, + "completion_length": 90.20654907226563, + "epoch": 0.9398496240601504, + "grad_norm": 6.75, + "kl": 0.30897647738456724, + "learning_rate": 4.530075187969925e-06, + "loss": 0.0347, + "num_tokens": 38309634.0, + "reward": -1.3368622988462449, + "reward_std": 6.864240121841431, + "rewards/get_chromagram_reward": 0.6257738709449768, + "rewards/get_chromagram_reward_std": 0.12027077302336693, + "rewards/get_intelligibility_reward": -4.608479511737824, + "rewards/get_intelligibility_reward_std": 11.090731430053712, + "rewards/get_target_len_reward": -0.027881059050559997, + "rewards/get_target_len_reward_std": 0.07647916078567504, + "step": 1250 + }, + { + "advantages": -1.5484790054642873e-07, + "advantages_std": 1.590539848804474, + "clip_ratio": 0.0, + "completion_length": 89.33333587646484, + "epoch": 0.9473684210526315, + "grad_norm": 6.5625, + "kl": 0.3056371137499809, + "learning_rate": 4.526315789473685e-06, + "loss": 0.032, + "num_tokens": 38623372.0, + "reward": -1.5094636023044585, + "reward_std": 6.879690170288086, + "rewards/get_chromagram_reward": 0.6226464509963989, + "rewards/get_chromagram_reward_std": 0.11185586228966712, + "rewards/get_intelligibility_reward": -5.134295892715454, + "rewards/get_intelligibility_reward_std": 10.881260585784911, + "rewards/get_target_len_reward": -0.016741198487579823, + "rewards/get_target_len_reward_std": 0.04208283200860023, + "step": 1260 + }, + { + "advantages": 6.342927775904173e-07, + "advantages_std": 1.4530799746513368, + "clip_ratio": 0.0, + "completion_length": 88.98988342285156, + "epoch": 0.9548872180451128, + "grad_norm": 5.96875, + "kl": 0.370340433716774, + "learning_rate": 4.522556390977444e-06, + "loss": 0.0399, + "num_tokens": 38935033.0, + "reward": -1.4293070256710052, + "reward_std": 6.392971324920654, + "rewards/get_chromagram_reward": 0.622153103351593, + "rewards/get_chromagram_reward_std": 0.11032605618238449, + "rewards/get_intelligibility_reward": -4.890468895435333, + "rewards/get_intelligibility_reward_std": 10.122354888916016, + "rewards/get_target_len_reward": -0.019604854937642812, + "rewards/get_target_len_reward_std": 0.05110882055014372, + "step": 1270 + }, + { + "advantages": 2.257525977711339e-07, + "advantages_std": 1.5813369393348693, + "clip_ratio": 0.0, + "completion_length": 87.82321624755859, + "epoch": 0.9624060150375939, + "grad_norm": 5.6875, + "kl": 0.4109964817762375, + "learning_rate": 4.518796992481204e-06, + "loss": 0.0457, + "num_tokens": 39244958.0, + "reward": -1.6143119536340236, + "reward_std": 7.410225963592529, + "rewards/get_chromagram_reward": 0.6208023488521576, + "rewards/get_chromagram_reward_std": 0.10917952060699462, + "rewards/get_intelligibility_reward": -5.437799453735352, + "rewards/get_intelligibility_reward_std": 11.794871520996093, + "rewards/get_target_len_reward": -0.0259383799508214, + "rewards/get_target_len_reward_std": 0.07941694743931293, + "step": 1280 + }, + { + "advantages": -2.6561321639917426e-07, + "advantages_std": 1.5743435859680175, + "clip_ratio": 0.0, + "completion_length": 83.72857284545898, + "epoch": 0.9699248120300752, + "grad_norm": 7.8125, + "kl": 0.331342040002346, + "learning_rate": 4.515037593984962e-06, + "loss": 0.0327, + "num_tokens": 39544497.0, + "reward": -1.3727825999259948, + "reward_std": 6.4047469139099125, + "rewards/get_chromagram_reward": 0.6227813065052032, + "rewards/get_chromagram_reward_std": 0.11946281418204308, + "rewards/get_intelligibility_reward": -4.721085810661316, + "rewards/get_intelligibility_reward_std": 10.246169281005859, + "rewards/get_target_len_reward": -0.02004314949735999, + "rewards/get_target_len_reward_std": 0.04626058042049408, + "step": 1290 + }, + { + "advantages": 7.500250394087971e-08, + "advantages_std": 1.6308319687843322, + "clip_ratio": 0.0, + "completion_length": 90.06666946411133, + "epoch": 0.9774436090225563, + "grad_norm": 19.0, + "kl": 0.4505035996437073, + "learning_rate": 4.511278195488722e-06, + "loss": 0.0485, + "num_tokens": 39860399.0, + "reward": -1.6213070809841157, + "reward_std": 6.728913688659668, + "rewards/get_chromagram_reward": 0.619669246673584, + "rewards/get_chromagram_reward_std": 0.11839989796280861, + "rewards/get_intelligibility_reward": -5.457677006721497, + "rewards/get_intelligibility_reward_std": 10.566771411895752, + "rewards/get_target_len_reward": -0.025913165416568518, + "rewards/get_target_len_reward_std": 0.06257133968174458, + "step": 1300 + }, + { + "advantages": -6.455928271975608e-07, + "advantages_std": 1.5221191763877868, + "clip_ratio": 0.0, + "completion_length": 84.05357284545899, + "epoch": 0.9849624060150376, + "grad_norm": 6.28125, + "kl": 0.4204570382833481, + "learning_rate": 4.507518796992482e-06, + "loss": 0.0493, + "num_tokens": 40159013.0, + "reward": -1.8954948306083679, + "reward_std": 6.913206434249878, + "rewards/get_chromagram_reward": 0.6115539908409119, + "rewards/get_chromagram_reward_std": 0.1196502335369587, + "rewards/get_intelligibility_reward": -6.275245666503906, + "rewards/get_intelligibility_reward_std": 10.659251880645751, + "rewards/get_target_len_reward": -0.02279257755726576, + "rewards/get_target_len_reward_std": 0.06787131484597922, + "step": 1310 + }, + { + "advantages": 7.599592493079399e-08, + "advantages_std": 1.451578962802887, + "clip_ratio": 0.0, + "completion_length": 90.43928680419921, + "epoch": 0.9924812030075187, + "grad_norm": 6.40625, + "kl": 0.3029856622219086, + "learning_rate": 4.5037593984962405e-06, + "loss": 0.037, + "num_tokens": 40475013.0, + "reward": -1.1015370726585387, + "reward_std": 6.6790083885192875, + "rewards/get_chromagram_reward": 0.6353936314582824, + "rewards/get_chromagram_reward_std": 0.10527931228280067, + "rewards/get_intelligibility_reward": -3.917825734615326, + "rewards/get_intelligibility_reward_std": 10.991698837280273, + "rewards/get_target_len_reward": -0.022179031558334828, + "rewards/get_target_len_reward_std": 0.06528392806649208, + "step": 1320 + }, + { + "advantages": -3.712251807286293e-07, + "advantages_std": 1.5168458461761474, + "clip_ratio": 0.0, + "completion_length": 86.39321594238281, + "epoch": 1.000751879699248, + "grad_norm": 28.625, + "kl": 0.3049457028508186, + "learning_rate": 4.5e-06, + "loss": 0.0335, + "num_tokens": 40777755.0, + "reward": -1.4215580880641938, + "reward_std": 6.318771314620972, + "rewards/get_chromagram_reward": 0.601853746175766, + "rewards/get_chromagram_reward_std": 0.11767267286777497, + "rewards/get_intelligibility_reward": -4.846245145797729, + "rewards/get_intelligibility_reward_std": 10.053415679931641, + "rewards/get_target_len_reward": -0.020282691903412343, + "rewards/get_target_len_reward_std": 0.06443829238414764, + "step": 1330 + }, + { + "advantages": -1.4180938734398296e-07, + "advantages_std": 1.5976835131645202, + "clip_ratio": 0.0, + "completion_length": 89.9607147216797, + "epoch": 1.0082706766917293, + "grad_norm": 66.5, + "kl": 0.28232268542051314, + "learning_rate": 4.49624060150376e-06, + "loss": 0.0355, + "num_tokens": 41092861.0, + "reward": -1.2550855576992035, + "reward_std": 6.8124340057373045, + "rewards/get_chromagram_reward": 0.6082775354385376, + "rewards/get_chromagram_reward_std": 0.124069694429636, + "rewards/get_intelligibility_reward": -4.347317087650299, + "rewards/get_intelligibility_reward_std": 11.108979606628418, + "rewards/get_target_len_reward": -0.026216878183186056, + "rewards/get_target_len_reward_std": 0.07688224576413631, + "step": 1340 + }, + { + "advantages": -4.2368969257466913e-07, + "advantages_std": 1.5768442749977112, + "clip_ratio": 0.0, + "completion_length": 88.80714416503906, + "epoch": 1.0157894736842106, + "grad_norm": 5.78125, + "kl": 0.2955815777182579, + "learning_rate": 4.492481203007519e-06, + "loss": 0.032, + "num_tokens": 41404610.0, + "reward": -1.4556842476129532, + "reward_std": 6.5793181419372555, + "rewards/get_chromagram_reward": 0.6118521928787232, + "rewards/get_chromagram_reward_std": 0.09616614580154419, + "rewards/get_intelligibility_reward": -4.963257133960724, + "rewards/get_intelligibility_reward_std": 10.508817291259765, + "rewards/get_target_len_reward": -0.015647331532090903, + "rewards/get_target_len_reward_std": 0.041581641510128976, + "step": 1350 + }, + { + "advantages": 4.92235039928346e-07, + "advantages_std": 1.5769322037696838, + "clip_ratio": 0.0, + "completion_length": 87.67024002075195, + "epoch": 1.0233082706766918, + "grad_norm": 7.3125, + "kl": 0.2699931785464287, + "learning_rate": 4.488721804511278e-06, + "loss": 0.0272, + "num_tokens": 41713817.0, + "reward": -1.4852119833230972, + "reward_std": 6.430934286117553, + "rewards/get_chromagram_reward": 0.6303296208381652, + "rewards/get_chromagram_reward_std": 0.11175912097096444, + "rewards/get_intelligibility_reward": -5.066480994224548, + "rewards/get_intelligibility_reward_std": 10.17646369934082, + "rewards/get_target_len_reward": -0.019484441634267567, + "rewards/get_target_len_reward_std": 0.04843886476010084, + "step": 1360 + }, + { + "advantages": 7.37359146540939e-07, + "advantages_std": 1.5006843090057373, + "clip_ratio": 0.0, + "completion_length": 89.53869247436523, + "epoch": 1.0308270676691729, + "grad_norm": 6.3125, + "kl": 0.2454486146569252, + "learning_rate": 4.484962406015038e-06, + "loss": 0.0292, + "num_tokens": 42028223.0, + "reward": -1.110092930495739, + "reward_std": 6.495879364013672, + "rewards/get_chromagram_reward": 0.6152016639709472, + "rewards/get_chromagram_reward_std": 0.11499225050210952, + "rewards/get_intelligibility_reward": -3.9230158805847166, + "rewards/get_intelligibility_reward_std": 10.612717962265014, + "rewards/get_target_len_reward": -0.022464485326781868, + "rewards/get_target_len_reward_std": 0.06751915938220918, + "step": 1370 + }, + { + "advantages": 7.003545921868693e-07, + "advantages_std": 1.6903245568275451, + "clip_ratio": 0.0, + "completion_length": 86.36190643310547, + "epoch": 1.0383458646616541, + "grad_norm": 14.3125, + "kl": 0.3273655205965042, + "learning_rate": 4.4812030075187975e-06, + "loss": 0.0377, + "num_tokens": 42333747.0, + "reward": -1.5490575909614563, + "reward_std": 6.348045444488525, + "rewards/get_chromagram_reward": 0.6144465744495392, + "rewards/get_chromagram_reward_std": 0.11609260141849517, + "rewards/get_intelligibility_reward": -5.239705562591553, + "rewards/get_intelligibility_reward_std": 9.924682903289796, + "rewards/get_target_len_reward": -0.021913580782711505, + "rewards/get_target_len_reward_std": 0.07249412853270769, + "step": 1380 + }, + { + "advantages": 4.435579057826544e-07, + "advantages_std": 1.6824481248855592, + "clip_ratio": 0.0, + "completion_length": 85.4571434020996, + "epoch": 1.0458646616541354, + "grad_norm": 24.0, + "kl": 0.31759440451860427, + "learning_rate": 4.477443609022556e-06, + "loss": 0.0326, + "num_tokens": 42636500.0, + "reward": -1.533106380701065, + "reward_std": 6.635279893875122, + "rewards/get_chromagram_reward": 0.6223339438438416, + "rewards/get_chromagram_reward_std": 0.12457952573895455, + "rewards/get_intelligibility_reward": -5.1997171401977536, + "rewards/get_intelligibility_reward_std": 10.557915115356446, + "rewards/get_target_len_reward": -0.021935593243688344, + "rewards/get_target_len_reward_std": 0.04879705365747213, + "step": 1390 + }, + { + "advantages": -8.630257042341327e-08, + "advantages_std": 1.5478406071662902, + "clip_ratio": 0.0, + "completion_length": 88.22261962890624, + "epoch": 1.0533834586466166, + "grad_norm": 11.625, + "kl": 0.35201059728860856, + "learning_rate": 4.473684210526316e-06, + "loss": 0.0351, + "num_tokens": 42947076.0, + "reward": -1.0654352620244025, + "reward_std": 6.328466606140137, + "rewards/get_chromagram_reward": 0.6265157759189606, + "rewards/get_chromagram_reward_std": 0.1030153326690197, + "rewards/get_intelligibility_reward": -3.8062858104705812, + "rewards/get_intelligibility_reward_std": 10.39826946258545, + "rewards/get_target_len_reward": -0.016535515151917934, + "rewards/get_target_len_reward_std": 0.04152263272553682, + "step": 1400 + }, + { + "advantages": 5.507220883771424e-07, + "advantages_std": 1.6665414214134215, + "clip_ratio": 0.0, + "completion_length": 88.79345397949218, + "epoch": 1.0609022556390977, + "grad_norm": 108.5, + "kl": 0.3712936282157898, + "learning_rate": 4.469924812030076e-06, + "loss": 0.0386, + "num_tokens": 43258728.0, + "reward": -1.7698104798793792, + "reward_std": 6.952770853042603, + "rewards/get_chromagram_reward": 0.6143802165985107, + "rewards/get_chromagram_reward_std": 0.10771603286266326, + "rewards/get_intelligibility_reward": -5.902230358123779, + "rewards/get_intelligibility_reward_std": 10.760121154785157, + "rewards/get_target_len_reward": -0.02158096982166171, + "rewards/get_target_len_reward_std": 0.06000328604131937, + "step": 1410 + }, + { + "advantages": -8.791684713571613e-08, + "advantages_std": 1.59330712556839, + "clip_ratio": 0.0, + "completion_length": 89.35000152587891, + "epoch": 1.068421052631579, + "grad_norm": 21.625, + "kl": 0.29032159596681595, + "learning_rate": 4.466165413533835e-06, + "loss": 0.0295, + "num_tokens": 43572786.0, + "reward": -1.209948765486479, + "reward_std": 6.531015062332154, + "rewards/get_chromagram_reward": 0.6338232755661011, + "rewards/get_chromagram_reward_std": 0.10927505195140838, + "rewards/get_intelligibility_reward": -4.243760868906975, + "rewards/get_intelligibility_reward_std": 10.54481372833252, + "rewards/get_target_len_reward": -0.01990845762193203, + "rewards/get_target_len_reward_std": 0.05181889459490776, + "step": 1420 + }, + { + "advantages": -3.5638611493027386e-07, + "advantages_std": 1.6873063921928406, + "clip_ratio": 0.0, + "completion_length": 87.25119171142578, + "epoch": 1.0759398496240602, + "grad_norm": 5.875, + "kl": 0.3652739107608795, + "learning_rate": 4.462406015037594e-06, + "loss": 0.0441, + "num_tokens": 43880015.0, + "reward": -1.794026893377304, + "reward_std": 6.88950924873352, + "rewards/get_chromagram_reward": 0.6147673070430756, + "rewards/get_chromagram_reward_std": 0.1085585631430149, + "rewards/get_intelligibility_reward": -5.975696587562561, + "rewards/get_intelligibility_reward_std": 10.655347537994384, + "rewards/get_target_len_reward": -0.02115098023787141, + "rewards/get_target_len_reward_std": 0.06407887656241655, + "step": 1430 + }, + { + "advantages": -2.2724272383811694e-07, + "advantages_std": 1.597934901714325, + "clip_ratio": 0.0, + "completion_length": 85.1458351135254, + "epoch": 1.0834586466165415, + "grad_norm": 7.78125, + "kl": 0.2844631150364876, + "learning_rate": 4.458646616541354e-06, + "loss": 0.034, + "num_tokens": 44181556.0, + "reward": -1.610038973391056, + "reward_std": 7.0278373718261715, + "rewards/get_chromagram_reward": 0.6216884851455688, + "rewards/get_chromagram_reward_std": 0.10702887326478958, + "rewards/get_intelligibility_reward": -5.433124041557312, + "rewards/get_intelligibility_reward_std": 11.143450927734374, + "rewards/get_target_len_reward": -0.018680935073643923, + "rewards/get_target_len_reward_std": 0.052847124822437766, + "step": 1440 + }, + { + "advantages": -2.371768186293366e-07, + "advantages_std": 1.6119914293289184, + "clip_ratio": 0.0, + "completion_length": 91.92738342285156, + "epoch": 1.0909774436090225, + "grad_norm": 6.625, + "kl": 0.25501908659934996, + "learning_rate": 4.454887218045113e-06, + "loss": 0.0245, + "num_tokens": 44501395.0, + "reward": -1.4425207868218421, + "reward_std": 7.0387735843658445, + "rewards/get_chromagram_reward": 0.6270141720771789, + "rewards/get_chromagram_reward_std": 0.10027562379837036, + "rewards/get_intelligibility_reward": -4.93934919834137, + "rewards/get_intelligibility_reward_std": 11.302708530426026, + "rewards/get_target_len_reward": -0.015226956270635129, + "rewards/get_target_len_reward_std": 0.041442089900374415, + "step": 1450 + }, + { + "advantages": 4.912416148528109e-07, + "advantages_std": 1.5803248286247253, + "clip_ratio": 0.0, + "completion_length": 90.03154907226562, + "epoch": 1.0984962406015037, + "grad_norm": 16.5, + "kl": 0.2846480205655098, + "learning_rate": 4.451127819548873e-06, + "loss": 0.034, + "num_tokens": 44817065.0, + "reward": -1.4160616666078567, + "reward_std": 6.803660202026367, + "rewards/get_chromagram_reward": 0.6224904239177704, + "rewards/get_chromagram_reward_std": 0.10744070336222648, + "rewards/get_intelligibility_reward": -4.849036240577698, + "rewards/get_intelligibility_reward_std": 10.839382362365722, + "rewards/get_target_len_reward": -0.021639053942635655, + "rewards/get_target_len_reward_std": 0.05841308189556003, + "step": 1460 + }, + { + "advantages": 4.967053882865002e-07, + "advantages_std": 1.580376970767975, + "clip_ratio": 0.0, + "completion_length": 87.665478515625, + "epoch": 1.106015037593985, + "grad_norm": 90.5, + "kl": 0.29044296592473984, + "learning_rate": 4.447368421052632e-06, + "loss": 0.0303, + "num_tokens": 45125318.0, + "reward": -1.5038848042488098, + "reward_std": 6.9014417171478275, + "rewards/get_chromagram_reward": 0.6155295431613922, + "rewards/get_chromagram_reward_std": 0.11622778475284576, + "rewards/get_intelligibility_reward": -5.104975247383118, + "rewards/get_intelligibility_reward_std": 11.041919040679932, + "rewards/get_target_len_reward": -0.022208488639444114, + "rewards/get_target_len_reward_std": 0.06804153546690941, + "step": 1470 + }, + { + "advantages": -2.2227563434285004e-08, + "advantages_std": 1.5115014910697937, + "clip_ratio": 0.0, + "completion_length": 87.7130958557129, + "epoch": 1.1135338345864663, + "grad_norm": 6.0625, + "kl": 0.5705481573939324, + "learning_rate": 4.443609022556391e-06, + "loss": 0.0599, + "num_tokens": 45434347.0, + "reward": -1.292085385322571, + "reward_std": 6.833794784545899, + "rewards/get_chromagram_reward": 0.6335037827491761, + "rewards/get_chromagram_reward_std": 0.11769273206591606, + "rewards/get_intelligibility_reward": -4.487040567398071, + "rewards/get_intelligibility_reward_std": 11.127706336975098, + "rewards/get_target_len_reward": -0.02271918151527643, + "rewards/get_target_len_reward_std": 0.06205502189695835, + "step": 1480 + }, + { + "advantages": 2.8560555165313416e-08, + "advantages_std": 1.6912749409675598, + "clip_ratio": 0.0, + "completion_length": 90.43214569091796, + "epoch": 1.1210526315789473, + "grad_norm": 8.75, + "kl": 0.39698506742715833, + "learning_rate": 4.439849624060151e-06, + "loss": 0.0435, + "num_tokens": 45750722.0, + "reward": -1.3910152643918992, + "reward_std": 6.621922302246094, + "rewards/get_chromagram_reward": 0.6196567595005036, + "rewards/get_chromagram_reward_std": 0.10777244716882706, + "rewards/get_intelligibility_reward": -4.771567785739899, + "rewards/get_intelligibility_reward_std": 10.633524322509766, + "rewards/get_target_len_reward": -0.021134493965655567, + "rewards/get_target_len_reward_std": 0.06209372207522392, + "step": 1490 + }, + { + "advantages": -5.6872755749282077e-08, + "advantages_std": 1.4580247700214386, + "clip_ratio": 0.0, + "completion_length": 88.9755958557129, + "epoch": 1.1285714285714286, + "grad_norm": 5.03125, + "kl": 0.28813485354185103, + "learning_rate": 4.43609022556391e-06, + "loss": 0.0355, + "num_tokens": 46063025.0, + "reward": -1.2929147403687238, + "reward_std": 6.570118951797485, + "rewards/get_chromagram_reward": 0.6155599892139435, + "rewards/get_chromagram_reward_std": 0.11506677493453026, + "rewards/get_intelligibility_reward": -4.472536733746528, + "rewards/get_intelligibility_reward_std": 10.563865470886231, + "rewards/get_target_len_reward": -0.021767213568091394, + "rewards/get_target_len_reward_std": 0.06660667713731527, + "step": 1500 + }, + { + "advantages": 2.9082100407862297e-07, + "advantages_std": 1.6153509140014648, + "clip_ratio": 0.0, + "completion_length": 87.2125015258789, + "epoch": 1.1360902255639098, + "grad_norm": 14.5, + "kl": 0.3224494606256485, + "learning_rate": 4.4323308270676695e-06, + "loss": 0.0343, + "num_tokens": 46370373.0, + "reward": -1.3674847215414048, + "reward_std": 6.626483583450318, + "rewards/get_chromagram_reward": 0.6133666872978211, + "rewards/get_chromagram_reward_std": 0.10720290318131447, + "rewards/get_intelligibility_reward": -4.692685705423355, + "rewards/get_intelligibility_reward_std": 10.596193408966064, + "rewards/get_target_len_reward": -0.023134994506835937, + "rewards/get_target_len_reward_std": 0.07376478314399719, + "step": 1510 + }, + { + "advantages": -1.9694368376121928e-07, + "advantages_std": 1.6327782154083252, + "clip_ratio": 0.0, + "completion_length": 86.3755973815918, + "epoch": 1.143609022556391, + "grad_norm": 7.1875, + "kl": 0.35077326446771623, + "learning_rate": 4.428571428571429e-06, + "loss": 0.0396, + "num_tokens": 46675165.0, + "reward": -1.8090145349502564, + "reward_std": 6.713898372650147, + "rewards/get_chromagram_reward": 0.6248087406158447, + "rewards/get_chromagram_reward_std": 0.11518047973513604, + "rewards/get_intelligibility_reward": -6.0250754117965695, + "rewards/get_intelligibility_reward_std": 10.370805311203004, + "rewards/get_target_len_reward": -0.02677653534337878, + "rewards/get_target_len_reward_std": 0.07797287553548812, + "step": 1520 + }, + { + "advantages": -1.0505319445464067e-07, + "advantages_std": 1.6318493604660034, + "clip_ratio": 0.0, + "completion_length": 87.5809539794922, + "epoch": 1.151127819548872, + "grad_norm": 8.375, + "kl": 0.3616947069764137, + "learning_rate": 4.424812030075189e-06, + "loss": 0.0394, + "num_tokens": 46983757.0, + "reward": -1.3312961548566817, + "reward_std": 6.356854820251465, + "rewards/get_chromagram_reward": 0.6233540952205658, + "rewards/get_chromagram_reward_std": 0.12999609112739563, + "rewards/get_intelligibility_reward": -4.593122959136963, + "rewards/get_intelligibility_reward_std": 10.18731756210327, + "rewards/get_target_len_reward": -0.024119340069592, + "rewards/get_target_len_reward_std": 0.06394902095198632, + "step": 1530 + }, + { + "advantages": -1.4652809312565296e-07, + "advantages_std": 1.5422507643699646, + "clip_ratio": 0.0, + "completion_length": 85.48928756713867, + "epoch": 1.1586466165413534, + "grad_norm": 12.5625, + "kl": 0.3347591429948807, + "learning_rate": 4.4210526315789476e-06, + "loss": 0.0322, + "num_tokens": 47286357.0, + "reward": -1.4546345457434655, + "reward_std": 6.773113775253296, + "rewards/get_chromagram_reward": 0.6325708985328674, + "rewards/get_chromagram_reward_std": 0.10721831172704696, + "rewards/get_intelligibility_reward": -4.977900385856628, + "rewards/get_intelligibility_reward_std": 10.788327884674072, + "rewards/get_target_len_reward": -0.018573809042572977, + "rewards/get_target_len_reward_std": 0.04435503650456667, + "step": 1540 + }, + { + "advantages": 5.419055707278631e-07, + "advantages_std": 1.6118939757347106, + "clip_ratio": 0.0, + "completion_length": 86.31428756713868, + "epoch": 1.1661654135338346, + "grad_norm": 6.125, + "kl": 1.34088137447834, + "learning_rate": 4.417293233082707e-06, + "loss": 0.1428, + "num_tokens": 47590543.0, + "reward": -1.5086460947990417, + "reward_std": 6.8800328254699705, + "rewards/get_chromagram_reward": 0.6266894578933716, + "rewards/get_chromagram_reward_std": 0.11288965046405793, + "rewards/get_intelligibility_reward": -5.127788019180298, + "rewards/get_intelligibility_reward_std": 10.920746994018554, + "rewards/get_target_len_reward": -0.024839654657989742, + "rewards/get_target_len_reward_std": 0.08086240235716105, + "step": 1550 + }, + { + "advantages": -4.6330195999644277e-07, + "advantages_std": 1.6480433940887451, + "clip_ratio": 0.0, + "completion_length": 85.3125015258789, + "epoch": 1.1736842105263159, + "grad_norm": 9.5625, + "kl": 0.3124460786581039, + "learning_rate": 4.413533834586467e-06, + "loss": 0.0367, + "num_tokens": 47892098.0, + "reward": -1.5736109614372253, + "reward_std": 6.557619524002075, + "rewards/get_chromagram_reward": 0.6038493335247039, + "rewards/get_chromagram_reward_std": 0.1160405620932579, + "rewards/get_intelligibility_reward": -5.302905559539795, + "rewards/get_intelligibility_reward_std": 10.392185306549072, + "rewards/get_target_len_reward": -0.021776399575173855, + "rewards/get_target_len_reward_std": 0.0646151814609766, + "step": 1560 + }, + { + "advantages": 5.87354101355686e-08, + "advantages_std": 1.4956356883049011, + "clip_ratio": 0.0, + "completion_length": 87.60774154663086, + "epoch": 1.181203007518797, + "grad_norm": 5.6875, + "kl": 0.3692999482154846, + "learning_rate": 4.4097744360902265e-06, + "loss": 0.0477, + "num_tokens": 48200190.0, + "reward": -1.1628334634006023, + "reward_std": 6.734441566467285, + "rewards/get_chromagram_reward": 0.6324356317520141, + "rewards/get_chromagram_reward_std": 0.10941554978489876, + "rewards/get_intelligibility_reward": -4.098512363433838, + "rewards/get_intelligibility_reward_std": 10.982434701919555, + "rewards/get_target_len_reward": -0.022423355374485254, + "rewards/get_target_len_reward_std": 0.06325810812413693, + "step": 1570 + }, + { + "advantages": -1.0244549315530094e-07, + "advantages_std": 1.626529288291931, + "clip_ratio": 0.0, + "completion_length": 88.18154907226562, + "epoch": 1.1887218045112782, + "grad_norm": 6.34375, + "kl": 0.9706477746367455, + "learning_rate": 4.406015037593985e-06, + "loss": 0.1031, + "num_tokens": 48509733.0, + "reward": -1.7460247814655303, + "reward_std": 6.996130752563476, + "rewards/get_chromagram_reward": 0.6319741845130921, + "rewards/get_chromagram_reward_std": 0.11034553200006485, + "rewards/get_intelligibility_reward": -5.849562883377075, + "rewards/get_intelligibility_reward_std": 10.926602697372436, + "rewards/get_target_len_reward": -0.020485294051468372, + "rewards/get_target_len_reward_std": 0.05834382399916649, + "step": 1580 + }, + { + "advantages": -1.4839073099182086e-07, + "advantages_std": 1.5476613879203795, + "clip_ratio": 0.0, + "completion_length": 88.67678909301758, + "epoch": 1.1962406015037594, + "grad_norm": 6.4375, + "kl": 0.27188325226306914, + "learning_rate": 4.402255639097744e-06, + "loss": 0.0276, + "num_tokens": 48821267.0, + "reward": -1.2882464125752449, + "reward_std": 6.523538208007812, + "rewards/get_chromagram_reward": 0.6279970049858093, + "rewards/get_chromagram_reward_std": 0.09582380726933479, + "rewards/get_intelligibility_reward": -4.473076581954956, + "rewards/get_intelligibility_reward_std": 10.490189599990845, + "rewards/get_target_len_reward": -0.019659423362463714, + "rewards/get_target_len_reward_std": 0.05062466654926538, + "step": 1590 + }, + { + "advantages": 6.534159325610745e-07, + "advantages_std": 1.647250759601593, + "clip_ratio": 0.0, + "completion_length": 88.38154983520508, + "epoch": 1.2037593984962407, + "grad_norm": 27.5, + "kl": 0.2886976793408394, + "learning_rate": 4.398496240601504e-06, + "loss": 0.0329, + "num_tokens": 49132692.0, + "reward": -1.3682509139180183, + "reward_std": 6.9248096466064455, + "rewards/get_chromagram_reward": 0.6313087105751037, + "rewards/get_chromagram_reward_std": 0.11718138679862022, + "rewards/get_intelligibility_reward": -4.710203987360001, + "rewards/get_intelligibility_reward_std": 11.157450008392335, + "rewards/get_target_len_reward": -0.025857241172343493, + "rewards/get_target_len_reward_std": 0.07822401337325573, + "step": 1600 + }, + { + "advantages": 1.373390333014868e-07, + "advantages_std": 1.6475409626960755, + "clip_ratio": 0.0, + "completion_length": 83.82619094848633, + "epoch": 1.2112781954887217, + "grad_norm": 5.53125, + "kl": 0.3448040962219238, + "learning_rate": 4.394736842105263e-06, + "loss": 0.0403, + "num_tokens": 49431312.0, + "reward": -1.8660083770751954, + "reward_std": 7.143636417388916, + "rewards/get_chromagram_reward": 0.6275706827640534, + "rewards/get_chromagram_reward_std": 0.11524273306131363, + "rewards/get_intelligibility_reward": -6.202886414527893, + "rewards/get_intelligibility_reward_std": 11.145586681365966, + "rewards/get_target_len_reward": -0.02270910535007715, + "rewards/get_target_len_reward_std": 0.07161459308117628, + "step": 1610 + }, + { + "advantages": 3.9339067661181557e-07, + "advantages_std": 1.577373206615448, + "clip_ratio": 0.0, + "completion_length": 88.20535736083984, + "epoch": 1.218796992481203, + "grad_norm": 740.0, + "kl": 0.4538608729839325, + "learning_rate": 4.390977443609023e-06, + "loss": 0.0523, + "num_tokens": 49741967.0, + "reward": -1.2801022872328758, + "reward_std": 6.292341852188111, + "rewards/get_chromagram_reward": 0.6265273749828338, + "rewards/get_chromagram_reward_std": 0.10703437700867653, + "rewards/get_intelligibility_reward": -4.4441596299409865, + "rewards/get_intelligibility_reward_std": 10.096945667266846, + "rewards/get_target_len_reward": -0.022674433421343565, + "rewards/get_target_len_reward_std": 0.06971333101391793, + "step": 1620 + }, + { + "advantages": -3.42975042855187e-07, + "advantages_std": 1.5779038429260255, + "clip_ratio": 0.0, + "completion_length": 89.30595397949219, + "epoch": 1.2263157894736842, + "grad_norm": 5.375, + "kl": 38.55426201820374, + "learning_rate": 4.387218045112782e-06, + "loss": 3.8586, + "num_tokens": 50056039.0, + "reward": -1.3629911191761495, + "reward_std": 6.501353454589844, + "rewards/get_chromagram_reward": 0.6287332653999329, + "rewards/get_chromagram_reward_std": 0.11263178661465645, + "rewards/get_intelligibility_reward": -4.694042664766312, + "rewards/get_intelligibility_reward_std": 10.31979284286499, + "rewards/get_target_len_reward": -0.02366358144208789, + "rewards/get_target_len_reward_std": 0.06714603845030069, + "step": 1630 + }, + { + "advantages": -4.142522893602063e-07, + "advantages_std": 1.6763808131217957, + "clip_ratio": 0.0, + "completion_length": 87.64404830932617, + "epoch": 1.2338345864661655, + "grad_norm": 28.5, + "kl": 0.3203793570399284, + "learning_rate": 4.3834586466165415e-06, + "loss": 0.0324, + "num_tokens": 50364992.0, + "reward": -1.573092085123062, + "reward_std": 6.357421112060547, + "rewards/get_chromagram_reward": 0.6227325141429901, + "rewards/get_chromagram_reward_std": 0.11975778564810753, + "rewards/get_intelligibility_reward": -5.323095595836639, + "rewards/get_intelligibility_reward_std": 9.898041820526123, + "rewards/get_target_len_reward": -0.018912956397980452, + "rewards/get_target_len_reward_std": 0.04353561829775572, + "step": 1640 + }, + { + "advantages": 2.920627608204995e-07, + "advantages_std": 1.6491339564323426, + "clip_ratio": 0.0, + "completion_length": 88.24226303100586, + "epoch": 1.2413533834586465, + "grad_norm": 8.9375, + "kl": 2234.8600372612477, + "learning_rate": 4.379699248120301e-06, + "loss": 223.4912, + "num_tokens": 50675660.0, + "reward": -1.3432014167308808, + "reward_std": 6.825479030609131, + "rewards/get_chromagram_reward": 0.611679208278656, + "rewards/get_chromagram_reward_std": 0.11697395518422127, + "rewards/get_intelligibility_reward": -4.619905805587768, + "rewards/get_intelligibility_reward_std": 11.043926239013672, + "rewards/get_target_len_reward": -0.02137720864266157, + "rewards/get_target_len_reward_std": 0.06526100691407918, + "step": 1650 + }, + { + "advantages": 4.4430297307940236e-07, + "advantages_std": 1.5631268739700317, + "clip_ratio": 0.0, + "completion_length": 85.99643020629883, + "epoch": 1.2488721804511278, + "grad_norm": 10.3125, + "kl": 0.41894365549087526, + "learning_rate": 4.375939849624061e-06, + "loss": 0.0469, + "num_tokens": 50979381.0, + "reward": -2.0171368598937987, + "reward_std": 7.4840082168579105, + "rewards/get_chromagram_reward": 0.6187068104743958, + "rewards/get_chromagram_reward_std": 0.1058721587061882, + "rewards/get_intelligibility_reward": -6.645796608924866, + "rewards/get_intelligibility_reward_std": 11.541616916656494, + "rewards/get_target_len_reward": -0.024320135079324245, + "rewards/get_target_len_reward_std": 0.08446543496102095, + "step": 1660 + }, + { + "advantages": -7.599592279916579e-08, + "advantages_std": 1.6220983147621155, + "clip_ratio": 0.0, + "completion_length": 85.78274078369141, + "epoch": 1.256390977443609, + "grad_norm": 9.4375, + "kl": 0.3555022940039635, + "learning_rate": 4.3721804511278196e-06, + "loss": 0.0473, + "num_tokens": 51283103.0, + "reward": -1.3157795041799545, + "reward_std": 6.486531400680542, + "rewards/get_chromagram_reward": 0.6245897948741913, + "rewards/get_chromagram_reward_std": 0.12780793830752374, + "rewards/get_intelligibility_reward": -4.5451841205358505, + "rewards/get_intelligibility_reward_std": 10.350716400146485, + "rewards/get_target_len_reward": -0.02674414971843362, + "rewards/get_target_len_reward_std": 0.08355995900928974, + "step": 1670 + }, + { + "advantages": -1.4441710263213282e-07, + "advantages_std": 1.6307815790176392, + "clip_ratio": 0.0, + "completion_length": 90.22500228881836, + "epoch": 1.2639097744360903, + "grad_norm": 11.75, + "kl": 0.5674268335103989, + "learning_rate": 4.368421052631579e-06, + "loss": 0.0603, + "num_tokens": 51599142.0, + "reward": -1.4558716148138047, + "reward_std": 6.707897043228149, + "rewards/get_chromagram_reward": 0.6259454905986785, + "rewards/get_chromagram_reward_std": 0.10946089550852775, + "rewards/get_intelligibility_reward": -4.973546826839447, + "rewards/get_intelligibility_reward_std": 10.674462413787841, + "rewards/get_target_len_reward": -0.020013115461915733, + "rewards/get_target_len_reward_std": 0.060001314245164396, + "step": 1680 + }, + { + "advantages": 5.508462791681268e-07, + "advantages_std": 1.4188524723052978, + "clip_ratio": 0.0, + "completion_length": 82.01190643310547, + "epoch": 1.2714285714285714, + "grad_norm": 15.125, + "kl": 0.39633190631866455, + "learning_rate": 4.364661654135339e-06, + "loss": 0.0446, + "num_tokens": 51892605.0, + "reward": -1.8866865515708924, + "reward_std": 6.593513154983521, + "rewards/get_chromagram_reward": 0.6106093227863312, + "rewards/get_chromagram_reward_std": 0.11626130864024162, + "rewards/get_intelligibility_reward": -6.248795795440674, + "rewards/get_intelligibility_reward_std": 9.956018733978272, + "rewards/get_target_len_reward": -0.02187281660735607, + "rewards/get_target_len_reward_std": 0.060151894204318525, + "step": 1690 + }, + { + "advantages": -1.322478150100892e-07, + "advantages_std": 1.627256417274475, + "clip_ratio": 0.0, + "completion_length": 88.05714569091796, + "epoch": 1.2789473684210526, + "grad_norm": 7.53125, + "kl": 0.38881611078977585, + "learning_rate": 4.360902255639098e-06, + "loss": 0.0482, + "num_tokens": 52202844.0, + "reward": -1.630874615907669, + "reward_std": 7.069363403320312, + "rewards/get_chromagram_reward": 0.6299600839614868, + "rewards/get_chromagram_reward_std": 0.12561212480068207, + "rewards/get_intelligibility_reward": -5.494401431083679, + "rewards/get_intelligibility_reward_std": 11.227049160003663, + "rewards/get_target_len_reward": -0.028182223346084355, + "rewards/get_target_len_reward_std": 0.08260326832532883, + "step": 1700 + }, + { + "advantages": 3.118688951531112e-07, + "advantages_std": 1.5345268487930297, + "clip_ratio": 0.0, + "completion_length": 87.63631210327148, + "epoch": 1.2864661654135339, + "grad_norm": 5.9375, + "kl": 0.3392829239368439, + "learning_rate": 4.357142857142857e-06, + "loss": 0.0339, + "num_tokens": 52511465.0, + "reward": -1.732570117712021, + "reward_std": 6.669246482849121, + "rewards/get_chromagram_reward": 0.6242092669010162, + "rewards/get_chromagram_reward_std": 0.11928762272000312, + "rewards/get_intelligibility_reward": -5.80348813533783, + "rewards/get_intelligibility_reward_std": 10.308567714691161, + "rewards/get_target_len_reward": -0.018431250657886266, + "rewards/get_target_len_reward_std": 0.04403619281947613, + "step": 1710 + }, + { + "advantages": -3.8954119361278574e-07, + "advantages_std": 1.5930951476097106, + "clip_ratio": 0.0, + "completion_length": 84.03095474243165, + "epoch": 1.293984962406015, + "grad_norm": 7.96875, + "kl": 0.3179944708943367, + "learning_rate": 4.353383458646617e-06, + "loss": 0.0405, + "num_tokens": 52810252.0, + "reward": -1.6103318095207215, + "reward_std": 7.0111226558685305, + "rewards/get_chromagram_reward": 0.6270187616348266, + "rewards/get_chromagram_reward_std": 0.12597814574837685, + "rewards/get_intelligibility_reward": -5.428212428092957, + "rewards/get_intelligibility_reward_std": 11.129766941070557, + "rewards/get_target_len_reward": -0.029801409970968962, + "rewards/get_target_len_reward_std": 0.08510931301862001, + "step": 1720 + }, + { + "advantages": 5.165736070011917e-08, + "advantages_std": 1.530632495880127, + "clip_ratio": 0.0, + "completion_length": 86.00535888671875, + "epoch": 1.3015037593984962, + "grad_norm": 7.75, + "kl": 0.5288962870836258, + "learning_rate": 4.349624060150377e-06, + "loss": 0.0588, + "num_tokens": 53114288.0, + "reward": -1.5636741399765015, + "reward_std": 6.584192562103271, + "rewards/get_chromagram_reward": 0.6242028534412384, + "rewards/get_chromagram_reward_std": 0.10919138565659522, + "rewards/get_intelligibility_reward": -5.293649542331695, + "rewards/get_intelligibility_reward_std": 10.293786716461181, + "rewards/get_target_len_reward": -0.021575375087559225, + "rewards/get_target_len_reward_std": 0.059037490375339986, + "step": 1730 + }, + { + "advantages": -4.000341135679264e-07, + "advantages_std": 1.519398820400238, + "clip_ratio": 0.0, + "completion_length": 86.75714416503907, + "epoch": 1.3090225563909774, + "grad_norm": 13.0625, + "kl": 0.32477793991565707, + "learning_rate": 4.345864661654135e-06, + "loss": 0.035, + "num_tokens": 53420372.0, + "reward": -1.5456707239151002, + "reward_std": 6.6155702590942385, + "rewards/get_chromagram_reward": 0.6140604197978974, + "rewards/get_chromagram_reward_std": 0.11171316578984261, + "rewards/get_intelligibility_reward": -5.232801699638367, + "rewards/get_intelligibility_reward_std": 10.395722389221191, + "rewards/get_target_len_reward": -0.01827064696699381, + "rewards/get_target_len_reward_std": 0.052512189373373985, + "step": 1740 + }, + { + "advantages": -5.935629210362947e-08, + "advantages_std": 1.6254116296768188, + "clip_ratio": 0.0, + "completion_length": 86.40178680419922, + "epoch": 1.3165413533834587, + "grad_norm": 20.25, + "kl": 0.32791008800268173, + "learning_rate": 4.342105263157895e-06, + "loss": 0.0382, + "num_tokens": 53725544.0, + "reward": -1.2662768244743348, + "reward_std": 6.619711637496948, + "rewards/get_chromagram_reward": 0.6357514679431915, + "rewards/get_chromagram_reward_std": 0.11389932408928871, + "rewards/get_intelligibility_reward": -4.411121499538422, + "rewards/get_intelligibility_reward_std": 10.643956756591797, + "rewards/get_target_len_reward": -0.0234602483920753, + "rewards/get_target_len_reward_std": 0.06484440937638283, + "step": 1750 + }, + { + "advantages": -3.2583874087777076e-07, + "advantages_std": 1.6038388013839722, + "clip_ratio": 0.0, + "completion_length": 84.06904830932618, + "epoch": 1.32406015037594, + "grad_norm": 24.0, + "kl": 0.364103789627552, + "learning_rate": 4.338345864661655e-06, + "loss": 0.0407, + "num_tokens": 54024535.0, + "reward": -1.9007295727729798, + "reward_std": 6.875812959671021, + "rewards/get_chromagram_reward": 0.627848082780838, + "rewards/get_chromagram_reward_std": 0.1251745492219925, + "rewards/get_intelligibility_reward": -6.303326368331909, + "rewards/get_intelligibility_reward_std": 10.511895561218262, + "rewards/get_target_len_reward": -0.02671011108905077, + "rewards/get_target_len_reward_std": 0.07524531930685044, + "step": 1760 + }, + { + "advantages": -3.568828191191642e-07, + "advantages_std": 1.6275288105010985, + "clip_ratio": 0.0, + "completion_length": 85.51726379394532, + "epoch": 1.331578947368421, + "grad_norm": 13.75, + "kl": 26.64915532618761, + "learning_rate": 4.334586466165414e-06, + "loss": 2.6748, + "num_tokens": 54326894.0, + "reward": -1.6861646354198456, + "reward_std": 6.675696802139282, + "rewards/get_chromagram_reward": 0.6133986115455627, + "rewards/get_chromagram_reward_std": 0.11093844547867775, + "rewards/get_intelligibility_reward": -5.648205804824829, + "rewards/get_intelligibility_reward_std": 10.412496376037598, + "rewards/get_target_len_reward": -0.02368657514452934, + "rewards/get_target_len_reward_std": 0.08231094852089882, + "step": 1770 + }, + { + "advantages": 1.353521980718142e-07, + "advantages_std": 1.505396318435669, + "clip_ratio": 0.0, + "completion_length": 84.86666793823242, + "epoch": 1.3390977443609022, + "grad_norm": 6.90625, + "kl": 0.30498138815164566, + "learning_rate": 4.330827067669173e-06, + "loss": 0.0313, + "num_tokens": 54628689.0, + "reward": -1.3113522812724114, + "reward_std": 6.501615858078003, + "rewards/get_chromagram_reward": 0.621231734752655, + "rewards/get_chromagram_reward_std": 0.11187814921140671, + "rewards/get_intelligibility_reward": -4.536672675609589, + "rewards/get_intelligibility_reward_std": 10.441095304489135, + "rewards/get_target_len_reward": -0.01861567758023739, + "rewards/get_target_len_reward_std": 0.047264106944203375, + "step": 1780 + }, + { + "advantages": -3.361453636330225e-07, + "advantages_std": 1.6669383287429809, + "clip_ratio": 0.0, + "completion_length": 86.45059738159179, + "epoch": 1.3466165413533835, + "grad_norm": 176.0, + "kl": 0.39939187467098236, + "learning_rate": 4.327067669172933e-06, + "loss": 0.0483, + "num_tokens": 54934200.0, + "reward": -1.4774677753448486, + "reward_std": 6.3739540576934814, + "rewards/get_chromagram_reward": 0.6193186163902282, + "rewards/get_chromagram_reward_std": 0.12128224372863769, + "rewards/get_intelligibility_reward": -5.026852607727051, + "rewards/get_intelligibility_reward_std": 10.082926750183105, + "rewards/get_target_len_reward": -0.024869086034595967, + "rewards/get_target_len_reward_std": 0.07926335744559765, + "step": 1790 + }, + { + "advantages": -4.2666993449458346e-07, + "advantages_std": 1.6381709456443787, + "clip_ratio": 0.0, + "completion_length": 84.02321548461914, + "epoch": 1.3541353383458645, + "grad_norm": 8.875, + "kl": 352.27155195474626, + "learning_rate": 4.323308270676692e-06, + "loss": 35.2321, + "num_tokens": 55233137.0, + "reward": -1.8081068694591522, + "reward_std": 7.203808116912842, + "rewards/get_chromagram_reward": 0.6083721816539764, + "rewards/get_chromagram_reward_std": 0.1257934235036373, + "rewards/get_intelligibility_reward": -6.011060285568237, + "rewards/get_intelligibility_reward_std": 11.309279632568359, + "rewards/get_target_len_reward": -0.021632275870069863, + "rewards/get_target_len_reward_std": 0.06019601076841354, + "step": 1800 + }, + { + "advantages": -6.941457684206398e-07, + "advantages_std": 1.4455040097236633, + "clip_ratio": 0.0, + "completion_length": 86.79880981445312, + "epoch": 1.3616541353383458, + "grad_norm": 7.9375, + "kl": 0.3444279834628105, + "learning_rate": 4.319548872180451e-06, + "loss": 0.0375, + "num_tokens": 55540102.0, + "reward": -1.517743881419301, + "reward_std": 7.033014822006225, + "rewards/get_chromagram_reward": 0.6199808716773987, + "rewards/get_chromagram_reward_std": 0.11536377593874932, + "rewards/get_intelligibility_reward": -5.1520576775074005, + "rewards/get_intelligibility_reward_std": 11.083409786224365, + "rewards/get_target_len_reward": -0.021154607087373732, + "rewards/get_target_len_reward_std": 0.05849700104445219, + "step": 1810 + }, + { + "advantages": -3.558894064781271e-07, + "advantages_std": 1.5300377130508422, + "clip_ratio": 0.0, + "completion_length": 84.18690643310546, + "epoch": 1.369172932330827, + "grad_norm": 82.5, + "kl": 0.30925544649362563, + "learning_rate": 4.315789473684211e-06, + "loss": 0.0301, + "num_tokens": 55839635.0, + "reward": -1.4298014640808105, + "reward_std": 6.427416753768921, + "rewards/get_chromagram_reward": 0.6322358787059784, + "rewards/get_chromagram_reward_std": 0.1157355085015297, + "rewards/get_intelligibility_reward": -4.903176188468933, + "rewards/get_intelligibility_reward_std": 10.227682876586915, + "rewards/get_target_len_reward": -0.018464045226573945, + "rewards/get_target_len_reward_std": 0.043058661930263045, + "step": 1820 + }, + { + "advantages": 2.5952856859134953e-07, + "advantages_std": 1.522819423675537, + "clip_ratio": 0.0, + "completion_length": 89.50892944335938, + "epoch": 1.3766917293233083, + "grad_norm": 5.65625, + "kl": 0.33326326608657836, + "learning_rate": 4.3120300751879705e-06, + "loss": 0.0338, + "num_tokens": 56152656.0, + "reward": -1.698273527622223, + "reward_std": 6.879736280441284, + "rewards/get_chromagram_reward": 0.6160605251789093, + "rewards/get_chromagram_reward_std": 0.11663919240236283, + "rewards/get_intelligibility_reward": -5.691361594200134, + "rewards/get_intelligibility_reward_std": 10.71043291091919, + "rewards/get_target_len_reward": -0.019519127625972032, + "rewards/get_target_len_reward_std": 0.04903759118169546, + "step": 1830 + }, + { + "advantages": 1.6887984060076634e-07, + "advantages_std": 1.681946301460266, + "clip_ratio": 0.0, + "completion_length": 87.87678680419921, + "epoch": 1.3842105263157896, + "grad_norm": 368.0, + "kl": 0.3765055865049362, + "learning_rate": 4.30827067669173e-06, + "loss": 0.0432, + "num_tokens": 56461831.0, + "reward": -1.584184655547142, + "reward_std": 6.927801322937012, + "rewards/get_chromagram_reward": 0.6217585206031799, + "rewards/get_chromagram_reward_std": 0.11466614827513695, + "rewards/get_intelligibility_reward": -5.353343296051025, + "rewards/get_intelligibility_reward_std": 10.991171741485596, + "rewards/get_target_len_reward": -0.020968902111053466, + "rewards/get_target_len_reward_std": 0.06113089099526405, + "step": 1840 + }, + { + "advantages": -3.616015149532359e-07, + "advantages_std": 1.618896448612213, + "clip_ratio": 0.0, + "completion_length": 89.11667022705078, + "epoch": 1.3917293233082706, + "grad_norm": 8.5, + "kl": 0.3279285907745361, + "learning_rate": 4.304511278195489e-06, + "loss": 0.0355, + "num_tokens": 56775245.0, + "reward": -1.461240404844284, + "reward_std": 6.541769075393677, + "rewards/get_chromagram_reward": 0.6240219950675965, + "rewards/get_chromagram_reward_std": 0.11915393397212029, + "rewards/get_intelligibility_reward": -4.989465570449829, + "rewards/get_intelligibility_reward_std": 10.445559215545654, + "rewards/get_target_len_reward": -0.01827723728492856, + "rewards/get_target_len_reward_std": 0.049859827384352684, + "step": 1850 + }, + { + "advantages": -2.9305610382834857e-08, + "advantages_std": 1.6972397446632386, + "clip_ratio": 0.0, + "completion_length": 86.04404983520507, + "epoch": 1.3992481203007519, + "grad_norm": 14.0625, + "kl": 0.32066617608070375, + "learning_rate": 4.3007518796992486e-06, + "loss": 0.0313, + "num_tokens": 57079914.0, + "reward": -1.1806714341044426, + "reward_std": 6.98257007598877, + "rewards/get_chromagram_reward": 0.6416585624217988, + "rewards/get_chromagram_reward_std": 0.11144919693470001, + "rewards/get_intelligibility_reward": -4.1637926429510115, + "rewards/get_intelligibility_reward_std": 11.391026782989503, + "rewards/get_target_len_reward": -0.019880097545683383, + "rewards/get_target_len_reward_std": 0.046197598055005076, + "step": 1860 + }, + { + "advantages": -2.48849386252914e-07, + "advantages_std": 1.6818946957588197, + "clip_ratio": 0.0, + "completion_length": 87.50714492797852, + "epoch": 1.406766917293233, + "grad_norm": 8.125, + "kl": 0.3438696265220642, + "learning_rate": 4.296992481203008e-06, + "loss": 0.0383, + "num_tokens": 57388572.0, + "reward": -1.6199523091316224, + "reward_std": 7.197538423538208, + "rewards/get_chromagram_reward": 0.6259011566638947, + "rewards/get_chromagram_reward_std": 0.10719226896762848, + "rewards/get_intelligibility_reward": -5.459428668022156, + "rewards/get_intelligibility_reward_std": 11.51907172203064, + "rewards/get_target_len_reward": -0.02632923349738121, + "rewards/get_target_len_reward_std": 0.08135125394910574, + "step": 1870 + }, + { + "advantages": -1.9955137986471527e-07, + "advantages_std": 1.5469175934791566, + "clip_ratio": 0.0, + "completion_length": 84.44464416503907, + "epoch": 1.4142857142857144, + "grad_norm": 7.09375, + "kl": 0.3164676412940025, + "learning_rate": 4.293233082706768e-06, + "loss": 0.0365, + "num_tokens": 57688742.0, + "reward": -1.4961084365844726, + "reward_std": 6.876049757003784, + "rewards/get_chromagram_reward": 0.6287756443023682, + "rewards/get_chromagram_reward_std": 0.12433092966675759, + "rewards/get_intelligibility_reward": -5.093908071517944, + "rewards/get_intelligibility_reward_std": 11.01416654586792, + "rewards/get_target_len_reward": -0.02319267261773348, + "rewards/get_target_len_reward_std": 0.06343817189335824, + "step": 1880 + }, + { + "advantages": 4.1226548148642904e-07, + "advantages_std": 1.6911157608032226, + "clip_ratio": 0.0, + "completion_length": 87.12797775268555, + "epoch": 1.4218045112781956, + "grad_norm": 7.25, + "kl": 0.31820856481790544, + "learning_rate": 4.289473684210527e-06, + "loss": 0.034, + "num_tokens": 57995800.0, + "reward": -1.7069338321685792, + "reward_std": 6.826904821395874, + "rewards/get_chromagram_reward": 0.6184460759162903, + "rewards/get_chromagram_reward_std": 0.10959328189492226, + "rewards/get_intelligibility_reward": -5.722243046760559, + "rewards/get_intelligibility_reward_std": 10.652085971832275, + "rewards/get_target_len_reward": -0.017004191037267448, + "rewards/get_target_len_reward_std": 0.040973109379410746, + "step": 1890 + }, + { + "advantages": -8.443988974704553e-09, + "advantages_std": 1.610334575176239, + "clip_ratio": 0.0, + "completion_length": 87.94821701049804, + "epoch": 1.4293233082706767, + "grad_norm": 7.0, + "kl": 0.38517433404922485, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.0417, + "num_tokens": 58305269.0, + "reward": -1.8303340256214142, + "reward_std": 7.070497989654541, + "rewards/get_chromagram_reward": 0.6330944538116455, + "rewards/get_chromagram_reward_std": 0.11860318705439568, + "rewards/get_intelligibility_reward": -6.100761485099793, + "rewards/get_intelligibility_reward_std": 10.920927333831788, + "rewards/get_target_len_reward": -0.02333456613123417, + "rewards/get_target_len_reward_std": 0.06448300629854202, + "step": 1900 + }, + { + "advantages": -4.054357674476705e-07, + "advantages_std": 1.4755270838737489, + "clip_ratio": 0.0, + "completion_length": 88.48571701049805, + "epoch": 1.436842105263158, + "grad_norm": 5.1875, + "kl": 0.3150394469499588, + "learning_rate": 4.281954887218046e-06, + "loss": 0.0335, + "num_tokens": 58615675.0, + "reward": -1.3779967308044434, + "reward_std": 6.456488084793091, + "rewards/get_chromagram_reward": 0.6358872652053833, + "rewards/get_chromagram_reward_std": 0.10376172587275505, + "rewards/get_intelligibility_reward": -4.750557589530945, + "rewards/get_intelligibility_reward_std": 10.346266222000121, + "rewards/get_target_len_reward": -0.019319624826312064, + "rewards/get_target_len_reward_std": 0.05323235634714365, + "step": 1910 + }, + { + "advantages": 7.972121274235633e-08, + "advantages_std": 1.5427042961120605, + "clip_ratio": 0.0, + "completion_length": 85.0898826599121, + "epoch": 1.4443609022556392, + "grad_norm": 43.25, + "kl": 0.3174961805343628, + "learning_rate": 4.278195488721805e-06, + "loss": 0.0355, + "num_tokens": 58917778.0, + "reward": -1.9309344321489335, + "reward_std": 7.068848085403443, + "rewards/get_chromagram_reward": 0.6075616657733918, + "rewards/get_chromagram_reward_std": 0.10997026190161704, + "rewards/get_intelligibility_reward": -6.379441344738007, + "rewards/get_intelligibility_reward_std": 10.813093662261963, + "rewards/get_target_len_reward": -0.0209231847897172, + "rewards/get_target_len_reward_std": 0.06707295998930932, + "step": 1920 + }, + { + "advantages": 3.0646723061522605e-07, + "advantages_std": 1.5311549425125122, + "clip_ratio": 0.0, + "completion_length": 88.07678527832032, + "epoch": 1.4518796992481202, + "grad_norm": 6.03125, + "kl": 1.02582398802042, + "learning_rate": 4.274436090225564e-06, + "loss": 0.1081, + "num_tokens": 59227243.0, + "reward": -1.4925808787345887, + "reward_std": 6.5475245952606205, + "rewards/get_chromagram_reward": 0.6171232759952545, + "rewards/get_chromagram_reward_std": 0.105997933447361, + "rewards/get_intelligibility_reward": -5.07579274745658, + "rewards/get_intelligibility_reward_std": 10.237911939620972, + "rewards/get_target_len_reward": -0.01907276422716677, + "rewards/get_target_len_reward_std": 0.062690981477499, + "step": 1930 + }, + { + "advantages": -6.929040182512836e-07, + "advantages_std": 1.4544852733612061, + "clip_ratio": 0.0, + "completion_length": 86.28988189697266, + "epoch": 1.4593984962406015, + "grad_norm": 74.0, + "kl": 0.5569351270794869, + "learning_rate": 4.270676691729323e-06, + "loss": 0.0581, + "num_tokens": 59533137.0, + "reward": -1.0190489768981934, + "reward_std": 6.246702671051025, + "rewards/get_chromagram_reward": 0.6327454149723053, + "rewards/get_chromagram_reward_std": 0.12573510631918908, + "rewards/get_intelligibility_reward": -3.6655944168567656, + "rewards/get_intelligibility_reward_std": 10.2553316116333, + "rewards/get_target_len_reward": -0.024297917261719704, + "rewards/get_target_len_reward_std": 0.06265397630631923, + "step": 1940 + }, + { + "advantages": 2.677241838000555e-07, + "advantages_std": 1.668886387348175, + "clip_ratio": 0.0, + "completion_length": 86.92559661865235, + "epoch": 1.4669172932330827, + "grad_norm": 312.0, + "kl": 0.3745987594127655, + "learning_rate": 4.266917293233083e-06, + "loss": 0.0421, + "num_tokens": 59839979.0, + "reward": -1.851040291786194, + "reward_std": 7.3681464195251465, + "rewards/get_chromagram_reward": 0.6237763226032257, + "rewards/get_chromagram_reward_std": 0.11604878455400466, + "rewards/get_intelligibility_reward": -6.155086278915405, + "rewards/get_intelligibility_reward_std": 11.5657377243042, + "rewards/get_target_len_reward": -0.021810497622936964, + "rewards/get_target_len_reward_std": 0.059417030215263365, + "step": 1950 + }, + { + "advantages": -1.0927521287840136e-08, + "advantages_std": 1.5496743440628051, + "clip_ratio": 0.0, + "completion_length": 86.44821548461914, + "epoch": 1.474436090225564, + "grad_norm": 9.9375, + "kl": 0.43885346353054044, + "learning_rate": 4.2631578947368425e-06, + "loss": 0.0477, + "num_tokens": 60144915.0, + "reward": -1.4612555474042892, + "reward_std": 6.437270307540894, + "rewards/get_chromagram_reward": 0.6093687117099762, + "rewards/get_chromagram_reward_std": 0.11680521965026855, + "rewards/get_intelligibility_reward": -4.9737292408943174, + "rewards/get_intelligibility_reward_std": 10.008707952499389, + "rewards/get_target_len_reward": -0.019405988790094853, + "rewards/get_target_len_reward_std": 0.055537658743560314, + "step": 1960 + }, + { + "advantages": -2.731879504835888e-08, + "advantages_std": 1.5995185375213623, + "clip_ratio": 0.0, + "completion_length": 87.5625015258789, + "epoch": 1.4819548872180452, + "grad_norm": 5.96875, + "kl": 2.4066055372357367, + "learning_rate": 4.259398496240602e-06, + "loss": 0.2514, + "num_tokens": 60452724.0, + "reward": -1.7014335989952087, + "reward_std": 7.449602222442627, + "rewards/get_chromagram_reward": 0.6065891265869141, + "rewards/get_chromagram_reward_std": 0.12512124553322793, + "rewards/get_intelligibility_reward": -5.687872338294983, + "rewards/get_intelligibility_reward_std": 11.906324100494384, + "rewards/get_target_len_reward": -0.02301737116649747, + "rewards/get_target_len_reward_std": 0.08284691572189332, + "step": 1970 + }, + { + "advantages": -4.1027865336218385e-07, + "advantages_std": 1.5075610518455504, + "clip_ratio": 0.0, + "completion_length": 86.51428833007813, + "epoch": 1.4894736842105263, + "grad_norm": 11.0625, + "kl": 0.5192355632781982, + "learning_rate": 4.255639097744361e-06, + "loss": 0.0555, + "num_tokens": 60758290.0, + "reward": -1.4112955316901208, + "reward_std": 6.756988954544068, + "rewards/get_chromagram_reward": 0.6147924721240997, + "rewards/get_chromagram_reward_std": 0.11576244533061981, + "rewards/get_intelligibility_reward": -4.826417958736419, + "rewards/get_intelligibility_reward_std": 10.790842008590698, + "rewards/get_target_len_reward": -0.022260715905576944, + "rewards/get_target_len_reward_std": 0.0649005737155676, + "step": 1980 + }, + { + "advantages": -4.5200177467563664e-08, + "advantages_std": 1.6201247930526734, + "clip_ratio": 0.0, + "completion_length": 86.08214416503907, + "epoch": 1.4969924812030075, + "grad_norm": 7.46875, + "kl": 0.3470937877893448, + "learning_rate": 4.2518796992481206e-06, + "loss": 0.0467, + "num_tokens": 61062617.0, + "reward": -1.359550093114376, + "reward_std": 6.684824514389038, + "rewards/get_chromagram_reward": 0.6119980812072754, + "rewards/get_chromagram_reward_std": 0.11458643302321433, + "rewards/get_intelligibility_reward": -4.662409788370132, + "rewards/get_intelligibility_reward_std": 10.784858322143554, + "rewards/get_target_len_reward": -0.028238356299698352, + "rewards/get_target_len_reward_std": 0.08857602290809155, + "step": 1990 + }, + { + "advantages": -6.33299521268782e-08, + "advantages_std": 1.6459670662879944, + "clip_ratio": 0.0, + "completion_length": 86.25833358764649, + "epoch": 1.5045112781954888, + "grad_norm": 7.0, + "kl": 0.32820585519075396, + "learning_rate": 4.24812030075188e-06, + "loss": 0.0331, + "num_tokens": 61367485.0, + "reward": -1.6770533800125123, + "reward_std": 6.5401702404022215, + "rewards/get_chromagram_reward": 0.6255118787288666, + "rewards/get_chromagram_reward_std": 0.11729757189750671, + "rewards/get_intelligibility_reward": -5.6358928203582765, + "rewards/get_intelligibility_reward_std": 10.16779546737671, + "rewards/get_target_len_reward": -0.02077860590070486, + "rewards/get_target_len_reward_std": 0.05197990909218788, + "step": 2000 + }, + { + "advantages": 2.6673079958072774e-07, + "advantages_std": 1.6713383674621582, + "clip_ratio": 0.0, + "completion_length": 90.85238342285156, + "epoch": 1.5120300751879698, + "grad_norm": 8.9375, + "kl": 0.5624005109071731, + "learning_rate": 4.244360902255639e-06, + "loss": 0.0571, + "num_tokens": 61685194.0, + "reward": -1.4429410874843598, + "reward_std": 6.481061124801636, + "rewards/get_chromagram_reward": 0.6198509991168976, + "rewards/get_chromagram_reward_std": 0.1047507330775261, + "rewards/get_intelligibility_reward": -4.931775975227356, + "rewards/get_intelligibility_reward_std": 10.304393005371093, + "rewards/get_target_len_reward": -0.01689812494441867, + "rewards/get_target_len_reward_std": 0.046997369080781934, + "step": 2010 + }, + { + "advantages": -1.6391277597449515e-07, + "advantages_std": 1.5733685612678527, + "clip_ratio": 0.0, + "completion_length": 86.6053581237793, + "epoch": 1.519548872180451, + "grad_norm": 6.625, + "kl": 0.31895223557949065, + "learning_rate": 4.240601503759399e-06, + "loss": 0.0354, + "num_tokens": 61991299.0, + "reward": -1.4001641318202018, + "reward_std": 6.923908948898315, + "rewards/get_chromagram_reward": 0.6155903279781342, + "rewards/get_chromagram_reward_std": 0.1192019023001194, + "rewards/get_intelligibility_reward": -4.797593307495117, + "rewards/get_intelligibility_reward_std": 11.075876808166504, + "rewards/get_target_len_reward": -0.018489096034318208, + "rewards/get_target_len_reward_std": 0.05505495984107256, + "step": 2020 + }, + { + "advantages": 1.416852043689687e-07, + "advantages_std": 1.5786041617393494, + "clip_ratio": 0.0, + "completion_length": 88.1529769897461, + "epoch": 1.5270676691729324, + "grad_norm": 6.15625, + "kl": 0.2888381630182266, + "learning_rate": 4.236842105263158e-06, + "loss": 0.0373, + "num_tokens": 62301897.0, + "reward": -1.2167622834444045, + "reward_std": 6.394130229949951, + "rewards/get_chromagram_reward": 0.6328611254692078, + "rewards/get_chromagram_reward_std": 0.11856895983219147, + "rewards/get_intelligibility_reward": -4.259195864200592, + "rewards/get_intelligibility_reward_std": 10.343937587738036, + "rewards/get_target_len_reward": -0.02395202973857522, + "rewards/get_target_len_reward_std": 0.07318154443055391, + "step": 2030 + }, + { + "advantages": 1.4851491414447082e-07, + "advantages_std": 1.6340704083442688, + "clip_ratio": 0.0, + "completion_length": 85.79464416503906, + "epoch": 1.5345864661654134, + "grad_norm": 7.875, + "kl": 0.3205702111124992, + "learning_rate": 4.233082706766918e-06, + "loss": 0.0357, + "num_tokens": 62606044.0, + "reward": -1.099614891409874, + "reward_std": 6.524851655960083, + "rewards/get_chromagram_reward": 0.6311138451099396, + "rewards/get_chromagram_reward_std": 0.11696632355451583, + "rewards/get_intelligibility_reward": -3.907797175645828, + "rewards/get_intelligibility_reward_std": 10.681060409545898, + "rewards/get_target_len_reward": -0.022161136101931333, + "rewards/get_target_len_reward_std": 0.0629130657762289, + "step": 2040 + }, + { + "advantages": 8.774300653158206e-07, + "advantages_std": 1.6362772941589356, + "clip_ratio": 0.0, + "completion_length": 82.92381134033204, + "epoch": 1.5421052631578949, + "grad_norm": 7.375, + "kl": 0.43492402136325836, + "learning_rate": 4.229323308270677e-06, + "loss": 0.048, + "num_tokens": 62901928.0, + "reward": -1.770355224609375, + "reward_std": 7.1313379287719725, + "rewards/get_chromagram_reward": 0.6375100195407868, + "rewards/get_chromagram_reward_std": 0.12280775308609009, + "rewards/get_intelligibility_reward": -5.923044615983963, + "rewards/get_intelligibility_reward_std": 11.02694330215454, + "rewards/get_target_len_reward": -0.025530668999999763, + "rewards/get_target_len_reward_std": 0.07188544794917107, + "step": 2050 + }, + { + "advantages": -3.94135725656497e-07, + "advantages_std": 1.5501110434532166, + "clip_ratio": 0.0, + "completion_length": 87.2125015258789, + "epoch": 1.549624060150376, + "grad_norm": 5.53125, + "kl": 0.29535968899726867, + "learning_rate": 4.225563909774436e-06, + "loss": 0.0318, + "num_tokens": 63209556.0, + "reward": -1.536487441137433, + "reward_std": 6.634403896331787, + "rewards/get_chromagram_reward": 0.6187046766281128, + "rewards/get_chromagram_reward_std": 0.1150432951748371, + "rewards/get_intelligibility_reward": -5.209327453374863, + "rewards/get_intelligibility_reward_std": 10.373435020446777, + "rewards/get_target_len_reward": -0.018839253485202788, + "rewards/get_target_len_reward_std": 0.060705633461475374, + "step": 2060 + }, + { + "advantages": -7.376074186993265e-08, + "advantages_std": 1.679667603969574, + "clip_ratio": 0.0, + "completion_length": 88.85000228881836, + "epoch": 1.5571428571428572, + "grad_norm": 7.9375, + "kl": 0.31674362570047376, + "learning_rate": 4.221804511278196e-06, + "loss": 0.0355, + "num_tokens": 63521952.0, + "reward": -1.2775519102811814, + "reward_std": 6.381492376327515, + "rewards/get_chromagram_reward": 0.6168266594409942, + "rewards/get_chromagram_reward_std": 0.10888128727674484, + "rewards/get_intelligibility_reward": -4.430067479610443, + "rewards/get_intelligibility_reward_std": 10.242457675933839, + "rewards/get_target_len_reward": -0.019414818566292524, + "rewards/get_target_len_reward_std": 0.05602561179548502, + "step": 2070 + }, + { + "advantages": -3.116826341909018e-07, + "advantages_std": 1.5748518228530883, + "clip_ratio": 0.0, + "completion_length": 86.72797622680665, + "epoch": 1.5646616541353384, + "grad_norm": 5.71875, + "kl": 0.37901861518621444, + "learning_rate": 4.218045112781956e-06, + "loss": 0.0437, + "num_tokens": 63828336.0, + "reward": -1.6440318048000335, + "reward_std": 7.379472351074218, + "rewards/get_chromagram_reward": 0.6186399698257447, + "rewards/get_chromagram_reward_std": 0.11654711589217186, + "rewards/get_intelligibility_reward": -5.527314138412476, + "rewards/get_intelligibility_reward_std": 11.792286777496338, + "rewards/get_target_len_reward": -0.02342082476243377, + "rewards/get_target_len_reward_std": 0.07802388649433852, + "step": 2080 + }, + { + "advantages": -3.8345656037108713e-07, + "advantages_std": 1.5448354840278626, + "clip_ratio": 0.0, + "completion_length": 86.90119247436523, + "epoch": 1.5721804511278195, + "grad_norm": 452.0, + "kl": 0.4000831454992294, + "learning_rate": 4.2142857142857145e-06, + "loss": 0.044, + "num_tokens": 64135276.0, + "reward": -1.2782334357500076, + "reward_std": 6.2945537090301515, + "rewards/get_chromagram_reward": 0.6352133572101593, + "rewards/get_chromagram_reward_std": 0.10771576762199402, + "rewards/get_intelligibility_reward": -4.444746363162994, + "rewards/get_intelligibility_reward_std": 10.07327060699463, + "rewards/get_target_len_reward": -0.0251670790836215, + "rewards/get_target_len_reward_std": 0.07453116215765476, + "step": 2090 + }, + { + "advantages": 1.3907754237152403e-08, + "advantages_std": 1.6230470061302185, + "clip_ratio": 0.0, + "completion_length": 87.97202606201172, + "epoch": 1.5796992481203007, + "grad_norm": 31.125, + "kl": 0.39640209525823594, + "learning_rate": 4.210526315789474e-06, + "loss": 0.0425, + "num_tokens": 64444876.0, + "reward": -1.6344273149967194, + "reward_std": 6.510859823226928, + "rewards/get_chromagram_reward": 0.6180970013141632, + "rewards/get_chromagram_reward_std": 0.12092170044779778, + "rewards/get_intelligibility_reward": -5.5024089336395265, + "rewards/get_intelligibility_reward_std": 10.168234968185425, + "rewards/get_target_len_reward": -0.018969781789928676, + "rewards/get_target_len_reward_std": 0.04659441541880369, + "step": 2100 + }, + { + "advantages": -1.7695124654437677e-08, + "advantages_std": 1.5527965784072877, + "clip_ratio": 0.0, + "completion_length": 86.90178756713867, + "epoch": 1.587218045112782, + "grad_norm": 6.46875, + "kl": 0.34301224946975706, + "learning_rate": 4.206766917293234e-06, + "loss": 0.0396, + "num_tokens": 64751657.0, + "reward": -1.6146832585334778, + "reward_std": 7.331169462203979, + "rewards/get_chromagram_reward": 0.6357932686805725, + "rewards/get_chromagram_reward_std": 0.1203016348183155, + "rewards/get_intelligibility_reward": -5.459855031967163, + "rewards/get_intelligibility_reward_std": 11.720126533508301, + "rewards/get_target_len_reward": -0.019987638201564552, + "rewards/get_target_len_reward_std": 0.058079042471945286, + "step": 2110 + }, + { + "advantages": 6.993611819439139e-07, + "advantages_std": 1.6053866624832154, + "clip_ratio": 0.0, + "completion_length": 85.5982162475586, + "epoch": 1.594736842105263, + "grad_norm": 8.875, + "kl": 0.6178436279296875, + "learning_rate": 4.2030075187969926e-06, + "loss": 0.0617, + "num_tokens": 65054328.0, + "reward": -1.8031591415405273, + "reward_std": 6.9044517993927, + "rewards/get_chromagram_reward": 0.6330244064331054, + "rewards/get_chromagram_reward_std": 0.11358195766806603, + "rewards/get_intelligibility_reward": -6.023799133300781, + "rewards/get_intelligibility_reward_std": 10.727534580230714, + "rewards/get_target_len_reward": -0.018702456075698138, + "rewards/get_target_len_reward_std": 0.04642644617706537, + "step": 2120 + }, + { + "advantages": -3.3875307146047364e-07, + "advantages_std": 1.5467365503311157, + "clip_ratio": 0.0, + "completion_length": 87.56904983520508, + "epoch": 1.6022556390977445, + "grad_norm": 5.3125, + "kl": 0.42215982377529143, + "learning_rate": 4.199248120300752e-06, + "loss": 0.049, + "num_tokens": 65363307.0, + "reward": -1.4519341588020325, + "reward_std": 6.497218227386474, + "rewards/get_chromagram_reward": 0.6397540211677551, + "rewards/get_chromagram_reward_std": 0.12757838517427444, + "rewards/get_intelligibility_reward": -4.968119716644287, + "rewards/get_intelligibility_reward_std": 10.35838565826416, + "rewards/get_target_len_reward": -0.027436566725373267, + "rewards/get_target_len_reward_std": 0.07567532435059547, + "step": 2130 + }, + { + "advantages": 4.072984125969015e-07, + "advantages_std": 1.5269123673439027, + "clip_ratio": 0.0, + "completion_length": 90.61607437133789, + "epoch": 1.6097744360902255, + "grad_norm": 8.375, + "kl": 0.33207988142967226, + "learning_rate": 4.195488721804512e-06, + "loss": 0.0385, + "num_tokens": 65679406.0, + "reward": -1.6267223179340362, + "reward_std": 6.979967403411865, + "rewards/get_chromagram_reward": 0.6148672103881836, + "rewards/get_chromagram_reward_std": 0.1095418579876423, + "rewards/get_intelligibility_reward": -5.475715672969818, + "rewards/get_intelligibility_reward_std": 10.973806667327882, + "rewards/get_target_len_reward": -0.01931815128773451, + "rewards/get_target_len_reward_std": 0.06183572188019752, + "step": 2140 + }, + { + "advantages": 5.145867760347756e-07, + "advantages_std": 1.6080675721168518, + "clip_ratio": 0.0, + "completion_length": 85.91190643310547, + "epoch": 1.6172932330827068, + "grad_norm": 5.4375, + "kl": 0.4411650985479355, + "learning_rate": 4.1917293233082715e-06, + "loss": 0.0475, + "num_tokens": 65983772.0, + "reward": -1.6623242631554604, + "reward_std": 6.805143022537232, + "rewards/get_chromagram_reward": 0.6309055805206298, + "rewards/get_chromagram_reward_std": 0.12571141943335534, + "rewards/get_intelligibility_reward": -5.5929530501365665, + "rewards/get_intelligibility_reward_std": 10.54892177581787, + "rewards/get_target_len_reward": -0.02492492999881506, + "rewards/get_target_len_reward_std": 0.06787819992750883, + "step": 2150 + }, + { + "advantages": 6.680686985838235e-08, + "advantages_std": 1.5221306800842285, + "clip_ratio": 0.0, + "completion_length": 85.75952529907227, + "epoch": 1.624812030075188, + "grad_norm": 5.25, + "kl": 0.48574246764183043, + "learning_rate": 4.18796992481203e-06, + "loss": 0.0516, + "num_tokens": 66286686.0, + "reward": -1.631846097111702, + "reward_std": 6.67475266456604, + "rewards/get_chromagram_reward": 0.6112766861915588, + "rewards/get_chromagram_reward_std": 0.10239461362361908, + "rewards/get_intelligibility_reward": -5.489213287830353, + "rewards/get_intelligibility_reward_std": 10.394185543060303, + "rewards/get_target_len_reward": -0.017601419892162084, + "rewards/get_target_len_reward_std": 0.0520940450951457, + "step": 2160 + }, + { + "advantages": 1.487632751207002e-07, + "advantages_std": 1.5328869938850402, + "clip_ratio": 0.0, + "completion_length": 84.20238342285157, + "epoch": 1.632330827067669, + "grad_norm": 5.5, + "kl": 0.33671319782733916, + "learning_rate": 4.18421052631579e-06, + "loss": 0.033, + "num_tokens": 66586126.0, + "reward": -1.4015221863985061, + "reward_std": 6.84256010055542, + "rewards/get_chromagram_reward": 0.6251766622066498, + "rewards/get_chromagram_reward_std": 0.1158628709614277, + "rewards/get_intelligibility_reward": -4.806045126914978, + "rewards/get_intelligibility_reward_std": 10.973923015594483, + "rewards/get_target_len_reward": -0.023697828501462938, + "rewards/get_target_len_reward_std": 0.0575832212343812, + "step": 2170 + }, + { + "advantages": -1.763305235158441e-08, + "advantages_std": 1.6057313919067382, + "clip_ratio": 0.0, + "completion_length": 86.18274002075195, + "epoch": 1.6398496240601503, + "grad_norm": 7.28125, + "kl": 0.2764908343553543, + "learning_rate": 4.18045112781955e-06, + "loss": 0.029, + "num_tokens": 66890272.0, + "reward": -1.1156673699617385, + "reward_std": 6.271809482574463, + "rewards/get_chromagram_reward": 0.6292023539543152, + "rewards/get_chromagram_reward_std": 0.11582249924540519, + "rewards/get_intelligibility_reward": -3.95166752114892, + "rewards/get_intelligibility_reward_std": 10.14079508781433, + "rewards/get_target_len_reward": -0.0245367381721735, + "rewards/get_target_len_reward_std": 0.06388699542731047, + "step": 2180 + }, + { + "advantages": -1.6887986475921933e-08, + "advantages_std": 1.6172147035598754, + "clip_ratio": 0.0, + "completion_length": 90.51845474243164, + "epoch": 1.6473684210526316, + "grad_norm": 30.5, + "kl": 0.37103464752435683, + "learning_rate": 4.176691729323308e-06, + "loss": 0.041, + "num_tokens": 67206237.0, + "reward": -1.6112347409129142, + "reward_std": 6.924424266815185, + "rewards/get_chromagram_reward": 0.6163883149623871, + "rewards/get_chromagram_reward_std": 0.11688357889652252, + "rewards/get_intelligibility_reward": -5.430252596735954, + "rewards/get_intelligibility_reward_std": 10.924818515777588, + "rewards/get_target_len_reward": -0.019839623104780914, + "rewards/get_target_len_reward_std": 0.05312856025993824, + "step": 2190 + }, + { + "advantages": 2.2376577533123054e-07, + "advantages_std": 1.6153385758399963, + "clip_ratio": 0.0, + "completion_length": 84.91428680419922, + "epoch": 1.6548872180451126, + "grad_norm": 6.5625, + "kl": 0.32724575996398925, + "learning_rate": 4.172932330827068e-06, + "loss": 0.0363, + "num_tokens": 67507543.0, + "reward": -1.7804813146591187, + "reward_std": 6.974139785766601, + "rewards/get_chromagram_reward": 0.6214574337005615, + "rewards/get_chromagram_reward_std": 0.1169826865196228, + "rewards/get_intelligibility_reward": -5.942538261413574, + "rewards/get_intelligibility_reward_std": 10.9132661819458, + "rewards/get_target_len_reward": -0.020362868160009383, + "rewards/get_target_len_reward_std": 0.06523161455988884, + "step": 2200 + }, + { + "advantages": 4.502634368463987e-07, + "advantages_std": 1.508051860332489, + "clip_ratio": 0.0, + "completion_length": 87.18393020629883, + "epoch": 1.662406015037594, + "grad_norm": 6.21875, + "kl": 0.3858113706111908, + "learning_rate": 4.169172932330827e-06, + "loss": 0.0411, + "num_tokens": 67815097.0, + "reward": -1.484154748916626, + "reward_std": 6.593449640274048, + "rewards/get_chromagram_reward": 0.6221783816814422, + "rewards/get_chromagram_reward_std": 0.11762973815202712, + "rewards/get_intelligibility_reward": -5.0492493391036986, + "rewards/get_intelligibility_reward_std": 10.478116226196288, + "rewards/get_target_len_reward": -0.02539309123530984, + "rewards/get_target_len_reward_std": 0.06811941638588906, + "step": 2210 + }, + { + "advantages": -5.270044144189967e-07, + "advantages_std": 1.5974279642105103, + "clip_ratio": 0.0, + "completion_length": 84.21190643310547, + "epoch": 1.6699248120300751, + "grad_norm": 11.0, + "kl": 0.3089997261762619, + "learning_rate": 4.165413533834587e-06, + "loss": 0.0315, + "num_tokens": 68113590.0, + "reward": -1.5647227585315704, + "reward_std": 6.139684581756592, + "rewards/get_chromagram_reward": 0.6186821639537812, + "rewards/get_chromagram_reward_std": 0.11593869104981422, + "rewards/get_intelligibility_reward": -5.292714548110962, + "rewards/get_intelligibility_reward_std": 9.444132328033447, + "rewards/get_target_len_reward": -0.020135853625833987, + "rewards/get_target_len_reward_std": 0.052642738446593285, + "step": 2220 + }, + { + "advantages": 6.243586710752425e-07, + "advantages_std": 1.6422517776489258, + "clip_ratio": 0.0, + "completion_length": 87.30714416503906, + "epoch": 1.6774436090225564, + "grad_norm": 7.8125, + "kl": 0.40762856900691985, + "learning_rate": 4.161654135338346e-06, + "loss": 0.0448, + "num_tokens": 68421251.0, + "reward": -1.326597476005554, + "reward_std": 6.552673864364624, + "rewards/get_chromagram_reward": 0.6257815420627594, + "rewards/get_chromagram_reward_std": 0.11139634773135185, + "rewards/get_intelligibility_reward": -4.584523618221283, + "rewards/get_intelligibility_reward_std": 10.49343433380127, + "rewards/get_target_len_reward": -0.021050124522298576, + "rewards/get_target_len_reward_std": 0.05365060679614544, + "step": 2230 + }, + { + "advantages": -6.87937045995568e-08, + "advantages_std": 1.6293469667434692, + "clip_ratio": 0.0, + "completion_length": 84.20000228881835, + "epoch": 1.6849624060150377, + "grad_norm": 6.59375, + "kl": 0.34043067693710327, + "learning_rate": 4.157894736842106e-06, + "loss": 0.0404, + "num_tokens": 68720116.0, + "reward": -1.474801480770111, + "reward_std": 6.295886039733887, + "rewards/get_chromagram_reward": 0.617020720243454, + "rewards/get_chromagram_reward_std": 0.11099176555871963, + "rewards/get_intelligibility_reward": -5.020518136024475, + "rewards/get_intelligibility_reward_std": 9.897148609161377, + "rewards/get_target_len_reward": -0.020906874537467958, + "rewards/get_target_len_reward_std": 0.05231982320547104, + "step": 2240 + }, + { + "advantages": 3.166496867379465e-07, + "advantages_std": 1.5133726239204406, + "clip_ratio": 0.0, + "completion_length": 89.67440643310547, + "epoch": 1.6924812030075187, + "grad_norm": 6.0625, + "kl": 0.4981168583035469, + "learning_rate": 4.1541353383458646e-06, + "loss": 0.0493, + "num_tokens": 69034003.0, + "reward": -1.402723914384842, + "reward_std": 6.56936559677124, + "rewards/get_chromagram_reward": 0.6300440192222595, + "rewards/get_chromagram_reward_std": 0.11845290139317513, + "rewards/get_intelligibility_reward": -4.820070219039917, + "rewards/get_intelligibility_reward_std": 10.512784290313721, + "rewards/get_target_len_reward": -0.018145297607406973, + "rewards/get_target_len_reward_std": 0.03970406278967857, + "step": 2250 + }, + { + "advantages": -1.73598525776697e-07, + "advantages_std": 1.5843783140182495, + "clip_ratio": 0.0, + "completion_length": 82.07440643310547, + "epoch": 1.7, + "grad_norm": 7.59375, + "kl": 0.3415731221437454, + "learning_rate": 4.150375939849624e-06, + "loss": 0.037, + "num_tokens": 69327549.0, + "reward": -1.6767581880092621, + "reward_std": 6.931660556793213, + "rewards/get_chromagram_reward": 0.6320427298545838, + "rewards/get_chromagram_reward_std": 0.119264155626297, + "rewards/get_intelligibility_reward": -5.639317321777344, + "rewards/get_intelligibility_reward_std": 10.879788684844971, + "rewards/get_target_len_reward": -0.022999674873426558, + "rewards/get_target_len_reward_std": 0.05821425933390856, + "step": 2260 + }, + { + "advantages": 4.4641398293521207e-07, + "advantages_std": 1.5655923247337342, + "clip_ratio": 0.0, + "completion_length": 89.11309661865235, + "epoch": 1.7075187969924812, + "grad_norm": 6.8125, + "kl": 0.313992902636528, + "learning_rate": 4.146616541353384e-06, + "loss": 0.0304, + "num_tokens": 69640295.0, + "reward": -1.3783277034759522, + "reward_std": 6.3931385517120365, + "rewards/get_chromagram_reward": 0.6265924870967865, + "rewards/get_chromagram_reward_std": 0.11074900850653649, + "rewards/get_intelligibility_reward": -4.743269371986389, + "rewards/get_intelligibility_reward_std": 10.24311022758484, + "rewards/get_target_len_reward": -0.018305783160030842, + "rewards/get_target_len_reward_std": 0.0456329807639122, + "step": 2270 + }, + { + "advantages": 1.5174350380675606e-07, + "advantages_std": 1.519785189628601, + "clip_ratio": 0.0, + "completion_length": 85.77321548461914, + "epoch": 1.7150375939849622, + "grad_norm": 8.9375, + "kl": 0.32146727442741396, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.0365, + "num_tokens": 69944689.0, + "reward": -1.4858910858631134, + "reward_std": 6.917770576477051, + "rewards/get_chromagram_reward": 0.6075939893722534, + "rewards/get_chromagram_reward_std": 0.11109072864055633, + "rewards/get_intelligibility_reward": -5.044983577728272, + "rewards/get_intelligibility_reward_std": 11.057494640350342, + "rewards/get_target_len_reward": -0.0202834433875978, + "rewards/get_target_len_reward_std": 0.0654812516644597, + "step": 2280 + }, + { + "advantages": 4.7360858133060903e-07, + "advantages_std": 1.5867118835449219, + "clip_ratio": 0.0, + "completion_length": 84.55357360839844, + "epoch": 1.7225563909774437, + "grad_norm": 6.8125, + "kl": 0.2956387400627136, + "learning_rate": 4.139097744360902e-06, + "loss": 0.0298, + "num_tokens": 70244751.0, + "reward": -1.6686067134141922, + "reward_std": 6.578378915786743, + "rewards/get_chromagram_reward": 0.6141305983066558, + "rewards/get_chromagram_reward_std": 0.11698191240429878, + "rewards/get_intelligibility_reward": -5.6026394963264465, + "rewards/get_intelligibility_reward_std": 10.14880304336548, + "rewards/get_target_len_reward": -0.017310941684991123, + "rewards/get_target_len_reward_std": 0.044035492651164534, + "step": 2290 + }, + { + "advantages": -5.366901731918006e-07, + "advantages_std": 1.5838298201560974, + "clip_ratio": 0.0, + "completion_length": 90.17262115478516, + "epoch": 1.7300751879699248, + "grad_norm": 8.75, + "kl": 0.3221535414457321, + "learning_rate": 4.135338345864662e-06, + "loss": 0.0324, + "num_tokens": 70561382.0, + "reward": -0.8596220046281815, + "reward_std": 6.4721067428588865, + "rewards/get_chromagram_reward": 0.6353691577911377, + "rewards/get_chromagram_reward_std": 0.11232606545090676, + "rewards/get_intelligibility_reward": -3.1946552455425263, + "rewards/get_intelligibility_reward_std": 10.751107311248779, + "rewards/get_target_len_reward": -0.019579828809946777, + "rewards/get_target_len_reward_std": 0.04634752385318279, + "step": 2300 + }, + { + "advantages": 3.327924957829964e-08, + "advantages_std": 1.629914653301239, + "clip_ratio": 0.0, + "completion_length": 85.86904907226562, + "epoch": 1.737593984962406, + "grad_norm": 6.21875, + "kl": 0.39486820101737974, + "learning_rate": 4.1315789473684216e-06, + "loss": 0.0454, + "num_tokens": 70864705.0, + "reward": -1.6306380838155747, + "reward_std": 6.833079147338867, + "rewards/get_chromagram_reward": 0.6226745009422302, + "rewards/get_chromagram_reward_std": 0.12161731049418449, + "rewards/get_intelligibility_reward": -5.491730678081512, + "rewards/get_intelligibility_reward_std": 10.75001802444458, + "rewards/get_target_len_reward": -0.022857668250799178, + "rewards/get_target_len_reward_std": 0.06112685557454824, + "step": 2310 + }, + { + "advantages": -2.4338557835790198e-08, + "advantages_std": 1.6277719616889954, + "clip_ratio": 0.0, + "completion_length": 87.22678756713867, + "epoch": 1.7451127819548873, + "grad_norm": 6.5, + "kl": 0.3618259161710739, + "learning_rate": 4.12781954887218e-06, + "loss": 0.0366, + "num_tokens": 71172976.0, + "reward": -1.1786362126469612, + "reward_std": 6.607575082778931, + "rewards/get_chromagram_reward": 0.624769514799118, + "rewards/get_chromagram_reward_std": 0.12241154238581657, + "rewards/get_intelligibility_reward": -4.138809217512607, + "rewards/get_intelligibility_reward_std": 10.685311555862427, + "rewards/get_target_len_reward": -0.021868737787008284, + "rewards/get_target_len_reward_std": 0.04962801802903414, + "step": 2320 + }, + { + "advantages": -1.096477070916535e-07, + "advantages_std": 1.7360329389572144, + "clip_ratio": 0.0, + "completion_length": 85.69107208251953, + "epoch": 1.7526315789473683, + "grad_norm": 6.1875, + "kl": 1.3730736181139946, + "learning_rate": 4.12406015037594e-06, + "loss": 0.1449, + "num_tokens": 71476310.0, + "reward": -1.4866398930549622, + "reward_std": 6.556139183044434, + "rewards/get_chromagram_reward": 0.6211271047592163, + "rewards/get_chromagram_reward_std": 0.12372498363256454, + "rewards/get_intelligibility_reward": -5.057052373886108, + "rewards/get_intelligibility_reward_std": 10.40001630783081, + "rewards/get_target_len_reward": -0.023994168732315302, + "rewards/get_target_len_reward_std": 0.065902035869658, + "step": 2330 + }, + { + "advantages": -7.177393968049728e-08, + "advantages_std": 1.5481481909751893, + "clip_ratio": 0.0, + "completion_length": 84.09226303100586, + "epoch": 1.7601503759398496, + "grad_norm": 6.59375, + "kl": 0.35464718043804166, + "learning_rate": 4.1203007518797e-06, + "loss": 0.0384, + "num_tokens": 71774878.0, + "reward": -1.677583646774292, + "reward_std": 6.635879039764404, + "rewards/get_chromagram_reward": 0.6253755390644073, + "rewards/get_chromagram_reward_std": 0.11250732392072678, + "rewards/get_intelligibility_reward": -5.636243104934692, + "rewards/get_intelligibility_reward_std": 10.37077875137329, + "rewards/get_target_len_reward": -0.021883081085979937, + "rewards/get_target_len_reward_std": 0.0651377398520708, + "step": 2340 + }, + { + "advantages": 5.361934768188803e-07, + "advantages_std": 1.6343234777450562, + "clip_ratio": 0.0, + "completion_length": 85.13214492797852, + "epoch": 1.7676691729323308, + "grad_norm": 5.5, + "kl": 0.3477922797203064, + "learning_rate": 4.116541353383459e-06, + "loss": 0.0361, + "num_tokens": 72076568.0, + "reward": -1.9456369400024414, + "reward_std": 6.959947204589843, + "rewards/get_chromagram_reward": 0.6237083613872528, + "rewards/get_chromagram_reward_std": 0.11222967356443406, + "rewards/get_intelligibility_reward": -6.439011716842652, + "rewards/get_intelligibility_reward_std": 10.623391437530518, + "rewards/get_target_len_reward": -0.02160733174532652, + "rewards/get_target_len_reward_std": 0.05893028676509857, + "step": 2350 + }, + { + "advantages": 2.334515258439751e-07, + "advantages_std": 1.6544344305992127, + "clip_ratio": 0.0, + "completion_length": 86.15476303100586, + "epoch": 1.7751879699248119, + "grad_norm": 9.3125, + "kl": 0.3129129856824875, + "learning_rate": 4.112781954887218e-06, + "loss": 0.0418, + "num_tokens": 72380577.0, + "reward": -1.5635125003755093, + "reward_std": 6.368973064422607, + "rewards/get_chromagram_reward": 0.6229640424251557, + "rewards/get_chromagram_reward_std": 0.11633650735020637, + "rewards/get_intelligibility_reward": -5.289789938926697, + "rewards/get_intelligibility_reward_std": 9.956442642211915, + "rewards/get_target_len_reward": -0.023711246997117998, + "rewards/get_target_len_reward_std": 0.07678976822644472, + "step": 2360 + }, + { + "advantages": -4.721184708955661e-07, + "advantages_std": 1.5377548336982727, + "clip_ratio": 0.0, + "completion_length": 89.5404769897461, + "epoch": 1.7827067669172934, + "grad_norm": 8.0, + "kl": 0.2969405084848404, + "learning_rate": 4.109022556390978e-06, + "loss": 0.0343, + "num_tokens": 72694892.0, + "reward": -1.2615583300590516, + "reward_std": 7.277207279205323, + "rewards/get_chromagram_reward": 0.6268039345741272, + "rewards/get_chromagram_reward_std": 0.11108435578644275, + "rewards/get_intelligibility_reward": -4.389381170272827, + "rewards/get_intelligibility_reward_std": 11.901940822601318, + "rewards/get_target_len_reward": -0.022097578458487987, + "rewards/get_target_len_reward_std": 0.06381280329078436, + "step": 2370 + }, + { + "advantages": 6.109476515803181e-08, + "advantages_std": 1.6187481880187988, + "clip_ratio": 0.0, + "completion_length": 87.50178680419921, + "epoch": 1.7902255639097744, + "grad_norm": 7.5625, + "kl": 0.4066790774464607, + "learning_rate": 4.105263157894737e-06, + "loss": 0.0417, + "num_tokens": 73002358.0, + "reward": -1.664416539669037, + "reward_std": 7.076716756820678, + "rewards/get_chromagram_reward": 0.6175857126712799, + "rewards/get_chromagram_reward_std": 0.1235451377928257, + "rewards/get_intelligibility_reward": -5.58691291809082, + "rewards/get_intelligibility_reward_std": 11.194854640960694, + "rewards/get_target_len_reward": -0.023922096379101275, + "rewards/get_target_len_reward_std": 0.0660831168293953, + "step": 2380 + }, + { + "advantages": -6.780029102593232e-08, + "advantages_std": 1.5943035006523132, + "clip_ratio": 0.0, + "completion_length": 88.36428756713867, + "epoch": 1.7977443609022556, + "grad_norm": 5.78125, + "kl": 0.3713136985898018, + "learning_rate": 4.101503759398496e-06, + "loss": 0.0412, + "num_tokens": 73313327.0, + "reward": -1.5908078402280807, + "reward_std": 7.35349440574646, + "rewards/get_chromagram_reward": 0.6217695772647858, + "rewards/get_chromagram_reward_std": 0.1202425293624401, + "rewards/get_intelligibility_reward": -5.372699975967407, + "rewards/get_intelligibility_reward_std": 11.718086051940919, + "rewards/get_target_len_reward": -0.021492914762347937, + "rewards/get_target_len_reward_std": 0.06425060071051121, + "step": 2390 + }, + { + "advantages": 1.4280279980738442e-07, + "advantages_std": 1.6416914343833924, + "clip_ratio": 0.0, + "completion_length": 86.24524002075195, + "epoch": 1.805263157894737, + "grad_norm": 6.6875, + "kl": 0.3477358803153038, + "learning_rate": 4.097744360902256e-06, + "loss": 0.0377, + "num_tokens": 73618532.0, + "reward": -1.3100897327065468, + "reward_std": 6.019156885147095, + "rewards/get_chromagram_reward": 0.6225740551948548, + "rewards/get_chromagram_reward_std": 0.11594133004546166, + "rewards/get_intelligibility_reward": -4.53080498832278, + "rewards/get_intelligibility_reward_std": 9.37488775253296, + "rewards/get_target_len_reward": -0.02203790657222271, + "rewards/get_target_len_reward_std": 0.05875368323177099, + "step": 2400 + }, + { + "advantages": 1.4007092179468829e-07, + "advantages_std": 1.6318424463272094, + "clip_ratio": 0.0, + "completion_length": 86.2976203918457, + "epoch": 1.812781954887218, + "grad_norm": 95.0, + "kl": 0.3875324487686157, + "learning_rate": 4.0939849624060155e-06, + "loss": 0.0403, + "num_tokens": 73923800.0, + "reward": -1.3757910546846688, + "reward_std": 6.7682945728302, + "rewards/get_chromagram_reward": 0.6181758105754852, + "rewards/get_chromagram_reward_std": 0.11742054596543312, + "rewards/get_intelligibility_reward": -4.72544179558754, + "rewards/get_intelligibility_reward_std": 10.851413440704345, + "rewards/get_target_len_reward": -0.020106809958815575, + "rewards/get_target_len_reward_std": 0.05670954566448927, + "step": 2410 + }, + { + "advantages": 4.967053657267684e-10, + "advantages_std": 1.600301456451416, + "clip_ratio": 0.0, + "completion_length": 84.56428680419921, + "epoch": 1.8203007518796992, + "grad_norm": 6.6875, + "kl": 0.31402217745780947, + "learning_rate": 4.090225563909775e-06, + "loss": 0.0425, + "num_tokens": 74223740.0, + "reward": -1.2864448690786958, + "reward_std": 6.1337813377380375, + "rewards/get_chromagram_reward": 0.6213976562023162, + "rewards/get_chromagram_reward_std": 0.11770420670509338, + "rewards/get_intelligibility_reward": -4.453081881999969, + "rewards/get_intelligibility_reward_std": 9.727893400192261, + "rewards/get_target_len_reward": -0.027650110237300397, + "rewards/get_target_len_reward_std": 0.09467000924050809, + "step": 2420 + }, + { + "advantages": 4.803140981834986e-07, + "advantages_std": 1.5512767672538756, + "clip_ratio": 0.0, + "completion_length": 85.27262115478516, + "epoch": 1.8278195488721805, + "grad_norm": 6.5625, + "kl": 0.3004987627267838, + "learning_rate": 4.086466165413534e-06, + "loss": 0.0333, + "num_tokens": 74526273.0, + "reward": -1.2122848182916641, + "reward_std": 6.635207271575927, + "rewards/get_chromagram_reward": 0.6173213303089142, + "rewards/get_chromagram_reward_std": 0.10879912301898002, + "rewards/get_intelligibility_reward": -4.234836637973785, + "rewards/get_intelligibility_reward_std": 10.808967781066894, + "rewards/get_target_len_reward": -0.019338873215019703, + "rewards/get_target_len_reward_std": 0.06528621017932892, + "step": 2430 + }, + { + "advantages": 8.18322206441735e-08, + "advantages_std": 1.5040780901908875, + "clip_ratio": 0.0, + "completion_length": 86.69940567016602, + "epoch": 1.8353383458646615, + "grad_norm": 9.125, + "kl": 0.32429891228675845, + "learning_rate": 4.0827067669172936e-06, + "loss": 0.037, + "num_tokens": 74831879.0, + "reward": -1.7913677096366882, + "reward_std": 6.463911771774292, + "rewards/get_chromagram_reward": 0.6092451333999633, + "rewards/get_chromagram_reward_std": 0.11632038056850433, + "rewards/get_intelligibility_reward": -5.965150308609009, + "rewards/get_intelligibility_reward_std": 9.741594982147216, + "rewards/get_target_len_reward": -0.01819766601547599, + "rewards/get_target_len_reward_std": 0.05161431562155485, + "step": 2440 + }, + { + "advantages": -1.6440948584772742e-07, + "advantages_std": 1.6510832905769348, + "clip_ratio": 0.0, + "completion_length": 87.42916793823242, + "epoch": 1.842857142857143, + "grad_norm": 8.9375, + "kl": 0.28726502507925034, + "learning_rate": 4.078947368421053e-06, + "loss": 0.0282, + "num_tokens": 75139614.0, + "reward": -1.8547417521476746, + "reward_std": 7.167141103744507, + "rewards/get_chromagram_reward": 0.6293876945972443, + "rewards/get_chromagram_reward_std": 0.10446088090538978, + "rewards/get_intelligibility_reward": -6.176269102096557, + "rewards/get_intelligibility_reward_std": 11.209553623199463, + "rewards/get_target_len_reward": -0.017343681119382382, + "rewards/get_target_len_reward_std": 0.05335175041109323, + "step": 2450 + }, + { + "advantages": -3.521641332326908e-07, + "advantages_std": 1.5947361826896667, + "clip_ratio": 0.0, + "completion_length": 84.67024002075195, + "epoch": 1.850375939849624, + "grad_norm": 6.75, + "kl": 0.32163482904434204, + "learning_rate": 4.075187969924813e-06, + "loss": 0.0303, + "num_tokens": 75439855.0, + "reward": -1.5484479904174804, + "reward_std": 6.759703636169434, + "rewards/get_chromagram_reward": 0.620575338602066, + "rewards/get_chromagram_reward_std": 0.11032437160611153, + "rewards/get_intelligibility_reward": -5.249281632900238, + "rewards/get_intelligibility_reward_std": 10.744479370117187, + "rewards/get_target_len_reward": -0.01663746191188693, + "rewards/get_target_len_reward_std": 0.03786009326577187, + "step": 2460 + }, + { + "advantages": 1.6838313658951166e-07, + "advantages_std": 1.5318275094032288, + "clip_ratio": 0.0, + "completion_length": 83.47559661865235, + "epoch": 1.8578947368421053, + "grad_norm": 6.0625, + "kl": 0.3126231297850609, + "learning_rate": 4.071428571428572e-06, + "loss": 0.0333, + "num_tokens": 75737651.0, + "reward": -1.676826250553131, + "reward_std": 6.993101596832275, + "rewards/get_chromagram_reward": 0.6129578173160553, + "rewards/get_chromagram_reward_std": 0.09878090545535087, + "rewards/get_intelligibility_reward": -5.626195979118347, + "rewards/get_intelligibility_reward_std": 11.033592128753662, + "rewards/get_target_len_reward": -0.017240209318697453, + "rewards/get_target_len_reward_std": 0.049395473673939705, + "step": 2470 + }, + { + "advantages": 2.471109095125712e-08, + "advantages_std": 1.6018425345420837, + "clip_ratio": 0.0, + "completion_length": 83.61845397949219, + "epoch": 1.8654135338345865, + "grad_norm": 6.0625, + "kl": 0.3190310463309288, + "learning_rate": 4.067669172932331e-06, + "loss": 0.037, + "num_tokens": 76035766.0, + "reward": -1.5238053441047668, + "reward_std": 6.46599850654602, + "rewards/get_chromagram_reward": 0.6349402129650116, + "rewards/get_chromagram_reward_std": 0.12203380763530731, + "rewards/get_intelligibility_reward": -5.180359315872193, + "rewards/get_intelligibility_reward_std": 10.185456657409668, + "rewards/get_target_len_reward": -0.025996736809611322, + "rewards/get_target_len_reward_std": 0.06878238022327424, + "step": 2480 + }, + { + "advantages": 7.698935178268585e-09, + "advantages_std": 1.583086097240448, + "clip_ratio": 0.0, + "completion_length": 89.21904907226562, + "epoch": 1.8729323308270676, + "grad_norm": 5.4375, + "kl": 0.2938147783279419, + "learning_rate": 4.063909774436091e-06, + "loss": 0.0361, + "num_tokens": 76348903.0, + "reward": -1.237100750207901, + "reward_std": 6.340283393859863, + "rewards/get_chromagram_reward": 0.6105383217334748, + "rewards/get_chromagram_reward_std": 0.10293650925159455, + "rewards/get_intelligibility_reward": -4.298519229888916, + "rewards/get_intelligibility_reward_std": 10.241126346588135, + "rewards/get_target_len_reward": -0.0233210857026279, + "rewards/get_target_len_reward_std": 0.0777428038418293, + "step": 2490 + }, + { + "advantages": 3.899138931728885e-08, + "advantages_std": 1.5943516135215758, + "clip_ratio": 0.0, + "completion_length": 86.84702606201172, + "epoch": 1.8804511278195488, + "grad_norm": 368.0, + "kl": 0.35679852962493896, + "learning_rate": 4.06015037593985e-06, + "loss": 0.0407, + "num_tokens": 76655596.0, + "reward": -1.6571479380130767, + "reward_std": 7.034072685241699, + "rewards/get_chromagram_reward": 0.6275037288665771, + "rewards/get_chromagram_reward_std": 0.11990767642855645, + "rewards/get_intelligibility_reward": -5.577424621582031, + "rewards/get_intelligibility_reward_std": 11.100014400482177, + "rewards/get_target_len_reward": -0.021522563882172108, + "rewards/get_target_len_reward_std": 0.06714740544557571, + "step": 2500 + }, + { + "advantages": -1.862645149230957e-07, + "advantages_std": 1.5596297979354858, + "clip_ratio": 0.0, + "completion_length": 88.28392944335937, + "epoch": 1.88796992481203, + "grad_norm": 9.375, + "kl": 0.4866158485412598, + "learning_rate": 4.056390977443609e-06, + "loss": 0.0532, + "num_tokens": 76965233.0, + "reward": -1.3023281721398234, + "reward_std": 6.401098299026489, + "rewards/get_chromagram_reward": 0.6223603427410126, + "rewards/get_chromagram_reward_std": 0.10559494495391845, + "rewards/get_intelligibility_reward": -4.508207023143768, + "rewards/get_intelligibility_reward_std": 10.275893402099609, + "rewards/get_target_len_reward": -0.02113771978765726, + "rewards/get_target_len_reward_std": 0.07548020184040069, + "step": 2510 + }, + { + "advantages": -3.576279308248331e-08, + "advantages_std": 1.653851580619812, + "clip_ratio": 0.0, + "completion_length": 88.94940643310547, + "epoch": 1.8954887218045111, + "grad_norm": 7.125, + "kl": 0.3362803116440773, + "learning_rate": 4.052631578947368e-06, + "loss": 0.0423, + "num_tokens": 77276883.0, + "reward": -1.6171175594441594, + "reward_std": 6.4608911037445065, + "rewards/get_chromagram_reward": 0.6122646510601044, + "rewards/get_chromagram_reward_std": 0.11121488139033317, + "rewards/get_intelligibility_reward": -5.440471267700195, + "rewards/get_intelligibility_reward_std": 9.968835878372193, + "rewards/get_target_len_reward": -0.0231459632050246, + "rewards/get_target_len_reward_std": 0.0733029767870903, + "step": 2520 + }, + { + "advantages": -2.7803082787158927e-07, + "advantages_std": 1.5271441459655761, + "clip_ratio": 0.0, + "completion_length": 87.99643020629883, + "epoch": 1.9030075187969926, + "grad_norm": 60.0, + "kl": 0.32148386389017103, + "learning_rate": 4.048872180451129e-06, + "loss": 0.0359, + "num_tokens": 77585961.0, + "reward": -1.7723593652248382, + "reward_std": 6.782654476165772, + "rewards/get_chromagram_reward": 0.6239281296730042, + "rewards/get_chromagram_reward_std": 0.12085575759410858, + "rewards/get_intelligibility_reward": -5.919119071960449, + "rewards/get_intelligibility_reward_std": 10.512956428527833, + "rewards/get_target_len_reward": -0.02188691161572933, + "rewards/get_target_len_reward_std": 0.060627135634422305, + "step": 2530 + }, + { + "advantages": 2.081195582093187e-07, + "advantages_std": 1.5499732732772826, + "clip_ratio": 0.0, + "completion_length": 84.20000228881835, + "epoch": 1.9105263157894736, + "grad_norm": 11.9375, + "kl": 0.426358599960804, + "learning_rate": 4.0451127819548875e-06, + "loss": 0.0456, + "num_tokens": 77884894.0, + "reward": -1.5123292624950408, + "reward_std": 6.540386390686035, + "rewards/get_chromagram_reward": 0.6197911500930786, + "rewards/get_chromagram_reward_std": 0.10884481891989709, + "rewards/get_intelligibility_reward": -5.130767846107483, + "rewards/get_intelligibility_reward_std": 10.312138175964355, + "rewards/get_target_len_reward": -0.026010839454829692, + "rewards/get_target_len_reward_std": 0.08144442550837994, + "step": 2540 + }, + { + "advantages": 2.3071964534437938e-07, + "advantages_std": 1.5312702655792236, + "clip_ratio": 0.0, + "completion_length": 89.43274002075195, + "epoch": 1.9180451127819549, + "grad_norm": 25.125, + "kl": 5440.552889862656, + "learning_rate": 4.041353383458647e-06, + "loss": 544.0583, + "num_tokens": 78198326.0, + "reward": -1.667872903123498, + "reward_std": 7.2336784362792965, + "rewards/get_chromagram_reward": 0.6233785390853882, + "rewards/get_chromagram_reward_std": 0.10803897455334663, + "rewards/get_intelligibility_reward": -5.607472121715546, + "rewards/get_intelligibility_reward_std": 11.474693489074706, + "rewards/get_target_len_reward": -0.01952482983469963, + "rewards/get_target_len_reward_std": 0.056231222674250604, + "step": 2550 + }, + { + "advantages": 1.4168521040858194e-07, + "advantages_std": 1.6055476427078248, + "clip_ratio": 0.0, + "completion_length": 87.8934539794922, + "epoch": 1.9255639097744361, + "grad_norm": 7.21875, + "kl": 55.368652729690076, + "learning_rate": 4.037593984962406e-06, + "loss": 5.54, + "num_tokens": 78507568.0, + "reward": -1.5169284701347352, + "reward_std": 6.524043083190918, + "rewards/get_chromagram_reward": 0.6211909174919128, + "rewards/get_chromagram_reward_std": 0.10534756779670715, + "rewards/get_intelligibility_reward": -5.150899171829224, + "rewards/get_intelligibility_reward_std": 10.345247268676758, + "rewards/get_target_len_reward": -0.021076841093599797, + "rewards/get_target_len_reward_std": 0.06926373746246099, + "step": 2560 + }, + { + "advantages": 5.267560624133694e-07, + "advantages_std": 1.4992515563964843, + "clip_ratio": 0.0, + "completion_length": 86.12797622680664, + "epoch": 1.9330827067669172, + "grad_norm": 12.8125, + "kl": 0.3306959331035614, + "learning_rate": 4.033834586466166e-06, + "loss": 0.0387, + "num_tokens": 78812057.0, + "reward": -1.651641035079956, + "reward_std": 6.678805828094482, + "rewards/get_chromagram_reward": 0.6241105377674103, + "rewards/get_chromagram_reward_std": 0.1160986490547657, + "rewards/get_intelligibility_reward": -5.554108786582947, + "rewards/get_intelligibility_reward_std": 10.46739559173584, + "rewards/get_target_len_reward": -0.024924515280872583, + "rewards/get_target_len_reward_std": 0.06897499226033688, + "step": 2570 + }, + { + "advantages": -2.997617171374145e-07, + "advantages_std": 1.566463255882263, + "clip_ratio": 0.0, + "completion_length": 88.16309661865235, + "epoch": 1.9406015037593987, + "grad_norm": 5.90625, + "kl": 0.4035792797803879, + "learning_rate": 4.030075187969925e-06, + "loss": 0.0422, + "num_tokens": 79122637.0, + "reward": -1.3191568836569787, + "reward_std": 6.7891851425170895, + "rewards/get_chromagram_reward": 0.6245894730091095, + "rewards/get_chromagram_reward_std": 0.12556827813386917, + "rewards/get_intelligibility_reward": -4.558280682563781, + "rewards/get_intelligibility_reward_std": 10.986210060119628, + "rewards/get_target_len_reward": -0.023779175989329816, + "rewards/get_target_len_reward_std": 0.06274551041424274, + "step": 2580 + }, + { + "advantages": -3.576277929351335e-08, + "advantages_std": 1.5424383997917175, + "clip_ratio": 0.0, + "completion_length": 86.0434555053711, + "epoch": 1.9481203007518797, + "grad_norm": 5.25, + "kl": 0.4115982368588448, + "learning_rate": 4.026315789473684e-06, + "loss": 0.0447, + "num_tokens": 79426425.0, + "reward": -1.7672771275043488, + "reward_std": 7.219090843200684, + "rewards/get_chromagram_reward": 0.6099259614944458, + "rewards/get_chromagram_reward_std": 0.09907660037279128, + "rewards/get_intelligibility_reward": -5.89415819644928, + "rewards/get_intelligibility_reward_std": 11.30065450668335, + "rewards/get_target_len_reward": -0.017598634399473668, + "rewards/get_target_len_reward_std": 0.05380655974149704, + "step": 2590 + }, + { + "advantages": -4.5175355865012536e-07, + "advantages_std": 1.6156995177268982, + "clip_ratio": 0.0, + "completion_length": 85.45476379394532, + "epoch": 1.955639097744361, + "grad_norm": 9.1875, + "kl": 0.3221691817045212, + "learning_rate": 4.022556390977444e-06, + "loss": 0.0364, + "num_tokens": 79729374.0, + "reward": -1.970957136154175, + "reward_std": 6.993637561798096, + "rewards/get_chromagram_reward": 0.6113289833068848, + "rewards/get_chromagram_reward_std": 0.11882436200976372, + "rewards/get_intelligibility_reward": -6.500963306427002, + "rewards/get_intelligibility_reward_std": 10.691209602355958, + "rewards/get_target_len_reward": -0.023236696422100068, + "rewards/get_target_len_reward_std": 0.0706200659275055, + "step": 2600 + }, + { + "advantages": -3.07212287253833e-07, + "advantages_std": 1.628948163986206, + "clip_ratio": 0.0, + "completion_length": 86.0678581237793, + "epoch": 1.9631578947368422, + "grad_norm": 8.5625, + "kl": 1.3464050814509392, + "learning_rate": 4.018796992481203e-06, + "loss": 0.1366, + "num_tokens": 80033923.0, + "reward": -1.6657975971698762, + "reward_std": 6.801527500152588, + "rewards/get_chromagram_reward": 0.6163083136081695, + "rewards/get_chromagram_reward_std": 0.1146535836160183, + "rewards/get_intelligibility_reward": -5.594403171539307, + "rewards/get_intelligibility_reward_std": 10.669664001464843, + "rewards/get_target_len_reward": -0.019297726918011904, + "rewards/get_target_len_reward_std": 0.05750475097447634, + "step": 2610 + }, + { + "advantages": -2.3345150879094944e-08, + "advantages_std": 1.7088413119316102, + "clip_ratio": 0.0, + "completion_length": 82.7011932373047, + "epoch": 1.9706766917293232, + "grad_norm": 8.125, + "kl": 0.34609042257070544, + "learning_rate": 4.015037593984963e-06, + "loss": 0.0435, + "num_tokens": 80328255.0, + "reward": -1.7058481693267822, + "reward_std": 6.336285066604614, + "rewards/get_chromagram_reward": 0.6239156484603882, + "rewards/get_chromagram_reward_std": 0.10946919023990631, + "rewards/get_intelligibility_reward": -5.717773270606995, + "rewards/get_intelligibility_reward_std": 9.749144554138184, + "rewards/get_target_len_reward": -0.023686547577381135, + "rewards/get_target_len_reward_std": 0.06974872462451458, + "step": 2620 + }, + { + "advantages": -9.05245698845647e-08, + "advantages_std": 1.530289900302887, + "clip_ratio": 0.0, + "completion_length": 88.38690643310547, + "epoch": 1.9781954887218045, + "grad_norm": 8.5625, + "kl": 0.28629245460033415, + "learning_rate": 4.011278195488722e-06, + "loss": 0.029, + "num_tokens": 80638435.0, + "reward": -1.2314883843064308, + "reward_std": 6.625270700454712, + "rewards/get_chromagram_reward": 0.6178261160850524, + "rewards/get_chromagram_reward_std": 0.10950247719883918, + "rewards/get_intelligibility_reward": -4.294896459579467, + "rewards/get_intelligibility_reward_std": 10.598248863220215, + "rewards/get_target_len_reward": -0.017394708935171366, + "rewards/get_target_len_reward_std": 0.04538180362433195, + "step": 2630 + }, + { + "advantages": 1.9421179899836716e-07, + "advantages_std": 1.732315731048584, + "clip_ratio": 0.0, + "completion_length": 91.10000228881836, + "epoch": 1.9857142857142858, + "grad_norm": 872.0, + "kl": 26.47857711613178, + "learning_rate": 4.007518796992481e-06, + "loss": 2.6515, + "num_tokens": 80956909.0, + "reward": -1.3893529994413256, + "reward_std": 6.811365032196045, + "rewards/get_chromagram_reward": 0.631500905752182, + "rewards/get_chromagram_reward_std": 0.11041677147150039, + "rewards/get_intelligibility_reward": -4.777693957090378, + "rewards/get_intelligibility_reward_std": 10.829482650756836, + "rewards/get_target_len_reward": -0.02186558600515127, + "rewards/get_target_len_reward_std": 0.05735799949616194, + "step": 2640 + }, + { + "advantages": -8.928279369158076e-08, + "advantages_std": 1.5102097868919373, + "clip_ratio": 0.0, + "completion_length": 88.96607208251953, + "epoch": 1.9932330827067668, + "grad_norm": 65.5, + "kl": 0.3440512865781784, + "learning_rate": 4.003759398496241e-06, + "loss": 0.0355, + "num_tokens": 81269102.0, + "reward": -1.2409762933850288, + "reward_std": 6.411077070236206, + "rewards/get_chromagram_reward": 0.6272790014743805, + "rewards/get_chromagram_reward_std": 0.1086908034980297, + "rewards/get_intelligibility_reward": -4.325918501615524, + "rewards/get_intelligibility_reward_std": 10.257991981506347, + "rewards/get_target_len_reward": -0.024289328791201114, + "rewards/get_target_len_reward_std": 0.06517439857125282, + "step": 2650 + }, + { + "advantages": -1.0939936316844978e-07, + "advantages_std": 1.575232243537903, + "clip_ratio": 0.0, + "completion_length": 84.14226303100585, + "epoch": 2.001503759398496, + "grad_norm": 5.25, + "kl": 1.0052132874727249, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1059, + "num_tokens": 81572089.0, + "reward": -1.6181729942560197, + "reward_std": 6.623514938354492, + "rewards/get_chromagram_reward": 0.6281363487243652, + "rewards/get_chromagram_reward_std": 0.11101439595222473, + "rewards/get_intelligibility_reward": -5.455703794956207, + "rewards/get_intelligibility_reward_std": 10.291347360610962, + "rewards/get_target_len_reward": -0.02695103920996189, + "rewards/get_target_len_reward_std": 0.07068178877234459, + "step": 2660 + }, + { + "advantages": 1.2417700645528385e-09, + "advantages_std": 1.5857953310012818, + "clip_ratio": 0.0, + "completion_length": 83.08214492797852, + "epoch": 2.0090225563909776, + "grad_norm": 6.34375, + "kl": 0.29621861577034, + "learning_rate": 3.9962406015037595e-06, + "loss": 0.0321, + "num_tokens": 81868085.0, + "reward": -1.6860800623893737, + "reward_std": 6.562015771865845, + "rewards/get_chromagram_reward": 0.6251458704471589, + "rewards/get_chromagram_reward_std": 0.11826840862631798, + "rewards/get_intelligibility_reward": -5.666773128509521, + "rewards/get_intelligibility_reward_std": 10.22925615310669, + "rewards/get_target_len_reward": -0.016612501721829175, + "rewards/get_target_len_reward_std": 0.050476128607988356, + "step": 2670 + }, + { + "advantages": 7.829318747099023e-07, + "advantages_std": 1.627654242515564, + "clip_ratio": 0.0, + "completion_length": 84.9095245361328, + "epoch": 2.0165413533834586, + "grad_norm": 93.5, + "kl": 0.3101460263133049, + "learning_rate": 3.992481203007519e-06, + "loss": 0.0334, + "num_tokens": 82169486.0, + "reward": -1.5961270749568939, + "reward_std": 6.337138366699219, + "rewards/get_chromagram_reward": 0.6360050141811371, + "rewards/get_chromagram_reward_std": 0.11316153407096863, + "rewards/get_intelligibility_reward": -5.403832316398621, + "rewards/get_intelligibility_reward_std": 9.89303879737854, + "rewards/get_target_len_reward": -0.020553555525839328, + "rewards/get_target_len_reward_std": 0.05809407290071249, + "step": 2680 + }, + { + "advantages": 4.1648745252587105e-07, + "advantages_std": 1.6060652375221252, + "clip_ratio": 0.0, + "completion_length": 86.80476303100586, + "epoch": 2.0240601503759397, + "grad_norm": 9.4375, + "kl": 0.3532222270965576, + "learning_rate": 3.988721804511279e-06, + "loss": 0.0344, + "num_tokens": 82476181.0, + "reward": -1.5671575158834457, + "reward_std": 6.547077798843384, + "rewards/get_chromagram_reward": 0.6348513245582581, + "rewards/get_chromagram_reward_std": 0.10924804285168647, + "rewards/get_intelligibility_reward": -5.3163529396057125, + "rewards/get_intelligibility_reward_std": 10.257298564910888, + "rewards/get_target_len_reward": -0.019970581401139498, + "rewards/get_target_len_reward_std": 0.05366719178855419, + "step": 2690 + }, + { + "advantages": 1.631677217783789e-07, + "advantages_std": 1.6509637475013732, + "clip_ratio": 0.0, + "completion_length": 83.5428596496582, + "epoch": 2.031578947368421, + "grad_norm": 16.0, + "kl": 0.31794759780168536, + "learning_rate": 3.9849624060150376e-06, + "loss": 0.041, + "num_tokens": 82773681.0, + "reward": -1.5188110053539277, + "reward_std": 6.337612199783325, + "rewards/get_chromagram_reward": 0.6124212205410003, + "rewards/get_chromagram_reward_std": 0.10746575593948364, + "rewards/get_intelligibility_reward": -5.145204424858093, + "rewards/get_intelligibility_reward_std": 9.923934173583984, + "rewards/get_target_len_reward": -0.023649390833452345, + "rewards/get_target_len_reward_std": 0.07802547551691533, + "step": 2700 + }, + { + "advantages": 6.283323514821859e-08, + "advantages_std": 1.6156264424324036, + "clip_ratio": 0.0, + "completion_length": 86.15595321655273, + "epoch": 2.039097744360902, + "grad_norm": 7.0, + "kl": 0.2870593532919884, + "learning_rate": 3.981203007518797e-06, + "loss": 0.0345, + "num_tokens": 83077770.0, + "reward": -1.752515721321106, + "reward_std": 6.815318632125854, + "rewards/get_chromagram_reward": 0.6136973381042481, + "rewards/get_chromagram_reward_std": 0.12048554718494416, + "rewards/get_intelligibility_reward": -5.849648785591126, + "rewards/get_intelligibility_reward_std": 10.635982704162597, + "rewards/get_target_len_reward": -0.021595498360693455, + "rewards/get_target_len_reward_std": 0.06771521810442209, + "step": 2710 + }, + { + "advantages": 7.996956128408783e-08, + "advantages_std": 1.5599609971046449, + "clip_ratio": 0.0, + "completion_length": 87.32440719604492, + "epoch": 2.0466165413533837, + "grad_norm": 9.5625, + "kl": 0.3296610161662102, + "learning_rate": 3.977443609022557e-06, + "loss": 0.0379, + "num_tokens": 83386010.0, + "reward": -1.3757877141237258, + "reward_std": 6.675065898895264, + "rewards/get_chromagram_reward": 0.6006149351596832, + "rewards/get_chromagram_reward_std": 0.11269161626696586, + "rewards/get_intelligibility_reward": -4.707658588886261, + "rewards/get_intelligibility_reward_std": 10.665964126586914, + "rewards/get_target_len_reward": -0.02031925953924656, + "rewards/get_target_len_reward_std": 0.0578795462846756, + "step": 2720 + }, + { + "advantages": -3.005068265338195e-08, + "advantages_std": 1.6625032067298888, + "clip_ratio": 0.0, + "completion_length": 81.84345474243165, + "epoch": 2.0541353383458647, + "grad_norm": 5.625, + "kl": 0.32018242329359053, + "learning_rate": 3.9736842105263165e-06, + "loss": 0.0344, + "num_tokens": 83679359.0, + "reward": -1.37432102560997, + "reward_std": 6.967769908905029, + "rewards/get_chromagram_reward": 0.6192000329494476, + "rewards/get_chromagram_reward_std": 0.12201750725507736, + "rewards/get_intelligibility_reward": -4.716173662245273, + "rewards/get_intelligibility_reward_std": 11.24532117843628, + "rewards/get_target_len_reward": -0.025989135075360537, + "rewards/get_target_len_reward_std": 0.07435446102172136, + "step": 2730 + }, + { + "advantages": -9.636084961073265e-08, + "advantages_std": 1.5627854466438293, + "clip_ratio": 0.0, + "completion_length": 91.02619171142578, + "epoch": 2.0616541353383457, + "grad_norm": 170.0, + "kl": 0.3474865362048149, + "learning_rate": 3.969924812030075e-06, + "loss": 0.0374, + "num_tokens": 83997524.0, + "reward": -1.208175851404667, + "reward_std": 6.546644783020019, + "rewards/get_chromagram_reward": 0.6278281271457672, + "rewards/get_chromagram_reward_std": 0.1091654047369957, + "rewards/get_intelligibility_reward": -4.231800414249301, + "rewards/get_intelligibility_reward_std": 10.602706050872802, + "rewards/get_target_len_reward": -0.020555054116994143, + "rewards/get_target_len_reward_std": 0.05810644961893559, + "step": 2740 + }, + { + "advantages": -2.4835263445766032e-08, + "advantages_std": 1.731394600868225, + "clip_ratio": 0.0, + "completion_length": 84.61904983520508, + "epoch": 2.069172932330827, + "grad_norm": 35.75, + "kl": 0.3340544059872627, + "learning_rate": 3.966165413533835e-06, + "loss": 0.0418, + "num_tokens": 84297845.0, + "reward": -1.816446018218994, + "reward_std": 6.996577215194702, + "rewards/get_chromagram_reward": 0.6161438524723053, + "rewards/get_chromagram_reward_std": 0.11226251423358917, + "rewards/get_intelligibility_reward": -6.046895384788513, + "rewards/get_intelligibility_reward_std": 10.867814064025879, + "rewards/get_target_len_reward": -0.018586322385817765, + "rewards/get_target_len_reward_std": 0.06211254261434078, + "step": 2750 + }, + { + "advantages": 5.16573589948166e-07, + "advantages_std": 1.553725790977478, + "clip_ratio": 0.0, + "completion_length": 86.25059814453125, + "epoch": 2.0766917293233083, + "grad_norm": 6.0, + "kl": 0.34573080837726594, + "learning_rate": 3.9624060150375946e-06, + "loss": 0.0424, + "num_tokens": 84602520.0, + "reward": -1.4252728760242461, + "reward_std": 6.637792301177979, + "rewards/get_chromagram_reward": 0.639812707901001, + "rewards/get_chromagram_reward_std": 0.11356036961078644, + "rewards/get_intelligibility_reward": -4.889979219436645, + "rewards/get_intelligibility_reward_std": 10.599310779571534, + "rewards/get_target_len_reward": -0.025651910994201898, + "rewards/get_target_len_reward_std": 0.07295588366687297, + "step": 2760 + }, + { + "advantages": 2.972161013303776e-07, + "advantages_std": 1.388267707824707, + "clip_ratio": 0.0, + "completion_length": 86.68154907226562, + "epoch": 2.0842105263157893, + "grad_norm": 54.5, + "kl": 0.3174739718437195, + "learning_rate": 3.958646616541354e-06, + "loss": 0.0344, + "num_tokens": 84908029.0, + "reward": -1.9035109996795654, + "reward_std": 7.167625379562378, + "rewards/get_chromagram_reward": 0.6314962983131409, + "rewards/get_chromagram_reward_std": 0.10467702820897103, + "rewards/get_intelligibility_reward": -6.32180278301239, + "rewards/get_intelligibility_reward_std": 10.970943498611451, + "rewards/get_target_len_reward": -0.02022632034495473, + "rewards/get_target_len_reward_std": 0.050012038089334965, + "step": 2770 + }, + { + "advantages": -1.9321838635733e-07, + "advantages_std": 1.6621986269950866, + "clip_ratio": 0.0, + "completion_length": 87.88869171142578, + "epoch": 2.0917293233082708, + "grad_norm": 8.25, + "kl": 0.36265345960855483, + "learning_rate": 3.954887218045113e-06, + "loss": 0.0378, + "num_tokens": 85217100.0, + "reward": -1.5520632922649384, + "reward_std": 6.731218671798706, + "rewards/get_chromagram_reward": 0.6277051866054535, + "rewards/get_chromagram_reward_std": 0.10972578823566437, + "rewards/get_intelligibility_reward": -5.26589868068695, + "rewards/get_intelligibility_reward_std": 10.702633094787597, + "rewards/get_target_len_reward": -0.01799613079056144, + "rewards/get_target_len_reward_std": 0.049734361842274664, + "step": 2780 + }, + { + "advantages": 7.376073654086213e-08, + "advantages_std": 1.509410297870636, + "clip_ratio": 0.0, + "completion_length": 88.70059661865234, + "epoch": 2.099248120300752, + "grad_norm": 9.25, + "kl": 0.3428507328033447, + "learning_rate": 3.951127819548873e-06, + "loss": 0.0368, + "num_tokens": 85528743.0, + "reward": -1.1163109362125396, + "reward_std": 6.350103139877319, + "rewards/get_chromagram_reward": 0.6196650564670563, + "rewards/get_chromagram_reward_std": 0.10964875966310501, + "rewards/get_intelligibility_reward": -3.947629976272583, + "rewards/get_intelligibility_reward_std": 10.319130992889404, + "rewards/get_target_len_reward": -0.02096777716651559, + "rewards/get_target_len_reward_std": 0.06917067337781191, + "step": 2790 + }, + { + "advantages": -4.4753156434040877e-07, + "advantages_std": 1.6299754142761231, + "clip_ratio": 0.0, + "completion_length": 87.36190567016601, + "epoch": 2.1067669172932333, + "grad_norm": 9.6875, + "kl": 0.3407274499535561, + "learning_rate": 3.947368421052632e-06, + "loss": 0.0433, + "num_tokens": 85837424.0, + "reward": -1.3424668543040752, + "reward_std": 6.6862287521362305, + "rewards/get_chromagram_reward": 0.6276401698589325, + "rewards/get_chromagram_reward_std": 0.10829192474484443, + "rewards/get_intelligibility_reward": -4.629968780279159, + "rewards/get_intelligibility_reward_std": 10.745279312133789, + "rewards/get_target_len_reward": -0.025071771629154683, + "rewards/get_target_len_reward_std": 0.07464848496019841, + "step": 2800 + }, + { + "advantages": 2.4065376322823797e-07, + "advantages_std": 1.612348747253418, + "clip_ratio": 0.0, + "completion_length": 85.93452529907226, + "epoch": 2.1142857142857143, + "grad_norm": 170.0, + "kl": 0.3182450085878372, + "learning_rate": 3.943609022556391e-06, + "loss": 0.039, + "num_tokens": 86141792.0, + "reward": -1.6066758632659912, + "reward_std": 7.027578115463257, + "rewards/get_chromagram_reward": 0.6162883937358856, + "rewards/get_chromagram_reward_std": 0.12795912325382233, + "rewards/get_intelligibility_reward": -5.414373850822448, + "rewards/get_intelligibility_reward_std": 11.074428367614747, + "rewards/get_target_len_reward": -0.02194185955449939, + "rewards/get_target_len_reward_std": 0.06819509305059909, + "step": 2810 + }, + { + "advantages": 4.0705006405516996e-07, + "advantages_std": 1.6682619452476501, + "clip_ratio": 0.0, + "completion_length": 87.75238342285157, + "epoch": 2.1218045112781954, + "grad_norm": 6.34375, + "kl": 0.32629688531160356, + "learning_rate": 3.939849624060151e-06, + "loss": 0.0405, + "num_tokens": 86450610.0, + "reward": -1.5439964354038238, + "reward_std": 6.698471546173096, + "rewards/get_chromagram_reward": 0.6298602938652038, + "rewards/get_chromagram_reward_std": 0.11802728474140167, + "rewards/get_intelligibility_reward": -5.238238763809204, + "rewards/get_intelligibility_reward_std": 10.606562519073487, + "rewards/get_target_len_reward": -0.023610591888427734, + "rewards/get_target_len_reward_std": 0.06324613895267248, + "step": 2820 + }, + { + "advantages": -5.220373740399964e-07, + "advantages_std": 1.566174530982971, + "clip_ratio": 0.0, + "completion_length": 86.38928833007813, + "epoch": 2.129323308270677, + "grad_norm": 10.8125, + "kl": 0.5685814306139946, + "learning_rate": 3.9360902255639095e-06, + "loss": 0.0659, + "num_tokens": 86755590.0, + "reward": -1.7503463923931122, + "reward_std": 6.914837121963501, + "rewards/get_chromagram_reward": 0.6366630434989929, + "rewards/get_chromagram_reward_std": 0.12139641642570495, + "rewards/get_intelligibility_reward": -5.858629488945008, + "rewards/get_intelligibility_reward_std": 10.730266666412353, + "rewards/get_target_len_reward": -0.029072243347764017, + "rewards/get_target_len_reward_std": 0.07542071975767613, + "step": 2830 + }, + { + "advantages": 1.614292443719023e-07, + "advantages_std": 1.549390870332718, + "clip_ratio": 0.0, + "completion_length": 87.40119247436523, + "epoch": 2.136842105263158, + "grad_norm": 24.0, + "kl": 0.3246032640337944, + "learning_rate": 3.93233082706767e-06, + "loss": 0.0372, + "num_tokens": 87063339.0, + "reward": -1.4028959453105927, + "reward_std": 6.6174252986907955, + "rewards/get_chromagram_reward": 0.6235669672489166, + "rewards/get_chromagram_reward_std": 0.11671028062701225, + "rewards/get_intelligibility_reward": -4.811532521247864, + "rewards/get_intelligibility_reward_std": 10.584142017364503, + "rewards/get_target_len_reward": -0.020721999648958444, + "rewards/get_target_len_reward_std": 0.06385829038918019, + "step": 2840 + }, + { + "advantages": -2.299746029166272e-07, + "advantages_std": 1.6184433102607727, + "clip_ratio": 0.0, + "completion_length": 86.2208351135254, + "epoch": 2.144360902255639, + "grad_norm": 7.3125, + "kl": 0.30331481248140335, + "learning_rate": 3.928571428571429e-06, + "loss": 0.0306, + "num_tokens": 87368626.0, + "reward": -1.2707931637763976, + "reward_std": 6.4112237930297855, + "rewards/get_chromagram_reward": 0.6089653193950653, + "rewards/get_chromagram_reward_std": 0.11852159649133683, + "rewards/get_intelligibility_reward": -4.396771430969238, + "rewards/get_intelligibility_reward_std": 10.385689878463745, + "rewards/get_target_len_reward": -0.02457326604053378, + "rewards/get_target_len_reward_std": 0.0729073267430067, + "step": 2850 + }, + { + "advantages": -2.3146470482515724e-07, + "advantages_std": 1.4920684814453125, + "clip_ratio": 0.0, + "completion_length": 88.57321624755859, + "epoch": 2.1518796992481204, + "grad_norm": 5.6875, + "kl": 0.2971339821815491, + "learning_rate": 3.9248120300751885e-06, + "loss": 0.0382, + "num_tokens": 87679743.0, + "reward": -1.4728783011436462, + "reward_std": 6.793437814712524, + "rewards/get_chromagram_reward": 0.6393875896930694, + "rewards/get_chromagram_reward_std": 0.11733865663409233, + "rewards/get_intelligibility_reward": -5.031013822555542, + "rewards/get_intelligibility_reward_std": 10.67188892364502, + "rewards/get_target_len_reward": -0.027008223440498115, + "rewards/get_target_len_reward_std": 0.08672410566359759, + "step": 2860 + }, + { + "advantages": -6.073465172562465e-07, + "advantages_std": 1.5244192004203796, + "clip_ratio": 0.0, + "completion_length": 86.31845474243164, + "epoch": 2.1593984962406014, + "grad_norm": 4.9375, + "kl": 0.3206807836890221, + "learning_rate": 3.921052631578947e-06, + "loss": 0.0363, + "num_tokens": 87984918.0, + "reward": -1.6187335789203643, + "reward_std": 6.374583053588867, + "rewards/get_chromagram_reward": 0.6180616199970246, + "rewards/get_chromagram_reward_std": 0.11240240931510925, + "rewards/get_intelligibility_reward": -5.454882073402405, + "rewards/get_intelligibility_reward_std": 9.929090690612792, + "rewards/get_target_len_reward": -0.019380014995113014, + "rewards/get_target_len_reward_std": 0.05369943529367447, + "step": 2870 + }, + { + "advantages": -3.774961040647895e-07, + "advantages_std": 1.7331120729446412, + "clip_ratio": 0.0, + "completion_length": 86.33511962890626, + "epoch": 2.166917293233083, + "grad_norm": 5.1875, + "kl": 9068.70393010974, + "learning_rate": 3.917293233082707e-06, + "loss": 906.8738, + "num_tokens": 88290049.0, + "reward": -0.9701588183641434, + "reward_std": 6.336407804489136, + "rewards/get_chromagram_reward": 0.6315642714500427, + "rewards/get_chromagram_reward_std": 0.12316482216119766, + "rewards/get_intelligibility_reward": -3.5188528180122374, + "rewards/get_intelligibility_reward_std": 10.371418666839599, + "rewards/get_target_len_reward": -0.02318771481513977, + "rewards/get_target_len_reward_std": 0.058829471841454505, + "step": 2880 + }, + { + "advantages": -4.755953710855465e-08, + "advantages_std": 1.6271605849266053, + "clip_ratio": 0.0, + "completion_length": 85.03988265991211, + "epoch": 2.174436090225564, + "grad_norm": 6.40625, + "kl": 0.3325976699590683, + "learning_rate": 3.9135338345864666e-06, + "loss": 0.0346, + "num_tokens": 88591385.0, + "reward": -1.823885554075241, + "reward_std": 6.934875011444092, + "rewards/get_chromagram_reward": 0.6075676620006562, + "rewards/get_chromagram_reward_std": 0.10233750753104687, + "rewards/get_intelligibility_reward": -6.061479997634888, + "rewards/get_intelligibility_reward_std": 10.791695308685302, + "rewards/get_target_len_reward": -0.017743840347975492, + "rewards/get_target_len_reward_std": 0.05164923332631588, + "step": 2890 + }, + { + "advantages": 3.4285089807184476e-07, + "advantages_std": 1.563487422466278, + "clip_ratio": 0.0, + "completion_length": 88.96369171142578, + "epoch": 2.181954887218045, + "grad_norm": 194.0, + "kl": 0.30613133758306504, + "learning_rate": 3.909774436090225e-06, + "loss": 0.0326, + "num_tokens": 88904159.0, + "reward": -1.0743597209453584, + "reward_std": 6.527523565292358, + "rewards/get_chromagram_reward": 0.6418916761875153, + "rewards/get_chromagram_reward_std": 0.11317120790481568, + "rewards/get_intelligibility_reward": -3.8442397631704806, + "rewards/get_intelligibility_reward_std": 10.655349826812744, + "rewards/get_target_len_reward": -0.020730842463672162, + "rewards/get_target_len_reward_std": 0.05648756790906191, + "step": 2900 + }, + { + "advantages": 3.5390256698519805e-07, + "advantages_std": 1.5687660932540894, + "clip_ratio": 0.0, + "completion_length": 88.92916946411133, + "epoch": 2.1894736842105265, + "grad_norm": 79360.0, + "kl": 23.718442597985266, + "learning_rate": 3.906015037593985e-06, + "loss": 2.3749, + "num_tokens": 89216874.0, + "reward": -0.9435413286089898, + "reward_std": 6.158167028427124, + "rewards/get_chromagram_reward": 0.6337945759296417, + "rewards/get_chromagram_reward_std": 0.12330271378159523, + "rewards/get_intelligibility_reward": -3.438680863380432, + "rewards/get_intelligibility_reward_std": 10.161257123947143, + "rewards/get_target_len_reward": -0.02573751602321863, + "rewards/get_target_len_reward_std": 0.06352040991187095, + "step": 2910 + }, + { + "advantages": 7.947287627985134e-09, + "advantages_std": 1.7121564388275146, + "clip_ratio": 0.0, + "completion_length": 86.64881057739258, + "epoch": 2.1969924812030075, + "grad_norm": 6.03125, + "kl": 0.4889916032552719, + "learning_rate": 3.902255639097745e-06, + "loss": 0.0531, + "num_tokens": 89523055.0, + "reward": -1.565059586800635, + "reward_std": 6.954021406173706, + "rewards/get_chromagram_reward": 0.607182401418686, + "rewards/get_chromagram_reward_std": 0.09809157475829125, + "rewards/get_intelligibility_reward": -5.282276725769043, + "rewards/get_intelligibility_reward_std": 10.995676708221435, + "rewards/get_target_len_reward": -0.02008410422131419, + "rewards/get_target_len_reward_std": 0.06095631066709757, + "step": 2920 + }, + { + "advantages": -5.918244847968878e-07, + "advantages_std": 1.6808531880378723, + "clip_ratio": 0.0, + "completion_length": 85.08393020629883, + "epoch": 2.2045112781954885, + "grad_norm": 28.625, + "kl": 0.29349366426467893, + "learning_rate": 3.898496240601504e-06, + "loss": 0.0305, + "num_tokens": 89823971.0, + "reward": -1.5419997453689576, + "reward_std": 6.375377082824707, + "rewards/get_chromagram_reward": 0.6257684767246247, + "rewards/get_chromagram_reward_std": 0.11149628758430481, + "rewards/get_intelligibility_reward": -5.231714677810669, + "rewards/get_intelligibility_reward_std": 9.97823076248169, + "rewards/get_target_len_reward": -0.020052669383585454, + "rewards/get_target_len_reward_std": 0.0618388619273901, + "step": 2930 + }, + { + "advantages": 1.589457234274505e-07, + "advantages_std": 1.594847321510315, + "clip_ratio": 0.0, + "completion_length": 86.02381057739258, + "epoch": 2.21203007518797, + "grad_norm": 15.9375, + "kl": 0.34517409056425097, + "learning_rate": 3.894736842105263e-06, + "loss": 0.0369, + "num_tokens": 90128555.0, + "reward": -1.4202000886201858, + "reward_std": 6.686147880554199, + "rewards/get_chromagram_reward": 0.6276867032051087, + "rewards/get_chromagram_reward_std": 0.12245145216584205, + "rewards/get_intelligibility_reward": -4.86482664346695, + "rewards/get_intelligibility_reward_std": 10.5703914642334, + "rewards/get_target_len_reward": -0.023460079357028006, + "rewards/get_target_len_reward_std": 0.06456102542579174, + "step": 2940 + }, + { + "advantages": 1.5497208067927203e-07, + "advantages_std": 1.538965892791748, + "clip_ratio": 0.0, + "completion_length": 90.55238265991211, + "epoch": 2.219548872180451, + "grad_norm": 5.375, + "kl": 0.3080202296376228, + "learning_rate": 3.890977443609023e-06, + "loss": 0.035, + "num_tokens": 90445397.0, + "reward": -1.2401035517454146, + "reward_std": 6.639403533935547, + "rewards/get_chromagram_reward": 0.6180085897445678, + "rewards/get_chromagram_reward_std": 0.09938773810863495, + "rewards/get_intelligibility_reward": -4.320744025707245, + "rewards/get_intelligibility_reward_std": 10.65583438873291, + "rewards/get_target_len_reward": -0.017574947141110898, + "rewards/get_target_len_reward_std": 0.05353275462985039, + "step": 2950 + }, + { + "advantages": -6.544093196225731e-08, + "advantages_std": 1.6957322597503661, + "clip_ratio": 0.0, + "completion_length": 89.52262115478516, + "epoch": 2.2270676691729325, + "grad_norm": 7.125, + "kl": 0.3561790719628334, + "learning_rate": 3.887218045112782e-06, + "loss": 0.041, + "num_tokens": 90759465.0, + "reward": -1.3227708965539933, + "reward_std": 6.579633474349976, + "rewards/get_chromagram_reward": 0.6285708963871002, + "rewards/get_chromagram_reward_std": 0.1098080925643444, + "rewards/get_intelligibility_reward": -4.573193967342377, + "rewards/get_intelligibility_reward_std": 10.618196725845337, + "rewards/get_target_len_reward": -0.023689321987330914, + "rewards/get_target_len_reward_std": 0.07437594067305327, + "step": 2960 + }, + { + "advantages": -2.3879112136526716e-07, + "advantages_std": 1.5580598294734955, + "clip_ratio": 0.0, + "completion_length": 85.85178680419922, + "epoch": 2.2345864661654136, + "grad_norm": 6.0, + "kl": 0.3017871379852295, + "learning_rate": 3.883458646616542e-06, + "loss": 0.0429, + "num_tokens": 91062979.0, + "reward": -1.7410799086093902, + "reward_std": 6.695904397964478, + "rewards/get_chromagram_reward": 0.6258544147014617, + "rewards/get_chromagram_reward_std": 0.11486873552203178, + "rewards/get_intelligibility_reward": -5.821660828590393, + "rewards/get_intelligibility_reward_std": 10.377565860748291, + "rewards/get_target_len_reward": -0.027432804461568594, + "rewards/get_target_len_reward_std": 0.08908913023769856, + "step": 2970 + }, + { + "advantages": -6.120652017216344e-07, + "advantages_std": 1.558844006061554, + "clip_ratio": 0.0, + "completion_length": 89.43392868041992, + "epoch": 2.2421052631578946, + "grad_norm": 27.25, + "kl": 0.37718722224235535, + "learning_rate": 3.879699248120301e-06, + "loss": 0.0376, + "num_tokens": 91375952.0, + "reward": -1.268139982968569, + "reward_std": 6.343207359313965, + "rewards/get_chromagram_reward": 0.62771937251091, + "rewards/get_chromagram_reward_std": 0.10384307354688645, + "rewards/get_intelligibility_reward": -4.411396241188049, + "rewards/get_intelligibility_reward_std": 10.165166664123536, + "rewards/get_target_len_reward": -0.020742816664278508, + "rewards/get_target_len_reward_std": 0.05104887764900923, + "step": 2980 + }, + { + "advantages": -5.987783431748994e-07, + "advantages_std": 1.6743453860282898, + "clip_ratio": 0.0, + "completion_length": 86.1476203918457, + "epoch": 2.249624060150376, + "grad_norm": 11.75, + "kl": 0.5795433431863785, + "learning_rate": 3.8759398496240605e-06, + "loss": 0.0562, + "num_tokens": 91680564.0, + "reward": -1.6921858012676239, + "reward_std": 7.0761829853057865, + "rewards/get_chromagram_reward": 0.62059086561203, + "rewards/get_chromagram_reward_std": 0.11869660988450051, + "rewards/get_intelligibility_reward": -5.6784823179245, + "rewards/get_intelligibility_reward_std": 11.159746551513672, + "rewards/get_target_len_reward": -0.01866564080119133, + "rewards/get_target_len_reward_std": 0.04305282030254602, + "step": 2990 + }, + { + "advantages": 3.58372920583605e-07, + "advantages_std": 1.5921060204505921, + "clip_ratio": 0.0, + "completion_length": 87.91011962890624, + "epoch": 2.257142857142857, + "grad_norm": 15.3125, + "kl": 0.3640074670314789, + "learning_rate": 3.87218045112782e-06, + "loss": 0.0387, + "num_tokens": 91989900.0, + "reward": -1.9970873475074769, + "reward_std": 7.173796224594116, + "rewards/get_chromagram_reward": 0.6152364611625671, + "rewards/get_chromagram_reward_std": 0.12383458390831947, + "rewards/get_intelligibility_reward": -6.585516786575317, + "rewards/get_intelligibility_reward_std": 10.964896774291992, + "rewards/get_target_len_reward": -0.020981486793607472, + "rewards/get_target_len_reward_std": 0.055330739729106425, + "step": 3000 + }, + { + "advantages": -6.300707894979496e-07, + "advantages_std": 1.5493954777717591, + "clip_ratio": 0.0, + "completion_length": 91.04047775268555, + "epoch": 2.264661654135338, + "grad_norm": 6.59375, + "kl": 0.35806858688592913, + "learning_rate": 3.868421052631579e-06, + "loss": 0.0393, + "num_tokens": 92307862.0, + "reward": -1.3623809725046159, + "reward_std": 6.6515583992004395, + "rewards/get_chromagram_reward": 0.5948490619659423, + "rewards/get_chromagram_reward_std": 0.11022032350301743, + "rewards/get_intelligibility_reward": -4.6636159181594845, + "rewards/get_intelligibility_reward_std": 10.71325330734253, + "rewards/get_target_len_reward": -0.0183757777325809, + "rewards/get_target_len_reward_std": 0.06054406575858593, + "step": 3010 + }, + { + "advantages": -6.794929596765087e-07, + "advantages_std": 1.497561240196228, + "clip_ratio": 0.0, + "completion_length": 83.14404830932617, + "epoch": 2.2721804511278196, + "grad_norm": 8.1875, + "kl": 0.3727316588163376, + "learning_rate": 3.8646616541353386e-06, + "loss": 0.042, + "num_tokens": 92604494.0, + "reward": -1.775395917892456, + "reward_std": 7.0423095703125, + "rewards/get_chromagram_reward": 0.6238544166088105, + "rewards/get_chromagram_reward_std": 0.13135162368416786, + "rewards/get_intelligibility_reward": -5.922222852706909, + "rewards/get_intelligibility_reward_std": 11.05590362548828, + "rewards/get_target_len_reward": -0.027818970568478107, + "rewards/get_target_len_reward_std": 0.075175317004323, + "step": 3020 + }, + { + "advantages": 2.9628475983756175e-07, + "advantages_std": 1.6100945949554444, + "clip_ratio": 0.0, + "completion_length": 84.90952529907227, + "epoch": 2.2796992481203007, + "grad_norm": 6.4375, + "kl": 0.40788850784301756, + "learning_rate": 3.860902255639098e-06, + "loss": 0.0404, + "num_tokens": 92905243.0, + "reward": -1.580957293510437, + "reward_std": 6.97661657333374, + "rewards/get_chromagram_reward": 0.6177934765815735, + "rewards/get_chromagram_reward_std": 0.12050609439611434, + "rewards/get_intelligibility_reward": -5.341689097881317, + "rewards/get_intelligibility_reward_std": 11.07231788635254, + "rewards/get_target_len_reward": -0.01897582933306694, + "rewards/get_target_len_reward_std": 0.04809415116906166, + "step": 3030 + }, + { + "advantages": -4.5945250803924865e-08, + "advantages_std": 1.5940813064575194, + "clip_ratio": 0.0, + "completion_length": 90.090478515625, + "epoch": 2.287218045112782, + "grad_norm": 17.375, + "kl": 0.3240895554423332, + "learning_rate": 3.857142857142858e-06, + "loss": 0.0336, + "num_tokens": 93221288.0, + "reward": -1.0903705094009637, + "reward_std": 6.505664348602295, + "rewards/get_chromagram_reward": 0.6320096373558044, + "rewards/get_chromagram_reward_std": 0.11818938925862313, + "rewards/get_intelligibility_reward": -3.8794002890586854, + "rewards/get_intelligibility_reward_std": 10.639200782775879, + "rewards/get_target_len_reward": -0.023720779828727244, + "rewards/get_target_len_reward_std": 0.06329209692776203, + "step": 3040 + }, + { + "advantages": 9.238720224402642e-08, + "advantages_std": 1.5934074997901917, + "clip_ratio": 0.0, + "completion_length": 86.07559585571289, + "epoch": 2.294736842105263, + "grad_norm": 5.875, + "kl": 0.6489260986447334, + "learning_rate": 3.853383458646617e-06, + "loss": 0.0662, + "num_tokens": 93525561.0, + "reward": -1.6520125150680542, + "reward_std": 6.356071853637696, + "rewards/get_chromagram_reward": 0.6185117423534393, + "rewards/get_chromagram_reward_std": 0.12471347972750664, + "rewards/get_intelligibility_reward": -5.55214421749115, + "rewards/get_intelligibility_reward_std": 9.824131298065186, + "rewards/get_target_len_reward": -0.022404894977808, + "rewards/get_target_len_reward_std": 0.05886543095111847, + "step": 3050 + }, + { + "advantages": -2.0340086213099795e-07, + "advantages_std": 1.4341715812683105, + "clip_ratio": 0.0, + "completion_length": 88.61488189697266, + "epoch": 2.302255639097744, + "grad_norm": 8.75, + "kl": 0.3880483269691467, + "learning_rate": 3.849624060150376e-06, + "loss": 0.0407, + "num_tokens": 93836947.0, + "reward": -1.6591898486018182, + "reward_std": 6.895362758636475, + "rewards/get_chromagram_reward": 0.6150469839572906, + "rewards/get_chromagram_reward_std": 0.11908129900693894, + "rewards/get_intelligibility_reward": -5.571700441837311, + "rewards/get_intelligibility_reward_std": 10.788173866271972, + "rewards/get_target_len_reward": -0.020915597584098576, + "rewards/get_target_len_reward_std": 0.055847865715622905, + "step": 3060 + }, + { + "advantages": 3.601114016760221e-07, + "advantages_std": 1.7079174041748046, + "clip_ratio": 0.0, + "completion_length": 88.07024002075195, + "epoch": 2.3097744360902257, + "grad_norm": 27.25, + "kl": 0.36389251947402956, + "learning_rate": 3.845864661654136e-06, + "loss": 0.042, + "num_tokens": 94146528.0, + "reward": -1.466315120458603, + "reward_std": 6.648503303527832, + "rewards/get_chromagram_reward": 0.6157117486000061, + "rewards/get_chromagram_reward_std": 0.1228131890296936, + "rewards/get_intelligibility_reward": -4.994232511520385, + "rewards/get_intelligibility_reward_std": 10.560632991790772, + "rewards/get_target_len_reward": -0.020424212515354156, + "rewards/get_target_len_reward_std": 0.06263697929680348, + "step": 3070 + }, + { + "advantages": -3.859400834471671e-07, + "advantages_std": 1.6529303193092346, + "clip_ratio": 0.0, + "completion_length": 87.0821434020996, + "epoch": 2.3172932330827067, + "grad_norm": 612.0, + "kl": 0.5107886403799057, + "learning_rate": 3.842105263157895e-06, + "loss": 0.0555, + "num_tokens": 94452819.0, + "reward": -1.557619434595108, + "reward_std": 6.65304388999939, + "rewards/get_chromagram_reward": 0.6286049544811249, + "rewards/get_chromagram_reward_std": 0.11012716889381409, + "rewards/get_intelligibility_reward": -5.282437968254089, + "rewards/get_intelligibility_reward_std": 10.468348979949951, + "rewards/get_target_len_reward": -0.019024977181106805, + "rewards/get_target_len_reward_std": 0.06559648010879755, + "step": 3080 + }, + { + "advantages": 3.0870238987290577e-07, + "advantages_std": 1.6085237383842468, + "clip_ratio": 0.0, + "completion_length": 87.23928756713867, + "epoch": 2.324812030075188, + "grad_norm": 7.75, + "kl": 0.4844575524330139, + "learning_rate": 3.838345864661654e-06, + "loss": 0.0544, + "num_tokens": 94759931.0, + "reward": -1.1725915879011155, + "reward_std": 6.456204605102539, + "rewards/get_chromagram_reward": 0.6367210924625397, + "rewards/get_chromagram_reward_std": 0.11174852326512337, + "rewards/get_intelligibility_reward": -4.1291744112968445, + "rewards/get_intelligibility_reward_std": 10.456843757629395, + "rewards/get_target_len_reward": -0.025321154668927193, + "rewards/get_target_len_reward_std": 0.07306363489478826, + "step": 3090 + }, + { + "advantages": -4.76837168861266e-07, + "advantages_std": 1.5963156700134278, + "clip_ratio": 0.0, + "completion_length": 88.89345474243164, + "epoch": 2.3323308270676693, + "grad_norm": 7.6875, + "kl": 0.36408271491527555, + "learning_rate": 3.834586466165414e-06, + "loss": 0.0416, + "num_tokens": 95071621.0, + "reward": -1.2260048598051072, + "reward_std": 5.992364072799683, + "rewards/get_chromagram_reward": 0.6308612644672393, + "rewards/get_chromagram_reward_std": 0.11509114354848862, + "rewards/get_intelligibility_reward": -4.284396481513977, + "rewards/get_intelligibility_reward_std": 9.64006485939026, + "rewards/get_target_len_reward": -0.02447923384606838, + "rewards/get_target_len_reward_std": 0.07114081848412752, + "step": 3100 + }, + { + "advantages": 3.2633543014526365e-07, + "advantages_std": 1.6563255071640015, + "clip_ratio": 0.0, + "completion_length": 86.56488265991212, + "epoch": 2.3398496240601503, + "grad_norm": 84.5, + "kl": 0.3443057775497437, + "learning_rate": 3.830827067669174e-06, + "loss": 0.0402, + "num_tokens": 95377114.0, + "reward": -1.6160699844360351, + "reward_std": 6.532747888565064, + "rewards/get_chromagram_reward": 0.6172747492790223, + "rewards/get_chromagram_reward_std": 0.11039882078766823, + "rewards/get_intelligibility_reward": -5.43791823387146, + "rewards/get_intelligibility_reward_std": 10.185232734680175, + "rewards/get_target_len_reward": -0.027566286642104386, + "rewards/get_target_len_reward_std": 0.0706888772547245, + "step": 3110 + }, + { + "advantages": 6.825973692059506e-07, + "advantages_std": 1.7419674158096314, + "clip_ratio": 0.0, + "completion_length": 87.59166870117187, + "epoch": 2.3473684210526318, + "grad_norm": 9.8125, + "kl": 0.29823374897241595, + "learning_rate": 3.8270676691729325e-06, + "loss": 0.0376, + "num_tokens": 95685240.0, + "reward": -1.8026684641838073, + "reward_std": 6.953642177581787, + "rewards/get_chromagram_reward": 0.6273639619350433, + "rewards/get_chromagram_reward_std": 0.11382095590233803, + "rewards/get_intelligibility_reward": -6.013849997520447, + "rewards/get_intelligibility_reward_std": 10.748786926269531, + "rewards/get_target_len_reward": -0.021518971025943755, + "rewards/get_target_len_reward_std": 0.06433455049991607, + "step": 3120 + }, + { + "advantages": 1.3858080478712508e-07, + "advantages_std": 1.4858174562454223, + "clip_ratio": 0.0, + "completion_length": 85.37440567016601, + "epoch": 2.354887218045113, + "grad_norm": 5.59375, + "kl": 0.3512969747185707, + "learning_rate": 3.823308270676692e-06, + "loss": 0.0383, + "num_tokens": 95987277.0, + "reward": -1.7362686932086944, + "reward_std": 7.01571249961853, + "rewards/get_chromagram_reward": 0.6190242350101471, + "rewards/get_chromagram_reward_std": 0.11482224762439727, + "rewards/get_intelligibility_reward": -5.808184885978699, + "rewards/get_intelligibility_reward_std": 11.058324146270753, + "rewards/get_target_len_reward": -0.019645236805081366, + "rewards/get_target_len_reward_std": 0.05717686675488949, + "step": 3130 + }, + { + "advantages": -1.1151035437251266e-07, + "advantages_std": 1.7415273547172547, + "clip_ratio": 0.0, + "completion_length": 87.55774002075195, + "epoch": 2.362406015037594, + "grad_norm": 37.25, + "kl": 0.36580796390771864, + "learning_rate": 3.819548872180452e-06, + "loss": 0.039, + "num_tokens": 96295325.0, + "reward": -1.448211270570755, + "reward_std": 6.679815864562988, + "rewards/get_chromagram_reward": 0.6096611797809601, + "rewards/get_chromagram_reward_std": 0.12779648303985597, + "rewards/get_intelligibility_reward": -4.932143640518189, + "rewards/get_intelligibility_reward_std": 10.682099866867066, + "rewards/get_target_len_reward": -0.02215128391981125, + "rewards/get_target_len_reward_std": 0.06295223757624627, + "step": 3140 + }, + { + "advantages": 4.5945250359835656e-08, + "advantages_std": 1.5435296535491942, + "clip_ratio": 0.0, + "completion_length": 88.24166793823242, + "epoch": 2.3699248120300753, + "grad_norm": 5.40625, + "kl": 0.29209394156932833, + "learning_rate": 3.815789473684211e-06, + "loss": 0.029, + "num_tokens": 96606282.0, + "reward": -1.2214705765247345, + "reward_std": 6.132448053359985, + "rewards/get_chromagram_reward": 0.6140880525112152, + "rewards/get_chromagram_reward_std": 0.11201696321368218, + "rewards/get_intelligibility_reward": -4.26198422908783, + "rewards/get_intelligibility_reward_std": 9.883226490020752, + "rewards/get_target_len_reward": -0.01651527201756835, + "rewards/get_target_len_reward_std": 0.043393169157207014, + "step": 3150 + }, + { + "advantages": 1.6937654216953036e-07, + "advantages_std": 1.4640429258346557, + "clip_ratio": 0.0, + "completion_length": 85.7232162475586, + "epoch": 2.3774436090225564, + "grad_norm": 7.1875, + "kl": 0.482822397351265, + "learning_rate": 3.81203007518797e-06, + "loss": 0.0525, + "num_tokens": 96908428.0, + "reward": -1.7714835286140442, + "reward_std": 6.791526174545288, + "rewards/get_chromagram_reward": 0.6277442216873169, + "rewards/get_chromagram_reward_std": 0.11088423728942871, + "rewards/get_intelligibility_reward": -5.919493269920349, + "rewards/get_intelligibility_reward_std": 10.559081554412842, + "rewards/get_target_len_reward": -0.022701340448111295, + "rewards/get_target_len_reward_std": 0.06931588556617499, + "step": 3160 + }, + { + "advantages": -2.0811954364319264e-07, + "advantages_std": 1.6946855902671814, + "clip_ratio": 0.0, + "completion_length": 88.08869171142578, + "epoch": 2.3849624060150374, + "grad_norm": 7.03125, + "kl": 0.419588178396225, + "learning_rate": 3.80827067669173e-06, + "loss": 0.0447, + "num_tokens": 97218687.0, + "reward": -1.6822382628917694, + "reward_std": 6.9995640277862545, + "rewards/get_chromagram_reward": 0.6256317377090455, + "rewards/get_chromagram_reward_std": 0.11372272670269012, + "rewards/get_intelligibility_reward": -5.649773263931275, + "rewards/get_intelligibility_reward_std": 11.008953666687011, + "rewards/get_target_len_reward": -0.022573013510555028, + "rewards/get_target_len_reward_std": 0.06304403096437454, + "step": 3170 + }, + { + "advantages": -1.7061830615006102e-07, + "advantages_std": 1.6064040184020996, + "clip_ratio": 0.0, + "completion_length": 86.60654983520507, + "epoch": 2.392481203007519, + "grad_norm": 8.5625, + "kl": 0.3004844680428505, + "learning_rate": 3.804511278195489e-06, + "loss": 0.0407, + "num_tokens": 97524048.0, + "reward": -1.4096578717231751, + "reward_std": 6.6632637023925785, + "rewards/get_chromagram_reward": 0.6215019702911377, + "rewards/get_chromagram_reward_std": 0.11183991581201554, + "rewards/get_intelligibility_reward": -4.828030633926391, + "rewards/get_intelligibility_reward_std": 10.725747776031493, + "rewards/get_target_len_reward": -0.0224445603787899, + "rewards/get_target_len_reward_std": 0.06768963728100061, + "step": 3180 + }, + { + "advantages": -4.1375557673362097e-07, + "advantages_std": 1.5909752130508423, + "clip_ratio": 0.0, + "completion_length": 85.55059585571288, + "epoch": 2.4, + "grad_norm": 8.125, + "kl": 0.35912114679813384, + "learning_rate": 3.8007518796992483e-06, + "loss": 0.0387, + "num_tokens": 97826861.0, + "reward": -1.456592407822609, + "reward_std": 6.335251379013061, + "rewards/get_chromagram_reward": 0.6159971415996551, + "rewards/get_chromagram_reward_std": 0.10423725917935371, + "rewards/get_intelligibility_reward": -4.967746996879578, + "rewards/get_intelligibility_reward_std": 10.034010696411134, + "rewards/get_target_len_reward": -0.018027166556566953, + "rewards/get_target_len_reward_std": 0.05203140545636416, + "step": 3190 + }, + { + "advantages": -4.798173961262364e-07, + "advantages_std": 1.6593538522720337, + "clip_ratio": 0.0, + "completion_length": 90.60654907226562, + "epoch": 2.4075187969924814, + "grad_norm": 6.59375, + "kl": 0.6495398178696632, + "learning_rate": 3.796992481203008e-06, + "loss": 0.0662, + "num_tokens": 98144452.0, + "reward": -1.1371527172625064, + "reward_std": 6.710154914855957, + "rewards/get_chromagram_reward": 0.6279964745044708, + "rewards/get_chromagram_reward_std": 0.11072349175810814, + "rewards/get_intelligibility_reward": -4.014143347740173, + "rewards/get_intelligibility_reward_std": 10.976589679718018, + "rewards/get_target_len_reward": -0.02531114164739847, + "rewards/get_target_len_reward_std": 0.07920041754841804, + "step": 3200 + }, + { + "advantages": -3.515432489109571e-07, + "advantages_std": 1.6400597095489502, + "clip_ratio": 0.0, + "completion_length": 86.48393020629882, + "epoch": 2.4150375939849624, + "grad_norm": 5.46875, + "kl": 0.3224791929125786, + "learning_rate": 3.793233082706767e-06, + "loss": 0.0342, + "num_tokens": 98449632.0, + "reward": -1.318561613559723, + "reward_std": 6.473997402191162, + "rewards/get_chromagram_reward": 0.6198614180088043, + "rewards/get_chromagram_reward_std": 0.12230006903409958, + "rewards/get_intelligibility_reward": -4.55453812032938, + "rewards/get_intelligibility_reward_std": 10.3699782371521, + "rewards/get_target_len_reward": -0.021007803454995155, + "rewards/get_target_len_reward_std": 0.05592170432209968, + "step": 3210 + }, + { + "advantages": -9.536742879845406e-08, + "advantages_std": 1.5742892980575562, + "clip_ratio": 0.0, + "completion_length": 85.17500305175781, + "epoch": 2.4225563909774435, + "grad_norm": 8.1875, + "kl": 0.32046190053224566, + "learning_rate": 3.789473684210527e-06, + "loss": 0.0308, + "num_tokens": 98751701.0, + "reward": -1.5562925934791565, + "reward_std": 6.561092281341553, + "rewards/get_chromagram_reward": 0.6253645658493042, + "rewards/get_chromagram_reward_std": 0.11320054829120636, + "rewards/get_intelligibility_reward": -5.274776554107666, + "rewards/get_intelligibility_reward_std": 10.261778974533081, + "rewards/get_target_len_reward": -0.019465396646410227, + "rewards/get_target_len_reward_std": 0.04603518862277269, + "step": 3220 + }, + { + "advantages": -7.872783953644103e-08, + "advantages_std": 1.612378227710724, + "clip_ratio": 0.0, + "completion_length": 84.65952529907227, + "epoch": 2.430075187969925, + "grad_norm": 5.375, + "kl": 0.3558365270495415, + "learning_rate": 3.785714285714286e-06, + "loss": 0.0406, + "num_tokens": 99052318.0, + "reward": -1.578985768556595, + "reward_std": 6.9117063045501705, + "rewards/get_chromagram_reward": 0.617356663942337, + "rewards/get_chromagram_reward_std": 0.12302884310483933, + "rewards/get_intelligibility_reward": -5.33222382068634, + "rewards/get_intelligibility_reward_std": 10.899449586868286, + "rewards/get_target_len_reward": -0.022089978307485582, + "rewards/get_target_len_reward_std": 0.06822279021143914, + "step": 3230 + }, + { + "advantages": -2.3394824539835212e-07, + "advantages_std": 1.5828737497329712, + "clip_ratio": 0.0, + "completion_length": 87.87024002075195, + "epoch": 2.437593984962406, + "grad_norm": 9.6875, + "kl": 3.2737128630280496, + "learning_rate": 3.7819548872180457e-06, + "loss": 0.3328, + "num_tokens": 99361553.0, + "reward": -1.607145693525672, + "reward_std": 6.754365253448486, + "rewards/get_chromagram_reward": 0.6224554538726806, + "rewards/get_chromagram_reward_std": 0.11792162135243416, + "rewards/get_intelligibility_reward": -5.420157140493393, + "rewards/get_intelligibility_reward_std": 10.555351066589356, + "rewards/get_target_len_reward": -0.023735210206359623, + "rewards/get_target_len_reward_std": 0.06005271524190903, + "step": 3240 + }, + { + "advantages": -5.570550882794123e-07, + "advantages_std": 1.5856150448322297, + "clip_ratio": 0.0, + "completion_length": 86.98571548461913, + "epoch": 2.4451127819548875, + "grad_norm": 6.53125, + "kl": 0.3818998262286186, + "learning_rate": 3.778195488721805e-06, + "loss": 0.0464, + "num_tokens": 99669024.0, + "reward": -1.3962798684835434, + "reward_std": 7.34520378112793, + "rewards/get_chromagram_reward": 0.6202928602695466, + "rewards/get_chromagram_reward_std": 0.10778555646538734, + "rewards/get_intelligibility_reward": -4.7882393300533295, + "rewards/get_intelligibility_reward_std": 11.882244777679443, + "rewards/get_target_len_reward": -0.020892890822142362, + "rewards/get_target_len_reward_std": 0.06483328007161618, + "step": 3250 + }, + { + "advantages": 2.7529895376687816e-07, + "advantages_std": 1.6412216067314147, + "clip_ratio": 0.0, + "completion_length": 84.91369171142578, + "epoch": 2.4526315789473685, + "grad_norm": 6.6875, + "kl": 0.32161408066749575, + "learning_rate": 3.7744360902255645e-06, + "loss": 0.0369, + "num_tokens": 99970373.0, + "reward": -1.531099909543991, + "reward_std": 6.41421217918396, + "rewards/get_chromagram_reward": 0.610172426700592, + "rewards/get_chromagram_reward_std": 0.1167138785123825, + "rewards/get_intelligibility_reward": -5.1840015888214115, + "rewards/get_intelligibility_reward_std": 10.05807113647461, + "rewards/get_target_len_reward": -0.019470279663801195, + "rewards/get_target_len_reward_std": 0.05536416377872229, + "step": 3260 + }, + { + "advantages": 2.468625751816944e-07, + "advantages_std": 1.6378588914871215, + "clip_ratio": 0.0, + "completion_length": 85.82440795898438, + "epoch": 2.4601503759398495, + "grad_norm": 5.875, + "kl": 0.3266435742378235, + "learning_rate": 3.7706766917293237e-06, + "loss": 0.0365, + "num_tokens": 100274737.0, + "reward": -1.4399422705173492, + "reward_std": 6.60957703590393, + "rewards/get_chromagram_reward": 0.6344795823097229, + "rewards/get_chromagram_reward_std": 0.10898077189922332, + "rewards/get_intelligibility_reward": -4.932069408893585, + "rewards/get_intelligibility_reward_std": 10.547198295593262, + "rewards/get_target_len_reward": -0.02223665835335851, + "rewards/get_target_len_reward_std": 0.06397623158991336, + "step": 3270 + }, + { + "advantages": -1.778205387381604e-07, + "advantages_std": 1.6094249129295348, + "clip_ratio": 0.0, + "completion_length": 86.78155059814453, + "epoch": 2.467669172932331, + "grad_norm": 6.4375, + "kl": 0.3754301965236664, + "learning_rate": 3.7669172932330825e-06, + "loss": 0.0411, + "num_tokens": 100580476.0, + "reward": -1.7137349367141723, + "reward_std": 6.945359897613526, + "rewards/get_chromagram_reward": 0.6193328857421875, + "rewards/get_chromagram_reward_std": 0.12088619396090508, + "rewards/get_intelligibility_reward": -5.740228915214539, + "rewards/get_intelligibility_reward_std": 10.897998905181884, + "rewards/get_target_len_reward": -0.020308405719697477, + "rewards/get_target_len_reward_std": 0.05714104510843754, + "step": 3280 + }, + { + "advantages": -2.9243536658896118e-08, + "advantages_std": 1.6005398750305175, + "clip_ratio": 0.0, + "completion_length": 89.58571548461914, + "epoch": 2.475187969924812, + "grad_norm": 9.1875, + "kl": 0.4362799167633057, + "learning_rate": 3.7631578947368426e-06, + "loss": 0.0501, + "num_tokens": 100894376.0, + "reward": -1.580004519224167, + "reward_std": 7.118606805801392, + "rewards/get_chromagram_reward": 0.6110920429229736, + "rewards/get_chromagram_reward_std": 0.12410885691642762, + "rewards/get_intelligibility_reward": -5.3250489950180055, + "rewards/get_intelligibility_reward_std": 11.340526008605957, + "rewards/get_target_len_reward": -0.026056183315813542, + "rewards/get_target_len_reward_std": 0.08824401944875718, + "step": 3290 + }, + { + "advantages": -6.062289248376374e-07, + "advantages_std": 1.5084069848060608, + "clip_ratio": 0.0, + "completion_length": 84.20535812377929, + "epoch": 2.482706766917293, + "grad_norm": 6.59375, + "kl": 0.286747407913208, + "learning_rate": 3.7593984962406014e-06, + "loss": 0.0319, + "num_tokens": 101194428.0, + "reward": -1.3394009791314603, + "reward_std": 6.1571714878082275, + "rewards/get_chromagram_reward": 0.6238884270191193, + "rewards/get_chromagram_reward_std": 0.11208853796124459, + "rewards/get_intelligibility_reward": -4.623925578594208, + "rewards/get_intelligibility_reward_std": 9.772040939331054, + "rewards/get_target_len_reward": -0.01816573003306985, + "rewards/get_target_len_reward_std": 0.05329264029860496, + "step": 3300 + }, + { + "advantages": 1.2964011562033306e-07, + "advantages_std": 1.5899734497070312, + "clip_ratio": 0.0, + "completion_length": 89.80357360839844, + "epoch": 2.4902255639097746, + "grad_norm": 6.6875, + "kl": 0.4089387819170952, + "learning_rate": 3.7556390977443615e-06, + "loss": 0.0431, + "num_tokens": 101509800.0, + "reward": -1.3904333353042602, + "reward_std": 6.629190587997437, + "rewards/get_chromagram_reward": 0.6224581658840179, + "rewards/get_chromagram_reward_std": 0.11556925252079964, + "rewards/get_intelligibility_reward": -4.774163477122784, + "rewards/get_intelligibility_reward_std": 10.547905969619752, + "rewards/get_target_len_reward": -0.01959448978304863, + "rewards/get_target_len_reward_std": 0.046945799700915815, + "step": 3310 + }, + { + "advantages": -1.8005571291723755e-07, + "advantages_std": 1.6132059335708617, + "clip_ratio": 0.0, + "completion_length": 84.47083435058593, + "epoch": 2.4977443609022556, + "grad_norm": 7.75, + "kl": 2.8430706575512885, + "learning_rate": 3.7518796992481203e-06, + "loss": 0.2924, + "num_tokens": 101809540.0, + "reward": -1.7256600558757782, + "reward_std": 6.812878942489624, + "rewards/get_chromagram_reward": 0.614870798587799, + "rewards/get_chromagram_reward_std": 0.11710697636008263, + "rewards/get_intelligibility_reward": -5.769259071350097, + "rewards/get_intelligibility_reward_std": 10.664492225646972, + "rewards/get_target_len_reward": -0.0225916619412601, + "rewards/get_target_len_reward_std": 0.06800402384251356, + "step": 3320 + }, + { + "advantages": 1.0130306378641762e-06, + "advantages_std": 1.5711158871650697, + "clip_ratio": 0.0, + "completion_length": 87.52142944335938, + "epoch": 2.5052631578947366, + "grad_norm": 5.8125, + "kl": 0.35297227203845977, + "learning_rate": 3.7481203007518803e-06, + "loss": 0.0416, + "num_tokens": 102118313.0, + "reward": -1.453517109155655, + "reward_std": 7.165091848373413, + "rewards/get_chromagram_reward": 0.6142861127853394, + "rewards/get_chromagram_reward_std": 0.11068090200424194, + "rewards/get_intelligibility_reward": -4.950346994400024, + "rewards/get_intelligibility_reward_std": 11.612703037261962, + "rewards/get_target_len_reward": -0.024490153044462205, + "rewards/get_target_len_reward_std": 0.08085027951747178, + "step": 3330 + }, + { + "advantages": 4.87267990934015e-07, + "advantages_std": 1.6833030700683593, + "clip_ratio": 0.0, + "completion_length": 91.52440490722657, + "epoch": 2.512781954887218, + "grad_norm": 46.0, + "kl": 0.31111850887537, + "learning_rate": 3.744360902255639e-06, + "loss": 0.0383, + "num_tokens": 102437971.0, + "reward": -1.4131182849407196, + "reward_std": 7.0444153308868405, + "rewards/get_chromagram_reward": 0.6308394372463226, + "rewards/get_chromagram_reward_std": 0.12373006641864777, + "rewards/get_intelligibility_reward": -4.842600393295288, + "rewards/get_intelligibility_reward_std": 11.395545768737794, + "rewards/get_target_len_reward": -0.02759362943470478, + "rewards/get_target_len_reward_std": 0.08597943410277367, + "step": 3340 + }, + { + "advantages": 2.942979349995767e-07, + "advantages_std": 1.6357346177101135, + "clip_ratio": 0.0, + "completion_length": 83.11845397949219, + "epoch": 2.520300751879699, + "grad_norm": 8.5625, + "kl": 0.36069548428058623, + "learning_rate": 3.740601503759399e-06, + "loss": 0.0416, + "num_tokens": 102734010.0, + "reward": -1.3619809970259666, + "reward_std": 6.783789920806885, + "rewards/get_chromagram_reward": 0.6247197687625885, + "rewards/get_chromagram_reward_std": 0.1113676056265831, + "rewards/get_intelligibility_reward": -4.687315640039742, + "rewards/get_intelligibility_reward_std": 10.916985607147216, + "rewards/get_target_len_reward": -0.023346869368106128, + "rewards/get_target_len_reward_std": 0.08005320616066455, + "step": 3350 + }, + { + "advantages": -3.7377077433120576e-08, + "advantages_std": 1.490761649608612, + "clip_ratio": 0.0, + "completion_length": 86.02976303100586, + "epoch": 2.5278195488721806, + "grad_norm": 6.90625, + "kl": 0.27752266377210616, + "learning_rate": 3.736842105263158e-06, + "loss": 0.0325, + "num_tokens": 103038877.0, + "reward": -1.3506350100040436, + "reward_std": 6.926352643966675, + "rewards/get_chromagram_reward": 0.6217826008796692, + "rewards/get_chromagram_reward_std": 0.10271879062056541, + "rewards/get_intelligibility_reward": -4.6541990518569945, + "rewards/get_intelligibility_reward_std": 11.229968643188476, + "rewards/get_target_len_reward": -0.019488278403878213, + "rewards/get_target_len_reward_std": 0.059296393766999245, + "step": 3360 + }, + { + "advantages": 7.4505798863810925e-09, + "advantages_std": 1.633939754962921, + "clip_ratio": 0.0, + "completion_length": 87.66071472167968, + "epoch": 2.5353383458646617, + "grad_norm": 5.9375, + "kl": 0.30983753949403764, + "learning_rate": 3.733082706766918e-06, + "loss": 0.0362, + "num_tokens": 103346730.0, + "reward": -1.7869456171989442, + "reward_std": 6.962776184082031, + "rewards/get_chromagram_reward": 0.6205675482749939, + "rewards/get_chromagram_reward_std": 0.12276971340179443, + "rewards/get_intelligibility_reward": -5.957375645637512, + "rewards/get_intelligibility_reward_std": 10.834985589981079, + "rewards/get_target_len_reward": -0.024028254952281714, + "rewards/get_target_len_reward_std": 0.07155142314732074, + "step": 3370 + }, + { + "advantages": 1.924733538771761e-07, + "advantages_std": 1.6906476497650147, + "clip_ratio": 0.0, + "completion_length": 86.42381057739257, + "epoch": 2.5428571428571427, + "grad_norm": 7.84375, + "kl": 0.31835374385118487, + "learning_rate": 3.729323308270677e-06, + "loss": 0.034, + "num_tokens": 103651890.0, + "reward": -1.746234953403473, + "reward_std": 6.781183338165283, + "rewards/get_chromagram_reward": 0.6146263599395752, + "rewards/get_chromagram_reward_std": 0.11475807204842567, + "rewards/get_intelligibility_reward": -5.83338782787323, + "rewards/get_intelligibility_reward_std": 10.554136180877686, + "rewards/get_target_len_reward": -0.01994320354424417, + "rewards/get_target_len_reward_std": 0.0581259747967124, + "step": 3380 + }, + { + "advantages": -1.7931065676712876e-07, + "advantages_std": 1.5177346467971802, + "clip_ratio": 0.0, + "completion_length": 82.60952529907226, + "epoch": 2.550375939849624, + "grad_norm": 13.3125, + "kl": 0.30254295021295546, + "learning_rate": 3.725563909774436e-06, + "loss": 0.038, + "num_tokens": 103946485.0, + "reward": -1.7833642423152924, + "reward_std": 6.767720127105713, + "rewards/get_chromagram_reward": 0.6146700859069825, + "rewards/get_chromagram_reward_std": 0.1099303774535656, + "rewards/get_intelligibility_reward": -5.944972562789917, + "rewards/get_intelligibility_reward_std": 10.43216552734375, + "rewards/get_target_len_reward": -0.01978995162062347, + "rewards/get_target_len_reward_std": 0.06717491708695889, + "step": 3390 + }, + { + "advantages": -2.0427009701506903e-07, + "advantages_std": 1.4660086750984191, + "clip_ratio": 0.0, + "completion_length": 89.39643020629883, + "epoch": 2.557894736842105, + "grad_norm": 5.5, + "kl": 0.47956685572862623, + "learning_rate": 3.7218045112781957e-06, + "loss": 0.0466, + "num_tokens": 104260840.0, + "reward": -1.2805642530322074, + "reward_std": 7.019152069091797, + "rewards/get_chromagram_reward": 0.6259425520896912, + "rewards/get_chromagram_reward_std": 0.10992063507437706, + "rewards/get_intelligibility_reward": -4.452183805406094, + "rewards/get_intelligibility_reward_std": 11.252341842651367, + "rewards/get_target_len_reward": -0.015451249293982982, + "rewards/get_target_len_reward_std": 0.039071221463382245, + "step": 3400 + }, + { + "advantages": 2.962847688081638e-07, + "advantages_std": 1.5682914018630982, + "clip_ratio": 0.0, + "completion_length": 88.70714416503907, + "epoch": 2.5654135338345867, + "grad_norm": 9.0, + "kl": 0.36460898965597155, + "learning_rate": 3.718045112781955e-06, + "loss": 0.0418, + "num_tokens": 104573097.0, + "reward": -1.1514586597681045, + "reward_std": 6.740952682495117, + "rewards/get_chromagram_reward": 0.615703922510147, + "rewards/get_chromagram_reward_std": 0.11887889504432678, + "rewards/get_intelligibility_reward": -4.045766282081604, + "rewards/get_intelligibility_reward_std": 11.008774948120116, + "rewards/get_target_len_reward": -0.02431353470310569, + "rewards/get_target_len_reward_std": 0.07109942696988583, + "step": 3410 + }, + { + "advantages": -2.8014185247116077e-07, + "advantages_std": 1.616212785243988, + "clip_ratio": 0.0, + "completion_length": 87.27381057739258, + "epoch": 2.5729323308270677, + "grad_norm": 6.625, + "kl": 0.3183483988046646, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.0322, + "num_tokens": 104880351.0, + "reward": -1.5393120527267456, + "reward_std": 6.820896768569947, + "rewards/get_chromagram_reward": 0.6322737574577332, + "rewards/get_chromagram_reward_std": 0.10846047028899193, + "rewards/get_intelligibility_reward": -5.230653858184814, + "rewards/get_intelligibility_reward_std": 10.88029613494873, + "rewards/get_target_len_reward": -0.019555770326405764, + "rewards/get_target_len_reward_std": 0.04991299286484718, + "step": 3420 + }, + { + "advantages": -1.548479039215067e-07, + "advantages_std": 1.531691586971283, + "clip_ratio": 0.0, + "completion_length": 91.88988189697265, + "epoch": 2.5804511278195488, + "grad_norm": 6.3125, + "kl": 0.3366027757525444, + "learning_rate": 3.710526315789474e-06, + "loss": 0.0371, + "num_tokens": 105200296.0, + "reward": -1.3896465808153153, + "reward_std": 6.5613306045532225, + "rewards/get_chromagram_reward": 0.6323010861873627, + "rewards/get_chromagram_reward_std": 0.10928079709410668, + "rewards/get_intelligibility_reward": -4.779137639701366, + "rewards/get_intelligibility_reward_std": 10.427274227142334, + "rewards/get_target_len_reward": -0.022102872747927903, + "rewards/get_target_len_reward_std": 0.06365882325917482, + "step": 3430 + }, + { + "advantages": -1.0319053966867386e-07, + "advantages_std": 1.5140344619750976, + "clip_ratio": 0.0, + "completion_length": 85.56488265991212, + "epoch": 2.58796992481203, + "grad_norm": 13.8125, + "kl": 0.33588795363903046, + "learning_rate": 3.7067669172932335e-06, + "loss": 0.0415, + "num_tokens": 105503292.0, + "reward": -1.707659161090851, + "reward_std": 6.968575382232666, + "rewards/get_chromagram_reward": 0.6287704050540924, + "rewards/get_chromagram_reward_std": 0.10842615365982056, + "rewards/get_intelligibility_reward": -5.729169940948486, + "rewards/get_intelligibility_reward_std": 10.97379264831543, + "rewards/get_target_len_reward": -0.022577523067593575, + "rewards/get_target_len_reward_std": 0.06817054338753223, + "step": 3440 + }, + { + "advantages": -7.972122446631147e-08, + "advantages_std": 1.5462905287742614, + "clip_ratio": 0.0, + "completion_length": 84.52559661865234, + "epoch": 2.5954887218045113, + "grad_norm": 7.9375, + "kl": 0.34193562567234037, + "learning_rate": 3.7030075187969927e-06, + "loss": 0.0374, + "num_tokens": 105802432.0, + "reward": -1.729897018149495, + "reward_std": 6.543156433105469, + "rewards/get_chromagram_reward": 0.6110469579696656, + "rewards/get_chromagram_reward_std": 0.1152818813920021, + "rewards/get_intelligibility_reward": -5.780899262428283, + "rewards/get_intelligibility_reward_std": 9.966917133331298, + "rewards/get_target_len_reward": -0.019838462956249713, + "rewards/get_target_len_reward_std": 0.055696993321180346, + "step": 3450 + }, + { + "advantages": 4.2989850967956045e-07, + "advantages_std": 1.5565645217895507, + "clip_ratio": 0.0, + "completion_length": 84.73154907226562, + "epoch": 2.6030075187969923, + "grad_norm": 6.9375, + "kl": 0.34386427104473116, + "learning_rate": 3.6992481203007523e-06, + "loss": 0.0412, + "num_tokens": 106103651.0, + "reward": -1.4579800248146058, + "reward_std": 6.497630643844604, + "rewards/get_chromagram_reward": 0.6378639221191407, + "rewards/get_chromagram_reward_std": 0.11791711077094078, + "rewards/get_intelligibility_reward": -4.987514853477478, + "rewards/get_intelligibility_reward_std": 10.301973390579224, + "rewards/get_target_len_reward": -0.02428892171010375, + "rewards/get_target_len_reward_std": 0.06848178133368492, + "step": 3460 + }, + { + "advantages": -3.051012807731013e-07, + "advantages_std": 1.5158230423927308, + "clip_ratio": 0.0, + "completion_length": 85.61190719604492, + "epoch": 2.610526315789474, + "grad_norm": 9.6875, + "kl": 0.308964267373085, + "learning_rate": 3.6954887218045116e-06, + "loss": 0.0338, + "num_tokens": 106406683.0, + "reward": -1.5890803162008524, + "reward_std": 6.915961217880249, + "rewards/get_chromagram_reward": 0.6316801130771637, + "rewards/get_chromagram_reward_std": 0.11551015973091125, + "rewards/get_intelligibility_reward": -5.377793747186661, + "rewards/get_intelligibility_reward_std": 10.922082614898681, + "rewards/get_target_len_reward": -0.02112711127847433, + "rewards/get_target_len_reward_std": 0.05452207550406456, + "step": 3470 + }, + { + "advantages": -5.841255443783666e-07, + "advantages_std": 1.5797499895095826, + "clip_ratio": 0.0, + "completion_length": 87.48631134033204, + "epoch": 2.618045112781955, + "grad_norm": 9.25, + "kl": 0.2983222767710686, + "learning_rate": 3.6917293233082708e-06, + "loss": 0.0311, + "num_tokens": 106714280.0, + "reward": -1.632106864452362, + "reward_std": 7.033047151565552, + "rewards/get_chromagram_reward": 0.6137499034404754, + "rewards/get_chromagram_reward_std": 0.1243077963590622, + "rewards/get_intelligibility_reward": -5.490492677688598, + "rewards/get_intelligibility_reward_std": 11.182052993774414, + "rewards/get_target_len_reward": -0.01957751587033272, + "rewards/get_target_len_reward_std": 0.054054923728108405, + "step": 3480 + }, + { + "advantages": 8.183221353874614e-08, + "advantages_std": 1.6531654238700866, + "clip_ratio": 0.0, + "completion_length": 88.08690719604492, + "epoch": 2.625563909774436, + "grad_norm": 5.96875, + "kl": 0.43793293833732605, + "learning_rate": 3.6879699248120304e-06, + "loss": 0.0516, + "num_tokens": 107023943.0, + "reward": -1.421302282810211, + "reward_std": 6.5189769744873045, + "rewards/get_chromagram_reward": 0.6069125831127167, + "rewards/get_chromagram_reward_std": 0.10778507739305496, + "rewards/get_intelligibility_reward": -4.85057225227356, + "rewards/get_intelligibility_reward_std": 10.41775884628296, + "rewards/get_target_len_reward": -0.02024686587974429, + "rewards/get_target_len_reward_std": 0.058713534660637376, + "step": 3490 + }, + { + "advantages": -3.988544264643679e-07, + "advantages_std": 1.544123888015747, + "clip_ratio": 0.0, + "completion_length": 88.00178680419921, + "epoch": 2.6330827067669174, + "grad_norm": 8.3125, + "kl": 0.35315332412719724, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.0406, + "num_tokens": 107333844.0, + "reward": -1.5424182265996933, + "reward_std": 6.796130657196045, + "rewards/get_chromagram_reward": 0.6076584696769715, + "rewards/get_chromagram_reward_std": 0.12150803357362747, + "rewards/get_intelligibility_reward": -5.2072618186473845, + "rewards/get_intelligibility_reward_std": 10.616606712341309, + "rewards/get_target_len_reward": -0.027651109732687473, + "rewards/get_target_len_reward_std": 0.07980751022696495, + "step": 3500 + }, + { + "advantages": -3.7650268751576733e-07, + "advantages_std": 1.6298677563667296, + "clip_ratio": 0.0, + "completion_length": 85.09285888671874, + "epoch": 2.6406015037593984, + "grad_norm": 8.75, + "kl": 0.390830771625042, + "learning_rate": 3.6804511278195493e-06, + "loss": 0.0416, + "num_tokens": 107635284.0, + "reward": -1.6289657175540924, + "reward_std": 6.689491701126099, + "rewards/get_chromagram_reward": 0.6175826072692872, + "rewards/get_chromagram_reward_std": 0.11210766062140465, + "rewards/get_intelligibility_reward": -5.485113549232483, + "rewards/get_intelligibility_reward_std": 10.506939315795899, + "rewards/get_target_len_reward": -0.01936584319919348, + "rewards/get_target_len_reward_std": 0.05680835526436567, + "step": 3510 + }, + { + "advantages": 8.145969658812646e-08, + "advantages_std": 1.6662519693374633, + "clip_ratio": 0.0, + "completion_length": 88.59464492797852, + "epoch": 2.64812030075188, + "grad_norm": 8.1875, + "kl": 0.32551622688770293, + "learning_rate": 3.6766917293233085e-06, + "loss": 0.0348, + "num_tokens": 107946453.0, + "reward": -1.5802398189902305, + "reward_std": 6.981956720352173, + "rewards/get_chromagram_reward": 0.6107727229595185, + "rewards/get_chromagram_reward_std": 0.11069920882582665, + "rewards/get_intelligibility_reward": -5.333705711364746, + "rewards/get_intelligibility_reward_std": 11.025648307800292, + "rewards/get_target_len_reward": -0.017786071356385946, + "rewards/get_target_len_reward_std": 0.054831051267683506, + "step": 3520 + }, + { + "advantages": -3.355244807323743e-07, + "advantages_std": 1.6068529963493348, + "clip_ratio": 0.0, + "completion_length": 90.33928756713867, + "epoch": 2.655639097744361, + "grad_norm": 6.34375, + "kl": 0.2986632138490677, + "learning_rate": 3.672932330827068e-06, + "loss": 0.0347, + "num_tokens": 108262351.0, + "reward": -1.345194971561432, + "reward_std": 6.133174467086792, + "rewards/get_chromagram_reward": 0.6212732017040252, + "rewards/get_chromagram_reward_std": 0.11827879324555397, + "rewards/get_intelligibility_reward": -4.638137435913086, + "rewards/get_intelligibility_reward_std": 9.790009784698487, + "rewards/get_target_len_reward": -0.018720502220094203, + "rewards/get_target_len_reward_std": 0.058218426443636416, + "step": 3530 + }, + { + "advantages": 4.344930335520303e-07, + "advantages_std": 1.6811386108398438, + "clip_ratio": 0.0, + "completion_length": 85.16071548461915, + "epoch": 2.663157894736842, + "grad_norm": 9.75, + "kl": 0.8134607136249542, + "learning_rate": 3.6691729323308274e-06, + "loss": 0.0837, + "num_tokens": 108563844.0, + "reward": -0.9894768297672272, + "reward_std": 6.26170506477356, + "rewards/get_chromagram_reward": 0.6209167063236236, + "rewards/get_chromagram_reward_std": 0.118373341858387, + "rewards/get_intelligibility_reward": -3.568402390182018, + "rewards/get_intelligibility_reward_std": 10.210508632659913, + "rewards/get_target_len_reward": -0.020944639947265385, + "rewards/get_target_len_reward_std": 0.055095212161540986, + "step": 3540 + }, + { + "advantages": 3.1739472916569866e-07, + "advantages_std": 1.5644019007682801, + "clip_ratio": 0.0, + "completion_length": 89.62500076293945, + "epoch": 2.6706766917293234, + "grad_norm": 9.625, + "kl": 0.3473496943712234, + "learning_rate": 3.665413533834587e-06, + "loss": 0.0381, + "num_tokens": 108877733.0, + "reward": -1.0514128148555755, + "reward_std": 6.714984369277954, + "rewards/get_chromagram_reward": 0.6352467834949493, + "rewards/get_chromagram_reward_std": 0.10565011724829673, + "rewards/get_intelligibility_reward": -3.770277237892151, + "rewards/get_intelligibility_reward_std": 11.006027889251708, + "rewards/get_target_len_reward": -0.019207827840000392, + "rewards/get_target_len_reward_std": 0.051670771278440955, + "step": 3550 + }, + { + "advantages": 7.400909538546329e-08, + "advantages_std": 1.5003403663635253, + "clip_ratio": 0.0, + "completion_length": 86.48154830932617, + "epoch": 2.6781954887218045, + "grad_norm": 6.9375, + "kl": 0.3223287731409073, + "learning_rate": 3.6616541353383462e-06, + "loss": 0.0364, + "num_tokens": 109183208.0, + "reward": -1.3079241871833802, + "reward_std": 6.220765161514282, + "rewards/get_chromagram_reward": 0.6262296378612519, + "rewards/get_chromagram_reward_std": 0.11949319913983344, + "rewards/get_intelligibility_reward": -4.529001545906067, + "rewards/get_intelligibility_reward_std": 9.840829277038575, + "rewards/get_target_len_reward": -0.021000441908836365, + "rewards/get_target_len_reward_std": 0.059491405822336675, + "step": 3560 + }, + { + "advantages": -2.870957139577968e-07, + "advantages_std": 1.5407105803489685, + "clip_ratio": 0.0, + "completion_length": 89.83333435058594, + "epoch": 2.685714285714286, + "grad_norm": 5.53125, + "kl": 0.28521771281957625, + "learning_rate": 3.657894736842106e-06, + "loss": 0.0328, + "num_tokens": 109497724.0, + "reward": -1.5617619916796683, + "reward_std": 7.22755651473999, + "rewards/get_chromagram_reward": 0.6161579072475434, + "rewards/get_chromagram_reward_std": 0.11087241023778915, + "rewards/get_intelligibility_reward": -5.28342290520668, + "rewards/get_intelligibility_reward_std": 11.369152069091797, + "rewards/get_target_len_reward": -0.018020586017519234, + "rewards/get_target_len_reward_std": 0.05469904895871878, + "step": 3570 + }, + { + "advantages": -5.288670777758853e-07, + "advantages_std": 1.5159010410308837, + "clip_ratio": 0.0, + "completion_length": 86.61845397949219, + "epoch": 2.693233082706767, + "grad_norm": 284.0, + "kl": 0.4494665414094925, + "learning_rate": 3.654135338345865e-06, + "loss": 0.0504, + "num_tokens": 109804278.0, + "reward": -1.0567859530448913, + "reward_std": 6.539412784576416, + "rewards/get_chromagram_reward": 0.6181407809257508, + "rewards/get_chromagram_reward_std": 0.11926394701004028, + "rewards/get_intelligibility_reward": -3.7641510725021363, + "rewards/get_intelligibility_reward_std": 10.725274467468262, + "rewards/get_target_len_reward": -0.024347435776144268, + "rewards/get_target_len_reward_std": 0.07297438457608223, + "step": 3580 + }, + { + "advantages": -5.935629019404587e-08, + "advantages_std": 1.6441392660140992, + "clip_ratio": 0.0, + "completion_length": 85.87619094848633, + "epoch": 2.700751879699248, + "grad_norm": 7.5, + "kl": 0.3371360570192337, + "learning_rate": 3.6503759398496243e-06, + "loss": 0.0402, + "num_tokens": 110108086.0, + "reward": -1.3475211262702942, + "reward_std": 7.116556644439697, + "rewards/get_chromagram_reward": 0.6257963180541992, + "rewards/get_chromagram_reward_std": 0.11940487399697304, + "rewards/get_intelligibility_reward": -4.644616198539734, + "rewards/get_intelligibility_reward_std": 11.614391803741455, + "rewards/get_target_len_reward": -0.02374332509934902, + "rewards/get_target_len_reward_std": 0.0675284055992961, + "step": 3590 + }, + { + "advantages": -4.221995897779607e-07, + "advantages_std": 1.4993983149528503, + "clip_ratio": 0.0, + "completion_length": 86.1422622680664, + "epoch": 2.708270676691729, + "grad_norm": 8.3125, + "kl": 0.3693845167756081, + "learning_rate": 3.646616541353384e-06, + "loss": 0.0412, + "num_tokens": 110412250.0, + "reward": -1.4997180208563805, + "reward_std": 6.539092111587524, + "rewards/get_chromagram_reward": 0.6187716603279114, + "rewards/get_chromagram_reward_std": 0.11276387199759483, + "rewards/get_intelligibility_reward": -5.098487496376038, + "rewards/get_intelligibility_reward_std": 10.326584005355835, + "rewards/get_target_len_reward": -0.01943796221166849, + "rewards/get_target_len_reward_std": 0.060646931640803814, + "step": 3600 + }, + { + "advantages": 2.351900015185038e-07, + "advantages_std": 1.6777186632156371, + "clip_ratio": 0.0, + "completion_length": 87.4303596496582, + "epoch": 2.7157894736842105, + "grad_norm": 6.21875, + "kl": 0.3518179655075073, + "learning_rate": 3.642857142857143e-06, + "loss": 0.0386, + "num_tokens": 110720554.0, + "reward": -1.5681921809911727, + "reward_std": 6.521061992645263, + "rewards/get_chromagram_reward": 0.6199930787086487, + "rewards/get_chromagram_reward_std": 0.10925538167357444, + "rewards/get_intelligibility_reward": -5.302760636806488, + "rewards/get_intelligibility_reward_std": 10.143068408966064, + "rewards/get_target_len_reward": -0.02180876871570945, + "rewards/get_target_len_reward_std": 0.05977758429944515, + "step": 3610 + }, + { + "advantages": -1.136213683139431e-07, + "advantages_std": 1.52393981218338, + "clip_ratio": 0.0, + "completion_length": 88.76190490722657, + "epoch": 2.7233082706766916, + "grad_norm": 6.96875, + "kl": 0.315978978574276, + "learning_rate": 3.639097744360903e-06, + "loss": 0.0328, + "num_tokens": 111032110.0, + "reward": -1.4126427441835403, + "reward_std": 6.398370981216431, + "rewards/get_chromagram_reward": 0.6212054550647735, + "rewards/get_chromagram_reward_std": 0.11859307289123536, + "rewards/get_intelligibility_reward": -4.838269853591919, + "rewards/get_intelligibility_reward_std": 10.18034839630127, + "rewards/get_target_len_reward": -0.020863546431064604, + "rewards/get_target_len_reward_std": 0.046022705547511575, + "step": 3620 + }, + { + "advantages": -5.416572275152021e-07, + "advantages_std": 1.5852022886276245, + "clip_ratio": 0.0, + "completion_length": 89.3077407836914, + "epoch": 2.730827067669173, + "grad_norm": 6.09375, + "kl": 0.29936513900756834, + "learning_rate": 3.6353383458646616e-06, + "loss": 0.0333, + "num_tokens": 111345776.0, + "reward": -1.4010808348655701, + "reward_std": 6.9346442222595215, + "rewards/get_chromagram_reward": 0.6170649528503418, + "rewards/get_chromagram_reward_std": 0.10936603471636772, + "rewards/get_intelligibility_reward": -4.802578103542328, + "rewards/get_intelligibility_reward_std": 11.13853578567505, + "rewards/get_target_len_reward": -0.017729100491851568, + "rewards/get_target_len_reward_std": 0.0609966017305851, + "step": 3630 + }, + { + "advantages": 1.932183759656425e-07, + "advantages_std": 1.5486809968948365, + "clip_ratio": 0.0, + "completion_length": 86.09583435058593, + "epoch": 2.738345864661654, + "grad_norm": 5.90625, + "kl": 0.312508887052536, + "learning_rate": 3.6315789473684217e-06, + "loss": 0.0356, + "num_tokens": 111650023.0, + "reward": -1.6857449889183045, + "reward_std": 6.547386407852173, + "rewards/get_chromagram_reward": 0.6232929110527039, + "rewards/get_chromagram_reward_std": 0.10652627125382423, + "rewards/get_intelligibility_reward": -5.662791061401367, + "rewards/get_intelligibility_reward_std": 10.1871337890625, + "rewards/get_target_len_reward": -0.017736470606178046, + "rewards/get_target_len_reward_std": 0.053376144357025625, + "step": 3640 + }, + { + "advantages": -4.919867109265397e-07, + "advantages_std": 1.5394857764244079, + "clip_ratio": 0.0, + "completion_length": 89.41131057739258, + "epoch": 2.745864661654135, + "grad_norm": 14.1875, + "kl": 27.45135252028704, + "learning_rate": 3.6278195488721805e-06, + "loss": 2.7469, + "num_tokens": 111963669.0, + "reward": -1.0812234073877334, + "reward_std": 6.496358489990234, + "rewards/get_chromagram_reward": 0.6205441057682037, + "rewards/get_chromagram_reward_std": 0.10971427038311958, + "rewards/get_intelligibility_reward": -3.8447015404701235, + "rewards/get_intelligibility_reward_std": 10.631364631652833, + "rewards/get_target_len_reward": -0.019512721337378025, + "rewards/get_target_len_reward_std": 0.04978047218173742, + "step": 3650 + }, + { + "advantages": -3.029902586604294e-08, + "advantages_std": 1.5976695895195008, + "clip_ratio": 0.0, + "completion_length": 87.70892944335938, + "epoch": 2.7533834586466166, + "grad_norm": 5.75, + "kl": 0.36582956910133363, + "learning_rate": 3.6240601503759406e-06, + "loss": 0.0395, + "num_tokens": 112272815.0, + "reward": -1.3024214208126068, + "reward_std": 6.48867712020874, + "rewards/get_chromagram_reward": 0.6302835762500762, + "rewards/get_chromagram_reward_std": 0.11696906760334969, + "rewards/get_intelligibility_reward": -4.515799993276596, + "rewards/get_intelligibility_reward_std": 10.368803787231446, + "rewards/get_target_len_reward": -0.021747537422925234, + "rewards/get_target_len_reward_std": 0.0546572208404541, + "step": 3660 + }, + { + "advantages": -1.8154581198359666e-07, + "advantages_std": 1.5597105026245117, + "clip_ratio": 0.0, + "completion_length": 82.84464569091797, + "epoch": 2.7609022556390976, + "grad_norm": 13.75, + "kl": 0.32838622480630875, + "learning_rate": 3.6203007518796994e-06, + "loss": 0.0338, + "num_tokens": 112568512.0, + "reward": -1.7773055016994477, + "reward_std": 6.957116889953613, + "rewards/get_chromagram_reward": 0.6281178295612335, + "rewards/get_chromagram_reward_std": 0.11852559596300125, + "rewards/get_intelligibility_reward": -5.9411616563797, + "rewards/get_intelligibility_reward_std": 10.83169240951538, + "rewards/get_target_len_reward": -0.018872402142733335, + "rewards/get_target_len_reward_std": 0.04996256493031979, + "step": 3670 + }, + { + "advantages": -3.4918390241500673e-07, + "advantages_std": 1.4727519631385804, + "clip_ratio": 0.0, + "completion_length": 86.79047775268555, + "epoch": 2.768421052631579, + "grad_norm": 8.4375, + "kl": 0.5991848617792129, + "learning_rate": 3.6165413533834586e-06, + "loss": 0.0628, + "num_tokens": 112875367.0, + "reward": -1.557611984014511, + "reward_std": 6.61163101196289, + "rewards/get_chromagram_reward": 0.6248084425926208, + "rewards/get_chromagram_reward_std": 0.10072682946920394, + "rewards/get_intelligibility_reward": -5.278701066970825, + "rewards/get_intelligibility_reward_std": 10.406602478027343, + "rewards/get_target_len_reward": -0.018943112157285214, + "rewards/get_target_len_reward_std": 0.05231909994035959, + "step": 3680 + }, + { + "advantages": -3.3626953666043846e-07, + "advantages_std": 1.6818018198013305, + "clip_ratio": 0.0, + "completion_length": 86.50059661865234, + "epoch": 2.77593984962406, + "grad_norm": 14.6875, + "kl": 0.6916846543550491, + "learning_rate": 3.6127819548872182e-06, + "loss": 0.0717, + "num_tokens": 113181255.0, + "reward": -1.0904676795005799, + "reward_std": 6.335058259963989, + "rewards/get_chromagram_reward": 0.6144516110420227, + "rewards/get_chromagram_reward_std": 0.10726640596985818, + "rewards/get_intelligibility_reward": -3.869361972808838, + "rewards/get_intelligibility_reward_std": 10.41758222579956, + "rewards/get_target_len_reward": -0.016492511704564096, + "rewards/get_target_len_reward_std": 0.0513612063601613, + "step": 3690 + }, + { + "advantages": -9.735427113355399e-08, + "advantages_std": 1.5762090682983398, + "clip_ratio": 0.0, + "completion_length": 86.51607284545898, + "epoch": 2.783458646616541, + "grad_norm": 7.0, + "kl": 0.3062734708189964, + "learning_rate": 3.6090225563909775e-06, + "loss": 0.03, + "num_tokens": 113487230.0, + "reward": -1.5626688718795776, + "reward_std": 6.707300853729248, + "rewards/get_chromagram_reward": 0.6272768974304199, + "rewards/get_chromagram_reward_std": 0.10489500313997269, + "rewards/get_intelligibility_reward": -5.295509243011475, + "rewards/get_intelligibility_reward_std": 10.637312984466552, + "rewards/get_target_len_reward": -0.019774046447128057, + "rewards/get_target_len_reward_std": 0.049518337845802306, + "step": 3700 + }, + { + "advantages": -1.7508864047499628e-07, + "advantages_std": 1.4822612404823303, + "clip_ratio": 0.0, + "completion_length": 85.85535888671875, + "epoch": 2.7909774436090227, + "grad_norm": 18.625, + "kl": 0.40415765792131425, + "learning_rate": 3.605263157894737e-06, + "loss": 0.0434, + "num_tokens": 113791003.0, + "reward": -1.4261482059955597, + "reward_std": 7.004040002822876, + "rewards/get_chromagram_reward": 0.6247886598110199, + "rewards/get_chromagram_reward_std": 0.12303193882107735, + "rewards/get_intelligibility_reward": -4.874522185325622, + "rewards/get_intelligibility_reward_std": 11.273698616027833, + "rewards/get_target_len_reward": -0.028710635006427766, + "rewards/get_target_len_reward_std": 0.09238636270165443, + "step": 3710 + }, + { + "advantages": 6.842116633265505e-07, + "advantages_std": 1.5441161036491393, + "clip_ratio": 0.0, + "completion_length": 89.55595397949219, + "epoch": 2.7984962406015037, + "grad_norm": 6.0625, + "kl": 0.37307691723108294, + "learning_rate": 3.6015037593984963e-06, + "loss": 0.0426, + "num_tokens": 114105102.0, + "reward": -1.4877331912517548, + "reward_std": 6.525564289093017, + "rewards/get_chromagram_reward": 0.6080294728279114, + "rewards/get_chromagram_reward_std": 0.11591408997774125, + "rewards/get_intelligibility_reward": -5.049138689041138, + "rewards/get_intelligibility_reward_std": 10.337861633300781, + "rewards/get_target_len_reward": -0.02209009351208806, + "rewards/get_target_len_reward_std": 0.06789065115153789, + "step": 3720 + }, + { + "advantages": -6.424884020361787e-07, + "advantages_std": 1.5563165068626403, + "clip_ratio": 0.0, + "completion_length": 87.59642944335937, + "epoch": 2.806015037593985, + "grad_norm": 16.125, + "kl": 0.34810103923082353, + "learning_rate": 3.597744360902256e-06, + "loss": 0.0368, + "num_tokens": 114414329.0, + "reward": -1.322847494482994, + "reward_std": 6.680602407455444, + "rewards/get_chromagram_reward": 0.6276551485061646, + "rewards/get_chromagram_reward_std": 0.10801803767681122, + "rewards/get_intelligibility_reward": -4.576816880702973, + "rewards/get_intelligibility_reward_std": 10.717460823059081, + "rewards/get_target_len_reward": -0.019380524475127458, + "rewards/get_target_len_reward_std": 0.0489049194380641, + "step": 3730 + }, + { + "advantages": 5.6972107458364e-07, + "advantages_std": 1.5420011162757874, + "clip_ratio": 0.0, + "completion_length": 85.07916717529297, + "epoch": 2.813533834586466, + "grad_norm": 6.15625, + "kl": 0.3048412173986435, + "learning_rate": 3.593984962406015e-06, + "loss": 0.0339, + "num_tokens": 114716213.0, + "reward": -1.4711872577667235, + "reward_std": 6.391526699066162, + "rewards/get_chromagram_reward": 0.6427627861499786, + "rewards/get_chromagram_reward_std": 0.106219232827425, + "rewards/get_intelligibility_reward": -5.0332791090011595, + "rewards/get_intelligibility_reward_std": 10.067411518096923, + "rewards/get_target_len_reward": -0.023045250680297613, + "rewards/get_target_len_reward_std": 0.06652447283267975, + "step": 3740 + }, + { + "advantages": -7.521361560236528e-07, + "advantages_std": 1.563930594921112, + "clip_ratio": 0.0, + "completion_length": 84.58631134033203, + "epoch": 2.8210526315789473, + "grad_norm": 132.0, + "kl": 0.3405220597982407, + "learning_rate": 3.590225563909775e-06, + "loss": 0.0427, + "num_tokens": 115016687.0, + "reward": -1.5656429648399353, + "reward_std": 6.2206337451934814, + "rewards/get_chromagram_reward": 0.6178694903850556, + "rewards/get_chromagram_reward_std": 0.12440430745482445, + "rewards/get_intelligibility_reward": -5.284491777420044, + "rewards/get_intelligibility_reward_std": 9.689728498458862, + "rewards/get_target_len_reward": -0.030306239239871503, + "rewards/get_target_len_reward_std": 0.0898954387754202, + "step": 3750 + }, + { + "advantages": -2.9926500104693333e-07, + "advantages_std": 1.6133185148239135, + "clip_ratio": 0.0, + "completion_length": 88.39285812377929, + "epoch": 2.8285714285714287, + "grad_norm": 129.0, + "kl": 0.3441634550690651, + "learning_rate": 3.586466165413534e-06, + "loss": 0.0407, + "num_tokens": 115326967.0, + "reward": -1.3336735486984252, + "reward_std": 6.554736709594726, + "rewards/get_chromagram_reward": 0.6175411701202392, + "rewards/get_chromagram_reward_std": 0.10903427228331566, + "rewards/get_intelligibility_reward": -4.596753120422363, + "rewards/get_intelligibility_reward_std": 10.53911051750183, + "rewards/get_target_len_reward": -0.021808473207056523, + "rewards/get_target_len_reward_std": 0.07356351651251317, + "step": 3760 + }, + { + "advantages": -1.4578303080270417e-07, + "advantages_std": 1.6139382362365722, + "clip_ratio": 0.0, + "completion_length": 88.05833511352539, + "epoch": 2.8360902255639098, + "grad_norm": 5.65625, + "kl": 0.35888722240924836, + "learning_rate": 3.5827067669172937e-06, + "loss": 0.0351, + "num_tokens": 115636871.0, + "reward": -1.2391679644584657, + "reward_std": 6.162685680389404, + "rewards/get_chromagram_reward": 0.6107660055160522, + "rewards/get_chromagram_reward_std": 0.1182018756866455, + "rewards/get_intelligibility_reward": -4.3092587232589725, + "rewards/get_intelligibility_reward_std": 9.722102928161622, + "rewards/get_target_len_reward": -0.01901089083403349, + "rewards/get_target_len_reward_std": 0.04649979993700981, + "step": 3770 + }, + { + "advantages": 2.9330453656939424e-07, + "advantages_std": 1.7181447625160218, + "clip_ratio": 0.0, + "completion_length": 87.07857284545898, + "epoch": 2.8436090225563913, + "grad_norm": 23.625, + "kl": 0.31283538192510607, + "learning_rate": 3.578947368421053e-06, + "loss": 0.0374, + "num_tokens": 115943108.0, + "reward": -1.5981365263462066, + "reward_std": 6.841731357574463, + "rewards/get_chromagram_reward": 0.6245369374752044, + "rewards/get_chromagram_reward_std": 0.11415834277868271, + "rewards/get_intelligibility_reward": -5.397093820571899, + "rewards/get_intelligibility_reward_std": 10.789442348480225, + "rewards/get_target_len_reward": -0.021852330211549996, + "rewards/get_target_len_reward_std": 0.06459587197750807, + "step": 3780 + }, + { + "advantages": 2.242624816517491e-07, + "advantages_std": 1.5829466223716735, + "clip_ratio": 0.0, + "completion_length": 86.8458351135254, + "epoch": 2.8511278195488723, + "grad_norm": 7.8125, + "kl": 0.3109581500291824, + "learning_rate": 3.575187969924812e-06, + "loss": 0.0357, + "num_tokens": 116248830.0, + "reward": -1.6930224657058717, + "reward_std": 7.195408773422241, + "rewards/get_chromagram_reward": 0.6122824370861053, + "rewards/get_chromagram_reward_std": 0.11232773438096047, + "rewards/get_intelligibility_reward": -5.672005653381348, + "rewards/get_intelligibility_reward_std": 11.43618803024292, + "rewards/get_target_len_reward": -0.01934403767809272, + "rewards/get_target_len_reward_std": 0.0535100094974041, + "step": 3790 + }, + { + "advantages": -7.351239901254303e-08, + "advantages_std": 1.6079047083854676, + "clip_ratio": 0.0, + "completion_length": 88.32797698974609, + "epoch": 2.8586466165413533, + "grad_norm": 7.40625, + "kl": 0.29484367817640306, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.0367, + "num_tokens": 116558589.0, + "reward": -2.1900202333927155, + "reward_std": 7.080200862884522, + "rewards/get_chromagram_reward": 0.6280510127544403, + "rewards/get_chromagram_reward_std": 0.12109795212745667, + "rewards/get_intelligibility_reward": -7.167077040672302, + "rewards/get_intelligibility_reward_std": 10.389463520050048, + "rewards/get_target_len_reward": -0.031034017261117697, + "rewards/get_target_len_reward_std": 0.105182571336627, + "step": 3800 + }, + { + "advantages": -4.465381451268513e-07, + "advantages_std": 1.5559608340263367, + "clip_ratio": 0.0, + "completion_length": 88.14107284545898, + "epoch": 2.8661654135338344, + "grad_norm": 6.71875, + "kl": 0.3654972165822983, + "learning_rate": 3.567669172932331e-06, + "loss": 0.037, + "num_tokens": 116868580.0, + "reward": -1.5869385808706284, + "reward_std": 6.509708309173584, + "rewards/get_chromagram_reward": 0.6265568256378173, + "rewards/get_chromagram_reward_std": 0.11053410097956658, + "rewards/get_intelligibility_reward": -5.366451478004455, + "rewards/get_intelligibility_reward_std": 10.103280210494995, + "rewards/get_target_len_reward": -0.020920742489397525, + "rewards/get_target_len_reward_std": 0.052569540590047835, + "step": 3810 + }, + { + "advantages": -2.2649765583082626e-07, + "advantages_std": 1.6033626556396485, + "clip_ratio": 0.0, + "completion_length": 86.40774002075196, + "epoch": 2.873684210526316, + "grad_norm": 6.6875, + "kl": 0.34228152930736544, + "learning_rate": 3.5639097744360906e-06, + "loss": 0.0368, + "num_tokens": 117173648.0, + "reward": -1.5541153252124786, + "reward_std": 6.84669189453125, + "rewards/get_chromagram_reward": 0.6283958792686463, + "rewards/get_chromagram_reward_std": 0.11310142204165459, + "rewards/get_intelligibility_reward": -5.272970819473267, + "rewards/get_intelligibility_reward_std": 10.797915077209472, + "rewards/get_target_len_reward": -0.01777073973789811, + "rewards/get_target_len_reward_std": 0.04794793035835028, + "step": 3820 + }, + { + "advantages": 2.4760762897813036e-07, + "advantages_std": 1.576991081237793, + "clip_ratio": 0.0, + "completion_length": 88.41488189697266, + "epoch": 2.881203007518797, + "grad_norm": 12.8125, + "kl": 0.3005476787686348, + "learning_rate": 3.56015037593985e-06, + "loss": 0.0305, + "num_tokens": 117485079.0, + "reward": -1.2031879782676698, + "reward_std": 6.401948547363281, + "rewards/get_chromagram_reward": 0.6222064554691314, + "rewards/get_chromagram_reward_std": 0.11551795303821563, + "rewards/get_intelligibility_reward": -4.2140885353088375, + "rewards/get_intelligibility_reward_std": 10.40561122894287, + "rewards/get_target_len_reward": -0.017681639175862074, + "rewards/get_target_len_reward_std": 0.043648559413850305, + "step": 3830 + }, + { + "advantages": -4.94718563004426e-07, + "advantages_std": 1.6091697573661805, + "clip_ratio": 0.0, + "completion_length": 87.98154983520507, + "epoch": 2.8887218045112784, + "grad_norm": 7.125, + "kl": 0.4117413073778152, + "learning_rate": 3.5563909774436095e-06, + "loss": 0.0405, + "num_tokens": 117794729.0, + "reward": -1.359292358160019, + "reward_std": 6.418743133544922, + "rewards/get_chromagram_reward": 0.6351268887519836, + "rewards/get_chromagram_reward_std": 0.11723964139819146, + "rewards/get_intelligibility_reward": -4.6920299410820006, + "rewards/get_intelligibility_reward_std": 10.142711639404297, + "rewards/get_target_len_reward": -0.02097375998273492, + "rewards/get_target_len_reward_std": 0.05770694185048342, + "step": 3840 + }, + { + "advantages": -3.7675103357059927e-07, + "advantages_std": 1.6376892924308777, + "clip_ratio": 0.0, + "completion_length": 90.702978515625, + "epoch": 2.8962406015037594, + "grad_norm": 59.0, + "kl": 0.3368965730071068, + "learning_rate": 3.5526315789473687e-06, + "loss": 0.0362, + "num_tokens": 118111543.0, + "reward": -1.4667087614536285, + "reward_std": 6.767246961593628, + "rewards/get_chromagram_reward": 0.6119817018508911, + "rewards/get_chromagram_reward_std": 0.11939271688461303, + "rewards/get_intelligibility_reward": -4.9941377401351925, + "rewards/get_intelligibility_reward_std": 10.768574285507203, + "rewards/get_target_len_reward": -0.017970026470720768, + "rewards/get_target_len_reward_std": 0.04830687399953604, + "step": 3850 + }, + { + "advantages": 3.47942140166424e-07, + "advantages_std": 1.5706040978431701, + "clip_ratio": 0.0, + "completion_length": 84.92738189697266, + "epoch": 2.9037593984962404, + "grad_norm": 39.0, + "kl": 0.36917597651481626, + "learning_rate": 3.5488721804511284e-06, + "loss": 0.0396, + "num_tokens": 118412535.0, + "reward": -1.5638644456863404, + "reward_std": 6.324653100967407, + "rewards/get_chromagram_reward": 0.6199922919273376, + "rewards/get_chromagram_reward_std": 0.11465816274285316, + "rewards/get_intelligibility_reward": -5.289899444580078, + "rewards/get_intelligibility_reward_std": 9.817295694351197, + "rewards/get_target_len_reward": -0.021685893088579176, + "rewards/get_target_len_reward_std": 0.06315642818808556, + "step": 3860 + }, + { + "advantages": 5.0415576424711614e-08, + "advantages_std": 1.6583212971687318, + "clip_ratio": 0.0, + "completion_length": 83.9226203918457, + "epoch": 2.911278195488722, + "grad_norm": 13.25, + "kl": 0.3216040194034576, + "learning_rate": 3.5451127819548876e-06, + "loss": 0.0366, + "num_tokens": 118710697.0, + "reward": -1.5152422875165938, + "reward_std": 6.521786212921143, + "rewards/get_chromagram_reward": 0.6143280088901519, + "rewards/get_chromagram_reward_std": 0.11242129802703857, + "rewards/get_intelligibility_reward": -5.136507201194763, + "rewards/get_intelligibility_reward_std": 10.324334335327148, + "rewards/get_target_len_reward": -0.023547363001853228, + "rewards/get_target_len_reward_std": 0.06723555289208889, + "step": 3870 + }, + { + "advantages": 2.756714764018398e-07, + "advantages_std": 1.6054322957992553, + "clip_ratio": 0.0, + "completion_length": 84.46131134033203, + "epoch": 2.918796992481203, + "grad_norm": 7.125, + "kl": 1.9177187487483025, + "learning_rate": 3.541353383458647e-06, + "loss": 0.195, + "num_tokens": 119010444.0, + "reward": -1.537421664595604, + "reward_std": 6.536387872695923, + "rewards/get_chromagram_reward": 0.6246936738491058, + "rewards/get_chromagram_reward_std": 0.120548328012228, + "rewards/get_intelligibility_reward": -5.211082994937897, + "rewards/get_intelligibility_reward_std": 10.240724563598633, + "rewards/get_target_len_reward": -0.025875354651361704, + "rewards/get_target_len_reward_std": 0.07914061769843102, + "step": 3880 + }, + { + "advantages": -1.1523565177640193e-07, + "advantages_std": 1.6115208506584167, + "clip_ratio": 0.0, + "completion_length": 83.78690567016602, + "epoch": 2.9263157894736844, + "grad_norm": 15.625, + "kl": 0.4440599873661995, + "learning_rate": 3.5375939849624065e-06, + "loss": 0.0492, + "num_tokens": 119308399.0, + "reward": -1.7521640941500665, + "reward_std": 6.507619380950928, + "rewards/get_chromagram_reward": 0.6227650046348572, + "rewards/get_chromagram_reward_std": 0.10821129679679871, + "rewards/get_intelligibility_reward": -5.861425828933716, + "rewards/get_intelligibility_reward_std": 9.89284119606018, + "rewards/get_target_len_reward": -0.017831097729504107, + "rewards/get_target_len_reward_std": 0.051672331802546975, + "step": 3890 + }, + { + "advantages": -1.3858080052386868e-07, + "advantages_std": 1.5852330923080444, + "clip_ratio": 0.0, + "completion_length": 83.91012115478516, + "epoch": 2.9338345864661655, + "grad_norm": 8.75, + "kl": 0.40669423937797544, + "learning_rate": 3.5338345864661657e-06, + "loss": 0.0424, + "num_tokens": 119607390.0, + "reward": -1.5483486637473107, + "reward_std": 6.5325416088104244, + "rewards/get_chromagram_reward": 0.6435725927352905, + "rewards/get_chromagram_reward_std": 0.11586638018488885, + "rewards/get_intelligibility_reward": -5.266064453125, + "rewards/get_intelligibility_reward_std": 10.214018297195434, + "rewards/get_target_len_reward": -0.022553973458707334, + "rewards/get_target_len_reward_std": 0.05295693334192038, + "step": 3900 + }, + { + "advantages": 1.5149518617363357e-08, + "advantages_std": 1.4693358659744262, + "clip_ratio": 0.0, + "completion_length": 83.45654907226563, + "epoch": 2.9413533834586465, + "grad_norm": 6.5, + "kl": 0.33210055381059644, + "learning_rate": 3.5300751879699253e-06, + "loss": 0.0382, + "num_tokens": 119905127.0, + "reward": -1.5625251412391663, + "reward_std": 6.870270299911499, + "rewards/get_chromagram_reward": 0.6261168956756592, + "rewards/get_chromagram_reward_std": 0.12031473070383072, + "rewards/get_intelligibility_reward": -5.286927700042725, + "rewards/get_intelligibility_reward_std": 10.942294502258301, + "rewards/get_target_len_reward": -0.02676438381895423, + "rewards/get_target_len_reward_std": 0.07906926460564137, + "step": 3910 + }, + { + "advantages": 4.013379450995558e-07, + "advantages_std": 1.5395816683769226, + "clip_ratio": 0.0, + "completion_length": 82.18333587646484, + "epoch": 2.948872180451128, + "grad_norm": 49.25, + "kl": 0.341029454767704, + "learning_rate": 3.5263157894736846e-06, + "loss": 0.0413, + "num_tokens": 120199128.0, + "reward": -1.3827453568577766, + "reward_std": 6.446432161331177, + "rewards/get_chromagram_reward": 0.641999465227127, + "rewards/get_chromagram_reward_std": 0.12880906984210014, + "rewards/get_intelligibility_reward": -4.763112473487854, + "rewards/get_intelligibility_reward_std": 10.24520902633667, + "rewards/get_target_len_reward": -0.02712297812104225, + "rewards/get_target_len_reward_std": 0.06887171734124423, + "step": 3920 + }, + { + "advantages": -1.703699510358092e-07, + "advantages_std": 1.6273478150367737, + "clip_ratio": 0.0, + "completion_length": 88.26131057739258, + "epoch": 2.956390977443609, + "grad_norm": 6.46875, + "kl": 0.6627166286110878, + "learning_rate": 3.522556390977444e-06, + "loss": 0.0655, + "num_tokens": 120509668.0, + "reward": -1.0766555294394493, + "reward_std": 6.545140647888184, + "rewards/get_chromagram_reward": 0.6167593955993652, + "rewards/get_chromagram_reward_std": 0.10943646654486656, + "rewards/get_intelligibility_reward": -3.826186215877533, + "rewards/get_intelligibility_reward_std": 10.702708339691162, + "rewards/get_target_len_reward": -0.020539425686001776, + "rewards/get_target_len_reward_std": 0.05624462254345417, + "step": 3930 + }, + { + "advantages": -2.739330188461508e-07, + "advantages_std": 1.5973445296287536, + "clip_ratio": 0.0, + "completion_length": 88.50952606201172, + "epoch": 2.9639097744360905, + "grad_norm": 6.0, + "kl": 0.49120003134012225, + "learning_rate": 3.5187969924812034e-06, + "loss": 0.0537, + "num_tokens": 120820132.0, + "reward": -1.2331419989466668, + "reward_std": 6.877620697021484, + "rewards/get_chromagram_reward": 0.6266380608081817, + "rewards/get_chromagram_reward_std": 0.11246325150132179, + "rewards/get_intelligibility_reward": -4.305523836612702, + "rewards/get_intelligibility_reward_std": 11.225216484069824, + "rewards/get_target_len_reward": -0.020539984665811063, + "rewards/get_target_len_reward_std": 0.05534586645662785, + "step": 3940 + }, + { + "advantages": 3.228584937176038e-07, + "advantages_std": 1.466474747657776, + "clip_ratio": 0.0, + "completion_length": 90.61488189697266, + "epoch": 2.9714285714285715, + "grad_norm": 4.96875, + "kl": 0.28244465589523315, + "learning_rate": 3.515037593984963e-06, + "loss": 0.0314, + "num_tokens": 121137356.0, + "reward": -1.2253394410014153, + "reward_std": 6.3654192924499515, + "rewards/get_chromagram_reward": 0.6074323713779449, + "rewards/get_chromagram_reward_std": 0.1043780043721199, + "rewards/get_intelligibility_reward": -4.267912495136261, + "rewards/get_intelligibility_reward_std": 10.203703880310059, + "rewards/get_target_len_reward": -0.015537928231060505, + "rewards/get_target_len_reward_std": 0.054243368841707704, + "step": 3950 + }, + { + "advantages": 3.1106175413242454e-07, + "advantages_std": 1.6410012602806092, + "clip_ratio": 0.0, + "completion_length": 87.02619171142578, + "epoch": 2.9789473684210526, + "grad_norm": 50.25, + "kl": 0.3348252400755882, + "learning_rate": 3.511278195488722e-06, + "loss": 0.0372, + "num_tokens": 121444121.0, + "reward": -1.5893162369728089, + "reward_std": 6.552996873855591, + "rewards/get_chromagram_reward": 0.6217017948627472, + "rewards/get_chromagram_reward_std": 0.12044238150119782, + "rewards/get_intelligibility_reward": -5.368297362327576, + "rewards/get_intelligibility_reward_std": 10.290545845031739, + "rewards/get_target_len_reward": -0.021352790016680955, + "rewards/get_target_len_reward_std": 0.05885081067681312, + "step": 3960 + }, + { + "advantages": -1.2516975207432778e-07, + "advantages_std": 1.56015487909317, + "clip_ratio": 0.0, + "completion_length": 83.93392944335938, + "epoch": 2.9864661654135336, + "grad_norm": 8.875, + "kl": 0.49049554467201234, + "learning_rate": 3.507518796992482e-06, + "loss": 0.0536, + "num_tokens": 121743241.0, + "reward": -1.6406149506568908, + "reward_std": 6.627180910110473, + "rewards/get_chromagram_reward": 0.6197909057140351, + "rewards/get_chromagram_reward_std": 0.1163574256002903, + "rewards/get_intelligibility_reward": -5.520972895622253, + "rewards/get_intelligibility_reward_std": 10.391157054901123, + "rewards/get_target_len_reward": -0.020662406273186208, + "rewards/get_target_len_reward_std": 0.05791729502379894, + "step": 3970 + }, + { + "advantages": -1.0728836059570313e-07, + "advantages_std": 1.7315122485160828, + "clip_ratio": 0.0, + "completion_length": 87.04404830932617, + "epoch": 2.993984962406015, + "grad_norm": 5.9375, + "kl": 0.33801840990781784, + "learning_rate": 3.5037593984962407e-06, + "loss": 0.0378, + "num_tokens": 122050238.0, + "reward": -1.2605494730174542, + "reward_std": 6.6820306301116945, + "rewards/get_chromagram_reward": 0.6320303499698638, + "rewards/get_chromagram_reward_std": 0.106711595505476, + "rewards/get_intelligibility_reward": -4.391807705163956, + "rewards/get_intelligibility_reward_std": 10.786450862884521, + "rewards/get_target_len_reward": -0.021870699431747197, + "rewards/get_target_len_reward_std": 0.06427764222025871, + "step": 3980 + }, + { + "advantages": -3.352760788999376e-08, + "advantages_std": 1.570639932155609, + "clip_ratio": 0.0, + "completion_length": 86.67654876708984, + "epoch": 3.0022556390977444, + "grad_norm": 6.625, + "kl": 0.3771525263786316, + "learning_rate": 3.5e-06, + "loss": 0.0436, + "num_tokens": 122356745.0, + "reward": -1.5819456607103348, + "reward_std": 6.953858041763306, + "rewards/get_chromagram_reward": 0.6200494110584259, + "rewards/get_chromagram_reward_std": 0.10788874998688698, + "rewards/get_intelligibility_reward": -5.346407115459442, + "rewards/get_intelligibility_reward_std": 11.080866146087647, + "rewards/get_target_len_reward": -0.01947901090607047, + "rewards/get_target_len_reward_std": 0.06660321317613124, + "step": 3990 + }, + { + "advantages": -7.202227827463049e-08, + "advantages_std": 1.568999421596527, + "clip_ratio": 0.0, + "completion_length": 84.76785888671876, + "epoch": 3.0097744360902254, + "grad_norm": 5.65625, + "kl": 0.28501827269792557, + "learning_rate": 3.4962406015037596e-06, + "loss": 0.0331, + "num_tokens": 122657611.0, + "reward": -1.4566645920276642, + "reward_std": 6.35735330581665, + "rewards/get_chromagram_reward": 0.6238480865955353, + "rewards/get_chromagram_reward_std": 0.11462956219911576, + "rewards/get_intelligibility_reward": -4.972574901580811, + "rewards/get_intelligibility_reward_std": 10.054164218902589, + "rewards/get_target_len_reward": -0.021266722306609153, + "rewards/get_target_len_reward_std": 0.06082735937088728, + "step": 4000 + }, + { + "advantages": 2.60521970929517e-07, + "advantages_std": 1.5789404511451721, + "clip_ratio": 0.0, + "completion_length": 87.2101203918457, + "epoch": 3.017293233082707, + "grad_norm": 4.84375, + "kl": 0.29096812158823016, + "learning_rate": 3.492481203007519e-06, + "loss": 0.0343, + "num_tokens": 122965654.0, + "reward": -1.1574354212731124, + "reward_std": 6.544857168197632, + "rewards/get_chromagram_reward": 0.6343218684196472, + "rewards/get_chromagram_reward_std": 0.12248421981930732, + "rewards/get_intelligibility_reward": -4.086401665210724, + "rewards/get_intelligibility_reward_std": 10.68290309906006, + "rewards/get_target_len_reward": -0.020226311590522527, + "rewards/get_target_len_reward_std": 0.060124521143734455, + "step": 4010 + }, + { + "advantages": 6.233653380149917e-08, + "advantages_std": 1.6112825989723205, + "clip_ratio": 0.0, + "completion_length": 90.42916870117188, + "epoch": 3.024812030075188, + "grad_norm": 6.40625, + "kl": 0.5243023946881294, + "learning_rate": 3.4887218045112785e-06, + "loss": 0.0519, + "num_tokens": 123282490.0, + "reward": -1.1066797733306886, + "reward_std": 6.21809573173523, + "rewards/get_chromagram_reward": 0.6188024818897248, + "rewards/get_chromagram_reward_std": 0.10658950209617615, + "rewards/get_intelligibility_reward": -3.921287989616394, + "rewards/get_intelligibility_reward_std": 10.159177494049072, + "rewards/get_target_len_reward": -0.017553656082600354, + "rewards/get_target_len_reward_std": 0.04201601464301348, + "step": 4020 + }, + { + "advantages": -1.4801820782395226e-07, + "advantages_std": 1.5155102372169496, + "clip_ratio": 0.0, + "completion_length": 83.6482162475586, + "epoch": 3.032330827067669, + "grad_norm": 7.03125, + "kl": 0.2800201430916786, + "learning_rate": 3.4849624060150377e-06, + "loss": 0.0295, + "num_tokens": 123579919.0, + "reward": -1.9094644904136657, + "reward_std": 6.683042049407959, + "rewards/get_chromagram_reward": 0.6124842643737793, + "rewards/get_chromagram_reward_std": 0.10707958713173867, + "rewards/get_intelligibility_reward": -6.324036312103272, + "rewards/get_intelligibility_reward_std": 10.17835292816162, + "rewards/get_target_len_reward": -0.016841100715100765, + "rewards/get_target_len_reward_std": 0.05463197343051433, + "step": 4030 + }, + { + "advantages": -3.23355206965914e-07, + "advantages_std": 1.685700011253357, + "clip_ratio": 0.0, + "completion_length": 87.7982162475586, + "epoch": 3.0398496240601505, + "grad_norm": 7.03125, + "kl": 0.33719453066587446, + "learning_rate": 3.4812030075187973e-06, + "loss": 0.0335, + "num_tokens": 123889945.0, + "reward": -1.158366894721985, + "reward_std": 6.468680953979492, + "rewards/get_chromagram_reward": 0.6212324619293212, + "rewards/get_chromagram_reward_std": 0.10868031159043312, + "rewards/get_intelligibility_reward": -4.077363419532776, + "rewards/get_intelligibility_reward_std": 10.567723560333253, + "rewards/get_target_len_reward": -0.018969547282904387, + "rewards/get_target_len_reward_std": 0.055462539196014404, + "step": 4040 + }, + { + "advantages": -3.690520955501597e-07, + "advantages_std": 1.6781371116638184, + "clip_ratio": 0.0, + "completion_length": 87.94702606201172, + "epoch": 3.0473684210526315, + "grad_norm": 12.4375, + "kl": 0.35713528394699096, + "learning_rate": 3.4774436090225565e-06, + "loss": 0.0418, + "num_tokens": 124199361.0, + "reward": -1.5527551651000977, + "reward_std": 6.463404417037964, + "rewards/get_chromagram_reward": 0.6307406187057495, + "rewards/get_chromagram_reward_std": 0.12030550241470336, + "rewards/get_intelligibility_reward": -5.258705353736877, + "rewards/get_intelligibility_reward_std": 10.130701637268066, + "rewards/get_target_len_reward": -0.03030052110552788, + "rewards/get_target_len_reward_std": 0.09487388208508492, + "step": 4050 + }, + { + "advantages": -8.524705986445724e-07, + "advantages_std": 1.6060773849487304, + "clip_ratio": 0.0, + "completion_length": 85.1053581237793, + "epoch": 3.054887218045113, + "grad_norm": 240.0, + "kl": 0.36646874248981476, + "learning_rate": 3.473684210526316e-06, + "loss": 0.0379, + "num_tokens": 124500506.0, + "reward": -1.5579203933477401, + "reward_std": 6.218460988998413, + "rewards/get_chromagram_reward": 0.6300382137298584, + "rewards/get_chromagram_reward_std": 0.11434244513511657, + "rewards/get_intelligibility_reward": -5.2850764155387875, + "rewards/get_intelligibility_reward_std": 9.650420475006104, + "rewards/get_target_len_reward": -0.018722762074321508, + "rewards/get_target_len_reward_std": 0.060862186923623086, + "step": 4060 + }, + { + "advantages": 3.8469832190912713e-07, + "advantages_std": 1.6515644788742065, + "clip_ratio": 0.0, + "completion_length": 87.67440567016601, + "epoch": 3.062406015037594, + "grad_norm": 7.625, + "kl": 1.3306469723582268, + "learning_rate": 3.4699248120300754e-06, + "loss": 0.1346, + "num_tokens": 124809554.0, + "reward": -1.245470690727234, + "reward_std": 6.5784827709198, + "rewards/get_chromagram_reward": 0.617160576581955, + "rewards/get_chromagram_reward_std": 0.12427505478262901, + "rewards/get_intelligibility_reward": -4.331724762916565, + "rewards/get_intelligibility_reward_std": 10.69231686592102, + "rewards/get_target_len_reward": -0.02184757627546787, + "rewards/get_target_len_reward_std": 0.058031280897557734, + "step": 4070 + }, + { + "advantages": 6.439785323664182e-07, + "advantages_std": 1.5893252611160278, + "clip_ratio": 0.0, + "completion_length": 85.86964416503906, + "epoch": 3.069924812030075, + "grad_norm": 7.1875, + "kl": 0.33594403713941573, + "learning_rate": 3.4661654135338346e-06, + "loss": 0.0411, + "num_tokens": 125113679.0, + "reward": -1.4050700664520264, + "reward_std": 6.404324150085449, + "rewards/get_chromagram_reward": 0.6259469866752625, + "rewards/get_chromagram_reward_std": 0.12025773078203202, + "rewards/get_intelligibility_reward": -4.814673900604248, + "rewards/get_intelligibility_reward_std": 10.243736839294433, + "rewards/get_target_len_reward": -0.026482987217605114, + "rewards/get_target_len_reward_std": 0.09377836473286152, + "step": 4080 + }, + { + "advantages": -9.785095542724775e-08, + "advantages_std": 1.579916250705719, + "clip_ratio": 0.0, + "completion_length": 88.43095474243164, + "epoch": 3.0774436090225565, + "grad_norm": 10.875, + "kl": 0.34741307944059374, + "learning_rate": 3.4624060150375943e-06, + "loss": 0.0371, + "num_tokens": 125425440.0, + "reward": -1.3672380074858665, + "reward_std": 6.848063945770264, + "rewards/get_chromagram_reward": 0.6082988262176514, + "rewards/get_chromagram_reward_std": 0.11984210386872292, + "rewards/get_intelligibility_reward": -4.689711439609527, + "rewards/get_intelligibility_reward_std": 11.027862167358398, + "rewards/get_target_len_reward": -0.020301106479018928, + "rewards/get_target_len_reward_std": 0.06079982779920101, + "step": 4090 + }, + { + "advantages": 4.721184776457221e-07, + "advantages_std": 1.558591866493225, + "clip_ratio": 0.0, + "completion_length": 88.94464416503907, + "epoch": 3.0849624060150376, + "grad_norm": 5.875, + "kl": 0.2803475186228752, + "learning_rate": 3.4586466165413535e-06, + "loss": 0.0303, + "num_tokens": 125737918.0, + "reward": -1.2318539798259736, + "reward_std": 6.5522034645080565, + "rewards/get_chromagram_reward": 0.6231365621089935, + "rewards/get_chromagram_reward_std": 0.11571791395545006, + "rewards/get_intelligibility_reward": -4.298397623747587, + "rewards/get_intelligibility_reward_std": 10.569544792175293, + "rewards/get_target_len_reward": -0.020300750527530907, + "rewards/get_target_len_reward_std": 0.04793478585779667, + "step": 4100 + }, + { + "advantages": -1.5075007979703515e-07, + "advantages_std": 1.5395202040672302, + "clip_ratio": 0.0, + "completion_length": 87.48690643310547, + "epoch": 3.0924812030075186, + "grad_norm": 5.28125, + "kl": 1.836045852303505, + "learning_rate": 3.454887218045113e-06, + "loss": 0.1841, + "num_tokens": 126045904.0, + "reward": -1.3981635391712188, + "reward_std": 6.713280916213989, + "rewards/get_chromagram_reward": 0.6141719341278076, + "rewards/get_chromagram_reward_std": 0.11686758324503899, + "rewards/get_intelligibility_reward": -4.790717744827271, + "rewards/get_intelligibility_reward_std": 10.80199375152588, + "rewards/get_target_len_reward": -0.017944508977234364, + "rewards/get_target_len_reward_std": 0.04508624579757452, + "step": 4110 + }, + { + "advantages": 5.513429854886454e-08, + "advantages_std": 1.6173288822174072, + "clip_ratio": 0.0, + "completion_length": 87.71904983520508, + "epoch": 3.1, + "grad_norm": 5.4375, + "kl": 0.2904201149940491, + "learning_rate": 3.4511278195488724e-06, + "loss": 0.0313, + "num_tokens": 126355656.0, + "reward": -1.0718179211020469, + "reward_std": 6.851891374588012, + "rewards/get_chromagram_reward": 0.6362843096256257, + "rewards/get_chromagram_reward_std": 0.11849569082260132, + "rewards/get_intelligibility_reward": -3.8306349754333495, + "rewards/get_intelligibility_reward_std": 11.316781330108643, + "rewards/get_target_len_reward": -0.021102873608469963, + "rewards/get_target_len_reward_std": 0.05865535549819469, + "step": 4120 + }, + { + "advantages": -2.980231315063975e-08, + "advantages_std": 1.531466042995453, + "clip_ratio": 0.0, + "completion_length": 89.23035888671875, + "epoch": 3.107518796992481, + "grad_norm": 6.6875, + "kl": 0.34578198492527007, + "learning_rate": 3.447368421052632e-06, + "loss": 0.0366, + "num_tokens": 126668631.0, + "reward": -1.4846302151679993, + "reward_std": 6.385497617721557, + "rewards/get_chromagram_reward": 0.6100890100002289, + "rewards/get_chromagram_reward_std": 0.11665020361542702, + "rewards/get_intelligibility_reward": -5.043470954895019, + "rewards/get_intelligibility_reward_std": 10.072954416275024, + "rewards/get_target_len_reward": -0.020508491061627866, + "rewards/get_target_len_reward_std": 0.05244751274585724, + "step": 4130 + }, + { + "advantages": 3.315510070933669e-08, + "advantages_std": 1.5963869452476502, + "clip_ratio": 0.0, + "completion_length": 85.0803596496582, + "epoch": 3.1150375939849626, + "grad_norm": 9.6875, + "kl": 0.3159720331430435, + "learning_rate": 3.4436090225563912e-06, + "loss": 0.0325, + "num_tokens": 126970315.0, + "reward": -1.7319731652736663, + "reward_std": 7.204164934158325, + "rewards/get_chromagram_reward": 0.6253066539764405, + "rewards/get_chromagram_reward_std": 0.11562438681721687, + "rewards/get_intelligibility_reward": -5.801904332637787, + "rewards/get_intelligibility_reward_std": 11.298967266082764, + "rewards/get_target_len_reward": -0.01932156188413501, + "rewards/get_target_len_reward_std": 0.05420879852026701, + "step": 4140 + }, + { + "advantages": 6.544093583471522e-07, + "advantages_std": 1.646132493019104, + "clip_ratio": 0.0, + "completion_length": 88.97619247436523, + "epoch": 3.1225563909774436, + "grad_norm": 6.9375, + "kl": 0.3055782064795494, + "learning_rate": 3.439849624060151e-06, + "loss": 0.0408, + "num_tokens": 127282894.0, + "reward": -1.6036958336830138, + "reward_std": 6.741269922256469, + "rewards/get_chromagram_reward": 0.6201092064380646, + "rewards/get_chromagram_reward_std": 0.11851846948266029, + "rewards/get_intelligibility_reward": -5.403568816184998, + "rewards/get_intelligibility_reward_std": 10.67822847366333, + "rewards/get_target_len_reward": -0.027627493347972633, + "rewards/get_target_len_reward_std": 0.08369314391165972, + "step": 4150 + }, + { + "advantages": 1.5820066820992906e-07, + "advantages_std": 1.673720395565033, + "clip_ratio": 0.0, + "completion_length": 83.95952682495117, + "epoch": 3.1300751879699247, + "grad_norm": 712.0, + "kl": 0.35793513655662534, + "learning_rate": 3.43609022556391e-06, + "loss": 0.0385, + "num_tokens": 127581882.0, + "reward": -1.3141912584193052, + "reward_std": 6.152334928512573, + "rewards/get_chromagram_reward": 0.6244514942169189, + "rewards/get_chromagram_reward_std": 0.13061213493347168, + "rewards/get_intelligibility_reward": -4.543868839740753, + "rewards/get_intelligibility_reward_std": 9.792318058013915, + "rewards/get_target_len_reward": -0.02315631527453661, + "rewards/get_target_len_reward_std": 0.06220987867563963, + "step": 4160 + }, + { + "advantages": 4.592041378259637e-07, + "advantages_std": 1.6728333115577698, + "clip_ratio": 0.0, + "completion_length": 88.5232162475586, + "epoch": 3.137593984962406, + "grad_norm": 6.71875, + "kl": 0.29836671203374865, + "learning_rate": 3.4323308270676693e-06, + "loss": 0.0321, + "num_tokens": 127893670.0, + "reward": -1.011973148584366, + "reward_std": 6.23221607208252, + "rewards/get_chromagram_reward": 0.6238932073116302, + "rewards/get_chromagram_reward_std": 0.10449873432517051, + "rewards/get_intelligibility_reward": -3.6372927367687224, + "rewards/get_intelligibility_reward_std": 10.114261817932128, + "rewards/get_target_len_reward": -0.022519584652036427, + "rewards/get_target_len_reward_std": 0.06431333236396312, + "step": 4170 + }, + { + "advantages": -4.6566130720293584e-07, + "advantages_std": 1.4346014618873597, + "clip_ratio": 0.0, + "completion_length": 82.23154907226562, + "epoch": 3.145112781954887, + "grad_norm": 10.1875, + "kl": 0.3649785041809082, + "learning_rate": 3.428571428571429e-06, + "loss": 0.0445, + "num_tokens": 128187560.0, + "reward": -1.5847628176212312, + "reward_std": 6.996919250488281, + "rewards/get_chromagram_reward": 0.6297712743282318, + "rewards/get_chromagram_reward_std": 0.12266267240047454, + "rewards/get_intelligibility_reward": -5.356306481361389, + "rewards/get_intelligibility_reward_std": 11.103976488113403, + "rewards/get_target_len_reward": -0.0277529826387763, + "rewards/get_target_len_reward_std": 0.08649206086993218, + "step": 4180 + }, + { + "advantages": 1.5919408538067615e-07, + "advantages_std": 1.5202927470207215, + "clip_ratio": 0.0, + "completion_length": 86.965478515625, + "epoch": 3.1526315789473682, + "grad_norm": 12.75, + "kl": 0.3561545431613922, + "learning_rate": 3.424812030075188e-06, + "loss": 0.0376, + "num_tokens": 128494265.0, + "reward": -1.5775128185749054, + "reward_std": 7.364692258834839, + "rewards/get_chromagram_reward": 0.6272209942340851, + "rewards/get_chromagram_reward_std": 0.11569681465625763, + "rewards/get_intelligibility_reward": -5.338159966468811, + "rewards/get_intelligibility_reward_std": 11.840453338623046, + "rewards/get_target_len_reward": -0.021599231753498316, + "rewards/get_target_len_reward_std": 0.06570550277829171, + "step": 4190 + }, + { + "advantages": 1.7397105858130147e-07, + "advantages_std": 1.581853848695755, + "clip_ratio": 0.0, + "completion_length": 83.82678680419922, + "epoch": 3.1601503759398497, + "grad_norm": 6.59375, + "kl": 0.48840090334415437, + "learning_rate": 3.421052631578948e-06, + "loss": 0.0553, + "num_tokens": 128792206.0, + "reward": -1.8744078114628793, + "reward_std": 6.874015951156617, + "rewards/get_chromagram_reward": 0.6129431843757629, + "rewards/get_chromagram_reward_std": 0.11338778585195541, + "rewards/get_intelligibility_reward": -6.217310810089112, + "rewards/get_intelligibility_reward_std": 10.503016376495362, + "rewards/get_target_len_reward": -0.01885540150105953, + "rewards/get_target_len_reward_std": 0.0585413821041584, + "step": 4200 + }, + { + "advantages": 2.3220976776983094e-07, + "advantages_std": 1.6209957242012023, + "clip_ratio": 0.0, + "completion_length": 88.77619247436523, + "epoch": 3.1676691729323307, + "grad_norm": 6.0, + "kl": 0.302373868227005, + "learning_rate": 3.417293233082707e-06, + "loss": 0.0313, + "num_tokens": 129104105.0, + "reward": -1.3020709201693534, + "reward_std": 6.620100355148315, + "rewards/get_chromagram_reward": 0.6347114503383636, + "rewards/get_chromagram_reward_std": 0.11503036171197892, + "rewards/get_intelligibility_reward": -4.517418801784515, + "rewards/get_intelligibility_reward_std": 10.546687459945678, + "rewards/get_target_len_reward": -0.023505217209458352, + "rewards/get_target_len_reward_std": 0.06372208669781684, + "step": 4210 + }, + { + "advantages": 1.820425197252007e-07, + "advantages_std": 1.699629533290863, + "clip_ratio": 0.0, + "completion_length": 87.24583587646484, + "epoch": 3.1751879699248122, + "grad_norm": 7.4375, + "kl": 0.3359279319643974, + "learning_rate": 3.4135338345864667e-06, + "loss": 0.0409, + "num_tokens": 129412696.0, + "reward": -1.1203251257538795, + "reward_std": 6.46322226524353, + "rewards/get_chromagram_reward": 0.6190076887607574, + "rewards/get_chromagram_reward_std": 0.11276645958423615, + "rewards/get_intelligibility_reward": -3.958110880851746, + "rewards/get_intelligibility_reward_std": 10.538503217697144, + "rewards/get_target_len_reward": -0.021872010454535483, + "rewards/get_target_len_reward_std": 0.06405184045433998, + "step": 4220 + }, + { + "advantages": -7.053217050412286e-08, + "advantages_std": 1.6343294858932496, + "clip_ratio": 0.0, + "completion_length": 84.58809661865234, + "epoch": 3.1827067669172933, + "grad_norm": 6.78125, + "kl": 0.27931652069091795, + "learning_rate": 3.409774436090226e-06, + "loss": 0.0302, + "num_tokens": 129713005.0, + "reward": -1.5770192325115204, + "reward_std": 6.417839813232422, + "rewards/get_chromagram_reward": 0.6163597881793976, + "rewards/get_chromagram_reward_std": 0.11609133034944534, + "rewards/get_intelligibility_reward": -5.326514279842376, + "rewards/get_intelligibility_reward_std": 9.951054191589355, + "rewards/get_target_len_reward": -0.020902913995087147, + "rewards/get_target_len_reward_std": 0.052859509550035, + "step": 4230 + }, + { + "advantages": 8.195635814445268e-09, + "advantages_std": 1.5448844194412232, + "clip_ratio": 0.0, + "completion_length": 85.81488189697265, + "epoch": 3.1902255639097743, + "grad_norm": 69.5, + "kl": 0.3631991773843765, + "learning_rate": 3.4060150375939856e-06, + "loss": 0.0406, + "num_tokens": 130016638.0, + "reward": -1.707671296596527, + "reward_std": 6.869535779953003, + "rewards/get_chromagram_reward": 0.6243698179721833, + "rewards/get_chromagram_reward_std": 0.10792898535728454, + "rewards/get_intelligibility_reward": -5.727483916282654, + "rewards/get_intelligibility_reward_std": 10.7578125, + "rewards/get_target_len_reward": -0.019899446703493594, + "rewards/get_target_len_reward_std": 0.05657290127128363, + "step": 4240 + }, + { + "advantages": -5.8983766848541565e-08, + "advantages_std": 1.5598833680152893, + "clip_ratio": 0.0, + "completion_length": 89.1851203918457, + "epoch": 3.197744360902256, + "grad_norm": 6.0, + "kl": 0.3095327615737915, + "learning_rate": 3.4022556390977448e-06, + "loss": 0.0325, + "num_tokens": 130330190.0, + "reward": -1.2964784324169158, + "reward_std": 6.4273745059967045, + "rewards/get_chromagram_reward": 0.6348487615585328, + "rewards/get_chromagram_reward_std": 0.10957509577274323, + "rewards/get_intelligibility_reward": -4.506310987472534, + "rewards/get_intelligibility_reward_std": 10.336654472351075, + "rewards/get_target_len_reward": -0.0179728452116251, + "rewards/get_target_len_reward_std": 0.04697036426514387, + "step": 4250 + }, + { + "advantages": 4.770855213109826e-07, + "advantages_std": 1.5714811325073241, + "clip_ratio": 0.0, + "completion_length": 88.26726455688477, + "epoch": 3.205263157894737, + "grad_norm": 25.5, + "kl": 0.3470078229904175, + "learning_rate": 3.3984962406015044e-06, + "loss": 0.0395, + "num_tokens": 130639846.0, + "reward": -1.596861571073532, + "reward_std": 7.196740436553955, + "rewards/get_chromagram_reward": 0.6274778127670289, + "rewards/get_chromagram_reward_std": 0.10798213258385658, + "rewards/get_intelligibility_reward": -5.391012513637543, + "rewards/get_intelligibility_reward_std": 11.393933773040771, + "rewards/get_target_len_reward": -0.027049866039305925, + "rewards/get_target_len_reward_std": 0.08225924111902713, + "step": 4260 + }, + { + "advantages": -8.766849290964274e-08, + "advantages_std": 1.6335813045501708, + "clip_ratio": 0.0, + "completion_length": 87.04404907226562, + "epoch": 3.212781954887218, + "grad_norm": 24.5, + "kl": 0.37787252515554426, + "learning_rate": 3.3947368421052636e-06, + "loss": 0.0501, + "num_tokens": 130947324.0, + "reward": -1.5003271281719208, + "reward_std": 6.7568199157714846, + "rewards/get_chromagram_reward": 0.6264678537845612, + "rewards/get_chromagram_reward_std": 0.11311981976032257, + "rewards/get_intelligibility_reward": -5.102498412132263, + "rewards/get_intelligibility_reward_std": 10.762168216705323, + "rewards/get_target_len_reward": -0.02495050337165594, + "rewards/get_target_len_reward_std": 0.07984323929995299, + "step": 4270 + }, + { + "advantages": 4.2983642032368154e-07, + "advantages_std": 1.4764443516731263, + "clip_ratio": 0.0, + "completion_length": 87.52797775268554, + "epoch": 3.2203007518796993, + "grad_norm": 7.9375, + "kl": 0.27519893944263457, + "learning_rate": 3.3909774436090224e-06, + "loss": 0.0331, + "num_tokens": 131256343.0, + "reward": -1.131436224281788, + "reward_std": 6.341731071472168, + "rewards/get_chromagram_reward": 0.6292557060718537, + "rewards/get_chromagram_reward_std": 0.11208050698041916, + "rewards/get_intelligibility_reward": -4.000416457653046, + "rewards/get_intelligibility_reward_std": 10.307479953765869, + "rewards/get_target_len_reward": -0.023147699516266586, + "rewards/get_target_len_reward_std": 0.05790396872907877, + "step": 4280 + }, + { + "advantages": 1.567105432087601e-07, + "advantages_std": 1.7350687742233277, + "clip_ratio": 0.0, + "completion_length": 86.61488265991211, + "epoch": 3.2278195488721804, + "grad_norm": 27.125, + "kl": 0.33322153240442276, + "learning_rate": 3.387218045112782e-06, + "loss": 0.0367, + "num_tokens": 131562398.0, + "reward": -1.4957343488931656, + "reward_std": 6.397630310058593, + "rewards/get_chromagram_reward": 0.6309226214885711, + "rewards/get_chromagram_reward_std": 0.11377585753798485, + "rewards/get_intelligibility_reward": -5.09162460565567, + "rewards/get_intelligibility_reward_std": 10.076897811889648, + "rewards/get_target_len_reward": -0.02650076886638999, + "rewards/get_target_len_reward_std": 0.06703396700322628, + "step": 4290 + }, + { + "advantages": -1.9446015784296832e-07, + "advantages_std": 1.5864139795303345, + "clip_ratio": 0.0, + "completion_length": 90.55357284545899, + "epoch": 3.235338345864662, + "grad_norm": 9.3125, + "kl": 2.30724019408226, + "learning_rate": 3.3834586466165413e-06, + "loss": 0.2299, + "num_tokens": 131879855.0, + "reward": -1.2683696322143079, + "reward_std": 7.173541069030762, + "rewards/get_chromagram_reward": 0.6216107368469238, + "rewards/get_chromagram_reward_std": 0.10671053901314735, + "rewards/get_intelligibility_reward": -4.412684118747711, + "rewards/get_intelligibility_reward_std": 11.600953006744385, + "rewards/get_target_len_reward": -0.014035291131585836, + "rewards/get_target_len_reward_std": 0.03394932132214308, + "step": 4300 + }, + { + "advantages": 9.869535868567425e-07, + "advantages_std": 1.5972688794136047, + "clip_ratio": 0.0, + "completion_length": 87.88631134033203, + "epoch": 3.242857142857143, + "grad_norm": 13.3125, + "kl": 0.3564337491989136, + "learning_rate": 3.379699248120301e-06, + "loss": 0.0376, + "num_tokens": 132188535.0, + "reward": -1.5792127377353609, + "reward_std": 6.730674457550049, + "rewards/get_chromagram_reward": 0.6139270603656769, + "rewards/get_chromagram_reward_std": 0.12355838790535927, + "rewards/get_intelligibility_reward": -5.331458967924118, + "rewards/get_intelligibility_reward_std": 10.577913284301758, + "rewards/get_target_len_reward": -0.020106138475239278, + "rewards/get_target_len_reward_std": 0.0641857735812664, + "step": 4310 + }, + { + "advantages": 2.992649868360786e-07, + "advantages_std": 1.4977822065353394, + "clip_ratio": 0.0, + "completion_length": 83.48511962890625, + "epoch": 3.250375939849624, + "grad_norm": 6.6875, + "kl": 0.3735776156187057, + "learning_rate": 3.37593984962406e-06, + "loss": 0.0468, + "num_tokens": 132486001.0, + "reward": -1.7604040503501892, + "reward_std": 6.668347644805908, + "rewards/get_chromagram_reward": 0.6218710958957672, + "rewards/get_chromagram_reward_std": 0.10823953151702881, + "rewards/get_intelligibility_reward": -5.87978732585907, + "rewards/get_intelligibility_reward_std": 10.34724760055542, + "rewards/get_target_len_reward": -0.02329575140029192, + "rewards/get_target_len_reward_std": 0.07086060345172882, + "step": 4320 + }, + { + "advantages": 6.556511209510063e-08, + "advantages_std": 1.5302933931350708, + "clip_ratio": 0.0, + "completion_length": 83.34404830932617, + "epoch": 3.2578947368421054, + "grad_norm": 688.0, + "kl": 0.4539714246988297, + "learning_rate": 3.37218045112782e-06, + "loss": 0.051, + "num_tokens": 132782535.0, + "reward": -1.3723966896533966, + "reward_std": 6.291196537017822, + "rewards/get_chromagram_reward": 0.6126464605331421, + "rewards/get_chromagram_reward_std": 0.11417317017912865, + "rewards/get_intelligibility_reward": -4.7077836990356445, + "rewards/get_intelligibility_reward_std": 10.042604541778564, + "rewards/get_target_len_reward": -0.022052537463605405, + "rewards/get_target_len_reward_std": 0.06837241873145103, + "step": 4330 + }, + { + "advantages": 6.258488127741657e-08, + "advantages_std": 1.5377010941505431, + "clip_ratio": 0.0, + "completion_length": 87.44643096923828, + "epoch": 3.2654135338345864, + "grad_norm": 5.03125, + "kl": 0.31609337627887724, + "learning_rate": 3.368421052631579e-06, + "loss": 0.0319, + "num_tokens": 133090793.0, + "reward": -1.359285932779312, + "reward_std": 6.470762872695923, + "rewards/get_chromagram_reward": 0.6181576669216156, + "rewards/get_chromagram_reward_std": 0.113900126516819, + "rewards/get_intelligibility_reward": -4.6754331350326535, + "rewards/get_intelligibility_reward_std": 10.384093761444092, + "rewards/get_target_len_reward": -0.020582077372819186, + "rewards/get_target_len_reward_std": 0.05547735020518303, + "step": 4340 + }, + { + "advantages": -6.258487887933483e-08, + "advantages_std": 1.5956597447395324, + "clip_ratio": 0.0, + "completion_length": 88.53869247436523, + "epoch": 3.272932330827068, + "grad_norm": 12.3125, + "kl": 0.3077188953757286, + "learning_rate": 3.3646616541353387e-06, + "loss": 0.035, + "num_tokens": 133402151.0, + "reward": -1.2024756371974945, + "reward_std": 6.525686979293823, + "rewards/get_chromagram_reward": 0.6273522794246673, + "rewards/get_chromagram_reward_std": 0.11311362758278846, + "rewards/get_intelligibility_reward": -4.21550475358963, + "rewards/get_intelligibility_reward_std": 10.441200542449952, + "rewards/get_target_len_reward": -0.01927430145442486, + "rewards/get_target_len_reward_std": 0.05576913226395845, + "step": 4350 + }, + { + "advantages": 1.668930101228483e-07, + "advantages_std": 1.5654626369476319, + "clip_ratio": 0.0, + "completion_length": 90.2880973815918, + "epoch": 3.280451127819549, + "grad_norm": 72.5, + "kl": 0.30325102657079694, + "learning_rate": 3.360902255639098e-06, + "loss": 0.0348, + "num_tokens": 133718235.0, + "reward": -1.231583520770073, + "reward_std": 6.6298370361328125, + "rewards/get_chromagram_reward": 0.6353707134723663, + "rewards/get_chromagram_reward_std": 0.10845707952976227, + "rewards/get_intelligibility_reward": -4.312479996681214, + "rewards/get_intelligibility_reward_std": 10.759493350982666, + "rewards/get_target_len_reward": -0.01764109553769231, + "rewards/get_target_len_reward_std": 0.050665826164186, + "step": 4360 + }, + { + "advantages": 4.169841787415862e-07, + "advantages_std": 1.686136043071747, + "clip_ratio": 0.0, + "completion_length": 86.95416870117188, + "epoch": 3.28796992481203, + "grad_norm": 12.0625, + "kl": 0.30330993682146073, + "learning_rate": 3.357142857142857e-06, + "loss": 0.0316, + "num_tokens": 134024293.0, + "reward": -1.4639351397752762, + "reward_std": 6.9123969078063965, + "rewards/get_chromagram_reward": 0.6185981154441833, + "rewards/get_chromagram_reward_std": 0.11106212437152863, + "rewards/get_intelligibility_reward": -4.990768309682608, + "rewards/get_intelligibility_reward_std": 11.024269771575927, + "rewards/get_target_len_reward": -0.019634839612990618, + "rewards/get_target_len_reward_std": 0.05602564513683319, + "step": 4370 + }, + { + "advantages": -2.493461053632018e-07, + "advantages_std": 1.4997942090034484, + "clip_ratio": 0.0, + "completion_length": 85.8958351135254, + "epoch": 3.2954887218045115, + "grad_norm": 6.75, + "kl": 0.37191055417060853, + "learning_rate": 3.3533834586466168e-06, + "loss": 0.0448, + "num_tokens": 134327979.0, + "reward": -1.4511731714010239, + "reward_std": 6.788499164581299, + "rewards/get_chromagram_reward": 0.6098757028579712, + "rewards/get_chromagram_reward_std": 0.1217208631336689, + "rewards/get_intelligibility_reward": -4.94060070514679, + "rewards/get_intelligibility_reward_std": 10.785972690582275, + "rewards/get_target_len_reward": -0.0227941183373332, + "rewards/get_target_len_reward_std": 0.07210518475621938, + "step": 4380 + }, + { + "advantages": -1.7806888479299233e-07, + "advantages_std": 1.7041251063346863, + "clip_ratio": 0.0, + "completion_length": 83.92202529907226, + "epoch": 3.3030075187969925, + "grad_norm": 8.125, + "kl": 0.36032059490680696, + "learning_rate": 3.349624060150376e-06, + "loss": 0.0346, + "num_tokens": 134626620.0, + "reward": -1.2390080988407135, + "reward_std": 6.203968286514282, + "rewards/get_chromagram_reward": 0.6310724079608917, + "rewards/get_chromagram_reward_std": 0.11540384292602539, + "rewards/get_intelligibility_reward": -4.326459050178528, + "rewards/get_intelligibility_reward_std": 9.993881130218506, + "rewards/get_target_len_reward": -0.021637386176735163, + "rewards/get_target_len_reward_std": 0.05318964570760727, + "step": 4390 + }, + { + "advantages": -1.288950535638378e-07, + "advantages_std": 1.6415579080581666, + "clip_ratio": 0.0, + "completion_length": 85.83631134033203, + "epoch": 3.3105263157894735, + "grad_norm": 35.75, + "kl": 0.3649152874946594, + "learning_rate": 3.3458646616541356e-06, + "loss": 0.0388, + "num_tokens": 134930223.0, + "reward": -1.3480420507490636, + "reward_std": 6.561384868621826, + "rewards/get_chromagram_reward": 0.627709686756134, + "rewards/get_chromagram_reward_std": 0.10990332737565041, + "rewards/get_intelligibility_reward": -4.6510482132434845, + "rewards/get_intelligibility_reward_std": 10.510717582702636, + "rewards/get_target_len_reward": -0.020787397399544716, + "rewards/get_target_len_reward_std": 0.05646887123584747, + "step": 4400 + }, + { + "advantages": 6.740292057827446e-07, + "advantages_std": 1.5014996767044066, + "clip_ratio": 0.0, + "completion_length": 86.23452453613281, + "epoch": 3.318045112781955, + "grad_norm": 6.0, + "kl": 0.3484359845519066, + "learning_rate": 3.342105263157895e-06, + "loss": 0.0386, + "num_tokens": 135235842.0, + "reward": -1.4315605893731118, + "reward_std": 6.73180627822876, + "rewards/get_chromagram_reward": 0.6199473381042481, + "rewards/get_chromagram_reward_std": 0.11735682263970375, + "rewards/get_intelligibility_reward": -4.8925375759601595, + "rewards/get_intelligibility_reward_std": 10.742376804351807, + "rewards/get_target_len_reward": -0.02209125757217407, + "rewards/get_target_len_reward_std": 0.05951045509427786, + "step": 4410 + }, + { + "advantages": 2.495944428915209e-07, + "advantages_std": 1.6141934156417848, + "clip_ratio": 0.0, + "completion_length": 87.96666870117187, + "epoch": 3.325563909774436, + "grad_norm": 6.1875, + "kl": 0.29121205657720567, + "learning_rate": 3.3383458646616545e-06, + "loss": 0.0353, + "num_tokens": 135545071.0, + "reward": -1.6051747798919678, + "reward_std": 6.904460906982422, + "rewards/get_chromagram_reward": 0.6305911779403687, + "rewards/get_chromagram_reward_std": 0.11587974280118943, + "rewards/get_intelligibility_reward": -5.420707416534424, + "rewards/get_intelligibility_reward_std": 10.881964015960694, + "rewards/get_target_len_reward": -0.02540780254639685, + "rewards/get_target_len_reward_std": 0.08899888359010219, + "step": 4420 + }, + { + "advantages": 1.688798274557257e-08, + "advantages_std": 1.6381218433380127, + "clip_ratio": 0.0, + "completion_length": 86.28392944335937, + "epoch": 3.333082706766917, + "grad_norm": 11.625, + "kl": 0.32549781948328016, + "learning_rate": 3.3345864661654137e-06, + "loss": 0.0356, + "num_tokens": 135850066.0, + "reward": -1.3877425879240035, + "reward_std": 6.77799243927002, + "rewards/get_chromagram_reward": 0.6214170634746552, + "rewards/get_chromagram_reward_std": 0.11292667016386986, + "rewards/get_intelligibility_reward": -4.76411754488945, + "rewards/get_intelligibility_reward_std": 10.808771133422852, + "rewards/get_target_len_reward": -0.02052699653431773, + "rewards/get_target_len_reward_std": 0.060480015352368355, + "step": 4430 + }, + { + "advantages": -5.215406439162962e-07, + "advantages_std": 1.612470018863678, + "clip_ratio": 0.0, + "completion_length": 89.95238342285157, + "epoch": 3.3406015037593986, + "grad_norm": 6.46875, + "kl": 0.40137475579977033, + "learning_rate": 3.3308270676691734e-06, + "loss": 0.0445, + "num_tokens": 136164492.0, + "reward": -1.291347751021385, + "reward_std": 6.56026291847229, + "rewards/get_chromagram_reward": 0.6152363359928131, + "rewards/get_chromagram_reward_std": 0.1162784643471241, + "rewards/get_intelligibility_reward": -4.46769335269928, + "rewards/get_intelligibility_reward_std": 10.557899475097656, + "rewards/get_target_len_reward": -0.021585862338542938, + "rewards/get_target_len_reward_std": 0.06894133090972901, + "step": 4440 + }, + { + "advantages": -3.2906732094772906e-07, + "advantages_std": 1.5803971409797668, + "clip_ratio": 0.0, + "completion_length": 84.28928604125977, + "epoch": 3.3481203007518796, + "grad_norm": 5.75, + "kl": 0.3210767716169357, + "learning_rate": 3.3270676691729326e-06, + "loss": 0.0362, + "num_tokens": 136463505.0, + "reward": -1.60392969250679, + "reward_std": 6.731313037872314, + "rewards/get_chromagram_reward": 0.6151593327522278, + "rewards/get_chromagram_reward_std": 0.1124209813773632, + "rewards/get_intelligibility_reward": -5.408218407630921, + "rewards/get_intelligibility_reward_std": 10.616331481933594, + "rewards/get_target_len_reward": -0.01872968636453152, + "rewards/get_target_len_reward_std": 0.059635018557310106, + "step": 4450 + }, + { + "advantages": -1.527369065001949e-07, + "advantages_std": 1.6485271215438844, + "clip_ratio": 0.0, + "completion_length": 86.61131134033204, + "epoch": 3.355639097744361, + "grad_norm": 10.4375, + "kl": 0.304203300178051, + "learning_rate": 3.3233082706766922e-06, + "loss": 0.0305, + "num_tokens": 136769357.0, + "reward": -1.5627503097057343, + "reward_std": 6.813011837005615, + "rewards/get_chromagram_reward": 0.6316563546657562, + "rewards/get_chromagram_reward_std": 0.12207503393292427, + "rewards/get_intelligibility_reward": -5.299207505583763, + "rewards/get_intelligibility_reward_std": 10.698434257507325, + "rewards/get_target_len_reward": -0.020699765533208847, + "rewards/get_target_len_reward_std": 0.04991193488240242, + "step": 4460 + }, + { + "advantages": 4.86274569766465e-07, + "advantages_std": 1.6251121640205384, + "clip_ratio": 0.0, + "completion_length": 89.69643096923828, + "epoch": 3.363157894736842, + "grad_norm": 4.84375, + "kl": 0.42891152799129484, + "learning_rate": 3.3195488721804515e-06, + "loss": 0.0485, + "num_tokens": 137084220.0, + "reward": -1.1922965973615647, + "reward_std": 6.639740610122681, + "rewards/get_chromagram_reward": 0.6291784584522248, + "rewards/get_chromagram_reward_std": 0.10962832942605019, + "rewards/get_intelligibility_reward": -4.18378599062562, + "rewards/get_intelligibility_reward_std": 10.700201082229615, + "rewards/get_target_len_reward": -0.022282037045806648, + "rewards/get_target_len_reward_std": 0.06122244410216808, + "step": 4470 + }, + { + "advantages": 2.7716159820556643e-07, + "advantages_std": 1.5915523767471313, + "clip_ratio": 0.0, + "completion_length": 84.36666793823242, + "epoch": 3.370676691729323, + "grad_norm": 6.03125, + "kl": 0.390315043926239, + "learning_rate": 3.3157894736842107e-06, + "loss": 0.0408, + "num_tokens": 137384058.0, + "reward": -1.3131475508213044, + "reward_std": 6.7762237071990965, + "rewards/get_chromagram_reward": 0.6207613468170166, + "rewards/get_chromagram_reward_std": 0.11534344181418418, + "rewards/get_intelligibility_reward": -4.539241921901703, + "rewards/get_intelligibility_reward_std": 10.954913902282716, + "rewards/get_target_len_reward": -0.02096187099814415, + "rewards/get_target_len_reward_std": 0.061504085268825295, + "step": 4480 + }, + { + "advantages": 3.4123660181961667e-07, + "advantages_std": 1.507073664665222, + "clip_ratio": 0.0, + "completion_length": 88.43333358764649, + "epoch": 3.3781954887218046, + "grad_norm": 6.53125, + "kl": 0.32767518907785415, + "learning_rate": 3.3120300751879703e-06, + "loss": 0.0375, + "num_tokens": 137694904.0, + "reward": -1.521827945113182, + "reward_std": 6.90161714553833, + "rewards/get_chromagram_reward": 0.6179491519927979, + "rewards/get_chromagram_reward_std": 0.10849400088191033, + "rewards/get_intelligibility_reward": -5.163233387470245, + "rewards/get_intelligibility_reward_std": 10.99599552154541, + "rewards/get_target_len_reward": -0.02019930398091674, + "rewards/get_target_len_reward_std": 0.06717992164194583, + "step": 4490 + }, + { + "advantages": -1.70990843173513e-07, + "advantages_std": 1.6654401540756225, + "clip_ratio": 0.0, + "completion_length": 84.30773849487305, + "epoch": 3.3857142857142857, + "grad_norm": 6.125, + "kl": 0.35676948428153993, + "learning_rate": 3.3082706766917295e-06, + "loss": 0.0372, + "num_tokens": 137994260.0, + "reward": -1.6494194865226746, + "reward_std": 6.620905637741089, + "rewards/get_chromagram_reward": 0.6207099735736847, + "rewards/get_chromagram_reward_std": 0.11385154500603675, + "rewards/get_intelligibility_reward": -5.549079060554504, + "rewards/get_intelligibility_reward_std": 10.367657470703126, + "rewards/get_target_len_reward": -0.019889032002538443, + "rewards/get_target_len_reward_std": 0.054109343141317365, + "step": 4500 + }, + { + "advantages": 8.779268085845615e-08, + "advantages_std": 1.5318343758583068, + "clip_ratio": 0.0, + "completion_length": 88.48333511352538, + "epoch": 3.3909774436090228, + "grad_norm": 10.6875, + "kl": 0.2969478860497475, + "learning_rate": 3.304511278195489e-06, + "loss": 0.0394, + "num_tokens": 310857.0, + "reward": -1.3281305372714995, + "reward_std": 6.489425706863403, + "rewards/get_chromagram_reward": 0.6274168491363525, + "rewards/get_chromagram_reward_std": 0.1028469517827034, + "rewards/get_intelligibility_reward": -4.59235405921936, + "rewards/get_intelligibility_reward_std": 10.438762092590332, + "rewards/get_target_len_reward": -0.019454195350408553, + "rewards/get_target_len_reward_std": 0.06737818010151386, + "step": 4510 + }, + { + "advantages": -2.575417425987325e-07, + "advantages_std": 1.537089204788208, + "clip_ratio": 0.0, + "completion_length": 87.34881057739258, + "epoch": 3.398496240601504, + "grad_norm": 10.125, + "kl": 0.29680820405483244, + "learning_rate": 3.3007518796992484e-06, + "loss": 0.0333, + "num_tokens": 618257.0, + "reward": -1.3931241035461426, + "reward_std": 6.6218328952789305, + "rewards/get_chromagram_reward": 0.6188321650028229, + "rewards/get_chromagram_reward_std": 0.1196521833539009, + "rewards/get_intelligibility_reward": -4.778675246238708, + "rewards/get_intelligibility_reward_std": 10.661188697814941, + "rewards/get_target_len_reward": -0.019528946094214916, + "rewards/get_target_len_reward_std": 0.0580808324739337, + "step": 4520 + }, + { + "advantages": 6.085882752415728e-07, + "advantages_std": 1.6382742047309875, + "clip_ratio": 0.0, + "completion_length": 87.75, + "epoch": 3.406015037593985, + "grad_norm": 74.5, + "kl": 0.37056227773427963, + "learning_rate": 3.296992481203008e-06, + "loss": 0.0363, + "num_tokens": 927145.0, + "reward": -1.0709830440580845, + "reward_std": 6.485338163375855, + "rewards/get_chromagram_reward": 0.6164808750152588, + "rewards/get_chromagram_reward_std": 0.10840248018503189, + "rewards/get_intelligibility_reward": -3.810542845726013, + "rewards/get_intelligibility_reward_std": 10.537764549255371, + "rewards/get_target_len_reward": -0.01888696802780032, + "rewards/get_target_len_reward_std": 0.0492866700515151, + "step": 4530 + }, + { + "advantages": 2.6449561971730875e-07, + "advantages_std": 1.5188130497932435, + "clip_ratio": 0.0, + "completion_length": 84.0553581237793, + "epoch": 3.4135338345864663, + "grad_norm": 5.0625, + "kl": 0.38036876618862153, + "learning_rate": 3.2932330827067673e-06, + "loss": 0.0437, + "num_tokens": 1224484.0, + "reward": -2.0424502193927765, + "reward_std": 7.005794954299927, + "rewards/get_chromagram_reward": 0.6144976735115051, + "rewards/get_chromagram_reward_std": 0.11669495552778245, + "rewards/get_intelligibility_reward": -6.720308995246887, + "rewards/get_intelligibility_reward_std": 10.61027421951294, + "rewards/get_target_len_reward": -0.021538918651640416, + "rewards/get_target_len_reward_std": 0.07094106562435627, + "step": 4540 + }, + { + "advantages": -5.481143944052747e-07, + "advantages_std": 1.5452426671981812, + "clip_ratio": 0.0, + "completion_length": 85.57797775268554, + "epoch": 3.4210526315789473, + "grad_norm": 18.125, + "kl": 0.4040832698345184, + "learning_rate": 3.289473684210527e-06, + "loss": 0.0411, + "num_tokens": 1527343.0, + "reward": -1.387267404794693, + "reward_std": 6.645217752456665, + "rewards/get_chromagram_reward": 0.6343863129615783, + "rewards/get_chromagram_reward_std": 0.1153879277408123, + "rewards/get_intelligibility_reward": -4.774351906776428, + "rewards/get_intelligibility_reward_std": 10.63620548248291, + "rewards/get_target_len_reward": -0.021836266200989485, + "rewards/get_target_len_reward_std": 0.05838818326592445, + "step": 4550 + }, + { + "advantages": -3.427267358802055e-07, + "advantages_std": 1.5205845952033996, + "clip_ratio": 0.0, + "completion_length": 87.99464492797851, + "epoch": 3.4285714285714284, + "grad_norm": 6.46875, + "kl": 0.332698717713356, + "learning_rate": 3.285714285714286e-06, + "loss": 0.034, + "num_tokens": 1837449.0, + "reward": -1.2367383658885955, + "reward_std": 6.677933168411255, + "rewards/get_chromagram_reward": 0.6336679756641388, + "rewards/get_chromagram_reward_std": 0.11803798377513885, + "rewards/get_intelligibility_reward": -4.325644779205322, + "rewards/get_intelligibility_reward_std": 10.876698303222657, + "rewards/get_target_len_reward": -0.018238031212240456, + "rewards/get_target_len_reward_std": 0.04330310449004173, + "step": 4560 + }, + { + "advantages": -3.489355350438927e-07, + "advantages_std": 1.6003393173217773, + "clip_ratio": 0.0, + "completion_length": 90.34761962890624, + "epoch": 3.43609022556391, + "grad_norm": 7.8125, + "kl": 0.32162316888570786, + "learning_rate": 3.281954887218045e-06, + "loss": 0.0356, + "num_tokens": 2153465.0, + "reward": -1.3049261048436165, + "reward_std": 6.861018323898316, + "rewards/get_chromagram_reward": 0.6241649210453033, + "rewards/get_chromagram_reward_std": 0.1164263904094696, + "rewards/get_intelligibility_reward": -4.515636777877807, + "rewards/get_intelligibility_reward_std": 11.109533786773682, + "rewards/get_target_len_reward": -0.023306295182555913, + "rewards/get_target_len_reward_std": 0.06607088632881641, + "step": 4570 + }, + { + "advantages": -1.3560057441353023e-07, + "advantages_std": 1.559881293773651, + "clip_ratio": 0.0, + "completion_length": 83.56488342285157, + "epoch": 3.443609022556391, + "grad_norm": 6.0625, + "kl": 0.35016684532165526, + "learning_rate": 3.278195488721805e-06, + "loss": 0.042, + "num_tokens": 2449893.0, + "reward": -1.8763112545013427, + "reward_std": 7.130662345886231, + "rewards/get_chromagram_reward": 0.6139516234397888, + "rewards/get_chromagram_reward_std": 0.11391275078058243, + "rewards/get_intelligibility_reward": -6.219006633758545, + "rewards/get_intelligibility_reward_std": 10.96840171813965, + "rewards/get_target_len_reward": -0.023878414928913117, + "rewards/get_target_len_reward_std": 0.07354874908924103, + "step": 4580 + }, + { + "advantages": 2.5009116271235144e-07, + "advantages_std": 1.6059733986854554, + "clip_ratio": 0.0, + "completion_length": 84.68631134033203, + "epoch": 3.451127819548872, + "grad_norm": 6.4375, + "kl": 0.33188803791999816, + "learning_rate": 3.274436090225564e-06, + "loss": 0.035, + "num_tokens": 2750204.0, + "reward": -1.4741009950637818, + "reward_std": 6.520594120025635, + "rewards/get_chromagram_reward": 0.6148521661758423, + "rewards/get_chromagram_reward_std": 0.12287932783365249, + "rewards/get_intelligibility_reward": -5.016161251068115, + "rewards/get_intelligibility_reward_std": 10.359986591339112, + "rewards/get_target_len_reward": -0.02099353475496173, + "rewards/get_target_len_reward_std": 0.05685290042310953, + "step": 4590 + }, + { + "advantages": -4.27166703786952e-08, + "advantages_std": 1.4332894206047058, + "clip_ratio": 0.0, + "completion_length": 87.39881057739258, + "epoch": 3.4586466165413534, + "grad_norm": 7.875, + "kl": 0.31006584167480467, + "learning_rate": 3.270676691729324e-06, + "loss": 0.0335, + "num_tokens": 3058288.0, + "reward": -1.3671068586409092, + "reward_std": 6.128223896026611, + "rewards/get_chromagram_reward": 0.6180779874324799, + "rewards/get_chromagram_reward_std": 0.11426214426755905, + "rewards/get_intelligibility_reward": -4.699258416891098, + "rewards/get_intelligibility_reward_std": 9.655228281021119, + "rewards/get_target_len_reward": -0.0201399652287364, + "rewards/get_target_len_reward_std": 0.0592557929456234, + "step": 4600 + }, + { + "advantages": 1.0222197062148553e-06, + "advantages_std": 1.5769393920898438, + "clip_ratio": 0.0, + "completion_length": 85.55357284545899, + "epoch": 3.4661654135338344, + "grad_norm": 17.875, + "kl": 0.3076698824763298, + "learning_rate": 3.2669172932330827e-06, + "loss": 0.0353, + "num_tokens": 3361429.0, + "reward": -1.5159668922424316, + "reward_std": 6.47130823135376, + "rewards/get_chromagram_reward": 0.6088860273361206, + "rewards/get_chromagram_reward_std": 0.12104258313775063, + "rewards/get_intelligibility_reward": -5.12894773632288, + "rewards/get_intelligibility_reward_std": 10.181937217712402, + "rewards/get_target_len_reward": -0.02783880215138197, + "rewards/get_target_len_reward_std": 0.10612631607800722, + "step": 4610 + }, + { + "advantages": 5.679826216464789e-07, + "advantages_std": 1.5524175405502318, + "clip_ratio": 0.0, + "completion_length": 85.43214340209961, + "epoch": 3.473684210526316, + "grad_norm": 5.3125, + "kl": 1.2340461641550065, + "learning_rate": 3.2631578947368423e-06, + "loss": 0.1223, + "num_tokens": 3665014.0, + "reward": -1.3473031282424928, + "reward_std": 6.949818515777588, + "rewards/get_chromagram_reward": 0.6227212309837341, + "rewards/get_chromagram_reward_std": 0.12013640999794006, + "rewards/get_intelligibility_reward": -4.6440167903900145, + "rewards/get_intelligibility_reward_std": 11.21874017715454, + "rewards/get_target_len_reward": -0.02061356231570244, + "rewards/get_target_len_reward_std": 0.04689461421221495, + "step": 4620 + }, + { + "advantages": 5.125999696709016e-07, + "advantages_std": 1.6045400023460388, + "clip_ratio": 0.0, + "completion_length": 87.08928756713867, + "epoch": 3.481203007518797, + "grad_norm": 278.0, + "kl": 231.49522580206394, + "learning_rate": 3.2593984962406015e-06, + "loss": 23.1596, + "num_tokens": 3972578.0, + "reward": -1.8033069729804994, + "reward_std": 7.021324205398559, + "rewards/get_chromagram_reward": 0.6099605858325958, + "rewards/get_chromagram_reward_std": 0.11153682023286819, + "rewards/get_intelligibility_reward": -5.99813141822815, + "rewards/get_intelligibility_reward_std": 10.870650100708009, + "rewards/get_target_len_reward": -0.02174974959343672, + "rewards/get_target_len_reward_std": 0.060309494659304616, + "step": 4630 + }, + { + "advantages": -6.929039315650698e-08, + "advantages_std": 1.7106932163238526, + "clip_ratio": 0.0, + "completion_length": 88.45535812377929, + "epoch": 3.488721804511278, + "grad_norm": 388.0, + "kl": 0.50369683355093, + "learning_rate": 3.255639097744361e-06, + "loss": 0.0563, + "num_tokens": 4283808.0, + "reward": -1.3488947361707688, + "reward_std": 7.233368158340454, + "rewards/get_chromagram_reward": 0.6195383369922638, + "rewards/get_chromagram_reward_std": 0.11464283838868142, + "rewards/get_intelligibility_reward": -4.639280533790588, + "rewards/get_intelligibility_reward_std": 11.70107069015503, + "rewards/get_target_len_reward": -0.026941781863570213, + "rewards/get_target_len_reward_std": 0.07510890010744334, + "step": 4640 + }, + { + "advantages": 2.623846254934392e-07, + "advantages_std": 1.5433639526367187, + "clip_ratio": 0.0, + "completion_length": 86.15416793823242, + "epoch": 3.4962406015037595, + "grad_norm": 17.5, + "kl": 0.3447202920913696, + "learning_rate": 3.2518796992481204e-06, + "loss": 0.0393, + "num_tokens": 4587949.0, + "reward": -1.67192000746727, + "reward_std": 6.874111652374268, + "rewards/get_chromagram_reward": 0.6280596375465393, + "rewards/get_chromagram_reward_std": 0.11481318324804306, + "rewards/get_intelligibility_reward": -5.623130202293396, + "rewards/get_intelligibility_reward_std": 10.843562889099122, + "rewards/get_target_len_reward": -0.02068912973627448, + "rewards/get_target_len_reward_std": 0.06118360720574856, + "step": 4650 + }, + { + "advantages": 2.0650526977306072e-07, + "advantages_std": 1.5221888184547425, + "clip_ratio": 0.0, + "completion_length": 83.7500015258789, + "epoch": 3.5037593984962405, + "grad_norm": 6.875, + "kl": 0.3721516489982605, + "learning_rate": 3.24812030075188e-06, + "loss": 0.0423, + "num_tokens": 4886558.0, + "reward": -1.9768889904022218, + "reward_std": 7.324736595153809, + "rewards/get_chromagram_reward": 0.6346181631088257, + "rewards/get_chromagram_reward_std": 0.12539106458425522, + "rewards/get_intelligibility_reward": -6.539831948280335, + "rewards/get_intelligibility_reward_std": 11.161996984481812, + "rewards/get_target_len_reward": -0.025452758464962245, + "rewards/get_target_len_reward_std": 0.06582551747560501, + "step": 4660 + }, + { + "advantages": 6.457169860141221e-07, + "advantages_std": 1.672259545326233, + "clip_ratio": 0.0, + "completion_length": 84.77797775268554, + "epoch": 3.511278195488722, + "grad_norm": 8.625, + "kl": 0.46219568848609927, + "learning_rate": 3.2443609022556393e-06, + "loss": 0.05, + "num_tokens": 5187815.0, + "reward": -1.132588255405426, + "reward_std": 6.148283672332764, + "rewards/get_chromagram_reward": 0.6355343520641327, + "rewards/get_chromagram_reward_std": 0.11606954038143158, + "rewards/get_intelligibility_reward": -4.009374761581421, + "rewards/get_intelligibility_reward_std": 9.920799160003662, + "rewards/get_target_len_reward": -0.02392408112064004, + "rewards/get_target_len_reward_std": 0.07128265760838985, + "step": 4670 + }, + { + "advantages": -6.233652989351413e-08, + "advantages_std": 1.5050897002220154, + "clip_ratio": 0.0, + "completion_length": 85.82321624755859, + "epoch": 3.518796992481203, + "grad_norm": 5.28125, + "kl": 0.32279101610183714, + "learning_rate": 3.2406015037593985e-06, + "loss": 0.037, + "num_tokens": 5491254.0, + "reward": -1.5685214262455702, + "reward_std": 6.899728059768677, + "rewards/get_chromagram_reward": 0.6327625930309295, + "rewards/get_chromagram_reward_std": 0.11731386631727218, + "rewards/get_intelligibility_reward": -5.3153788626194, + "rewards/get_intelligibility_reward_std": 10.879010200500488, + "rewards/get_target_len_reward": -0.022947657201439143, + "rewards/get_target_len_reward_std": 0.06573955528438091, + "step": 4680 + }, + { + "advantages": 5.985300219890632e-08, + "advantages_std": 1.7275002241134643, + "clip_ratio": 0.0, + "completion_length": 87.0125015258789, + "epoch": 3.526315789473684, + "grad_norm": 6.78125, + "kl": 0.2822705447673798, + "learning_rate": 3.236842105263158e-06, + "loss": 0.0312, + "num_tokens": 5798147.0, + "reward": -1.4026454925537108, + "reward_std": 6.647600078582764, + "rewards/get_chromagram_reward": 0.6193685412406922, + "rewards/get_chromagram_reward_std": 0.10237304717302323, + "rewards/get_intelligibility_reward": -4.810842823982239, + "rewards/get_intelligibility_reward_std": 10.632196998596191, + "rewards/get_target_len_reward": -0.01646197042427957, + "rewards/get_target_len_reward_std": 0.062437703087925914, + "step": 4690 + }, + { + "advantages": 3.8569172522429085e-07, + "advantages_std": 1.3912248253822326, + "clip_ratio": 0.0, + "completion_length": 87.73393020629882, + "epoch": 3.5338345864661656, + "grad_norm": 4.40625, + "kl": 0.30773247331380843, + "learning_rate": 3.2330827067669174e-06, + "loss": 0.0341, + "num_tokens": 6106896.0, + "reward": -1.9746524155139924, + "reward_std": 7.20500168800354, + "rewards/get_chromagram_reward": 0.6334104359149932, + "rewards/get_chromagram_reward_std": 0.12139937430620193, + "rewards/get_intelligibility_reward": -6.537299847602844, + "rewards/get_intelligibility_reward_std": 11.083994102478027, + "rewards/get_target_len_reward": -0.02006738306954503, + "rewards/get_target_len_reward_std": 0.05874664410948753, + "step": 4700 + }, + { + "advantages": -2.4798018021243707e-07, + "advantages_std": 1.584977638721466, + "clip_ratio": 0.0, + "completion_length": 85.10952529907226, + "epoch": 3.5413533834586466, + "grad_norm": 5.8125, + "kl": 0.3366622805595398, + "learning_rate": 3.229323308270677e-06, + "loss": 0.0388, + "num_tokens": 6409107.0, + "reward": -1.544246843457222, + "reward_std": 6.5916718482971195, + "rewards/get_chromagram_reward": 0.6293317794799804, + "rewards/get_chromagram_reward_std": 0.12301539331674576, + "rewards/get_intelligibility_reward": -5.237197121977806, + "rewards/get_intelligibility_reward_std": 10.314287614822387, + "rewards/get_target_len_reward": -0.024875110294669867, + "rewards/get_target_len_reward_std": 0.07248621061444283, + "step": 4710 + }, + { + "advantages": -9.549162074407037e-08, + "advantages_std": 1.5304245591163634, + "clip_ratio": 0.0, + "completion_length": 87.42024002075195, + "epoch": 3.548872180451128, + "grad_norm": 7.09375, + "kl": 0.3081626623868942, + "learning_rate": 3.2255639097744362e-06, + "loss": 0.0329, + "num_tokens": 6716518.0, + "reward": -2.0039563357830046, + "reward_std": 7.060414791107178, + "rewards/get_chromagram_reward": 0.6098299086093902, + "rewards/get_chromagram_reward_std": 0.11160081923007965, + "rewards/get_intelligibility_reward": -6.60257580280304, + "rewards/get_intelligibility_reward_std": 10.738980102539063, + "rewards/get_target_len_reward": -0.019122610334306955, + "rewards/get_target_len_reward_std": 0.05139910690486431, + "step": 4720 + }, + { + "advantages": 2.8014182902325046e-07, + "advantages_std": 1.6335660099983216, + "clip_ratio": 0.0, + "completion_length": 85.40833435058593, + "epoch": 3.556390977443609, + "grad_norm": 7.375, + "kl": 0.27823727279901506, + "learning_rate": 3.221804511278196e-06, + "loss": 0.0361, + "num_tokens": 7018209.0, + "reward": -1.754353404045105, + "reward_std": 6.8036487102508545, + "rewards/get_chromagram_reward": 0.6300295114517211, + "rewards/get_chromagram_reward_std": 0.10109626650810241, + "rewards/get_intelligibility_reward": -5.869866466522216, + "rewards/get_intelligibility_reward_std": 10.537944126129151, + "rewards/get_target_len_reward": -0.0232229333370924, + "rewards/get_target_len_reward_std": 0.07139034196734428, + "step": 4730 + }, + { + "advantages": -4.5696896577851476e-07, + "advantages_std": 1.5473376512527466, + "clip_ratio": 0.0, + "completion_length": 85.30416793823242, + "epoch": 3.56390977443609, + "grad_norm": 6.09375, + "kl": 0.4926650047302246, + "learning_rate": 3.218045112781955e-06, + "loss": 0.0546, + "num_tokens": 7320787.0, + "reward": -1.7560069799423217, + "reward_std": 6.866075706481934, + "rewards/get_chromagram_reward": 0.6117322325706482, + "rewards/get_chromagram_reward_std": 0.12210858911275864, + "rewards/get_intelligibility_reward": -5.8582494258880615, + "rewards/get_intelligibility_reward_std": 10.652660131454468, + "rewards/get_target_len_reward": -0.021503351628780365, + "rewards/get_target_len_reward_std": 0.06012616865336895, + "step": 4740 + }, + { + "advantages": 3.178914056434223e-08, + "advantages_std": 1.5654615759849548, + "clip_ratio": 0.0, + "completion_length": 86.27678604125977, + "epoch": 3.571428571428571, + "grad_norm": 92.0, + "kl": 0.32612827718257903, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.0354, + "num_tokens": 7625945.0, + "reward": -1.7317584097385406, + "reward_std": 6.7727696895599365, + "rewards/get_chromagram_reward": 0.6288199841976165, + "rewards/get_chromagram_reward_std": 0.12393123582005501, + "rewards/get_intelligibility_reward": -5.801967740058899, + "rewards/get_intelligibility_reward_std": 10.51182508468628, + "rewards/get_target_len_reward": -0.022127049788832665, + "rewards/get_target_len_reward_std": 0.06373270452022553, + "step": 4750 + }, + { + "advantages": 1.9458433229146976e-07, + "advantages_std": 1.6080638766288757, + "clip_ratio": 0.0, + "completion_length": 84.5077392578125, + "epoch": 3.5789473684210527, + "grad_norm": 4.71875, + "kl": 0.6539877519011498, + "learning_rate": 3.210526315789474e-06, + "loss": 0.071, + "num_tokens": 7925601.0, + "reward": -1.5671055257320403, + "reward_std": 6.523413801193238, + "rewards/get_chromagram_reward": 0.6197600662708282, + "rewards/get_chromagram_reward_std": 0.11611171290278435, + "rewards/get_intelligibility_reward": -5.29871027469635, + "rewards/get_intelligibility_reward_std": 10.249914932250977, + "rewards/get_target_len_reward": -0.022366233076900242, + "rewards/get_target_len_reward_std": 0.06861714329570531, + "step": 4760 + }, + { + "advantages": 2.4686258957018483e-07, + "advantages_std": 1.670961356163025, + "clip_ratio": 0.0, + "completion_length": 88.0184539794922, + "epoch": 3.5864661654135337, + "grad_norm": 9.5, + "kl": 4.304474097490311, + "learning_rate": 3.206766917293233e-06, + "loss": 0.4369, + "num_tokens": 8234782.0, + "reward": -1.4802373588085174, + "reward_std": 6.493495321273803, + "rewards/get_chromagram_reward": 0.6224713683128357, + "rewards/get_chromagram_reward_std": 0.10268469974398613, + "rewards/get_intelligibility_reward": -5.039081716537476, + "rewards/get_intelligibility_reward_std": 10.251253795623779, + "rewards/get_target_len_reward": -0.024101494625210763, + "rewards/get_target_len_reward_std": 0.0734918974339962, + "step": 4770 + }, + { + "advantages": -3.136694559202624e-07, + "advantages_std": 1.4593484938144683, + "clip_ratio": 0.0, + "completion_length": 89.94404907226563, + "epoch": 3.593984962406015, + "grad_norm": 9.3125, + "kl": 0.34921103417873384, + "learning_rate": 3.203007518796993e-06, + "loss": 0.0377, + "num_tokens": 8549105.0, + "reward": -1.6996458053588868, + "reward_std": 6.481504344940186, + "rewards/get_chromagram_reward": 0.6326122224330902, + "rewards/get_chromagram_reward_std": 0.12093279138207436, + "rewards/get_intelligibility_reward": -5.710180354118347, + "rewards/get_intelligibility_reward_std": 10.087297391891479, + "rewards/get_target_len_reward": -0.021368958707898855, + "rewards/get_target_len_reward_std": 0.052948375791311265, + "step": 4780 + }, + { + "advantages": -7.274250350519651e-07, + "advantages_std": 1.6031969785690308, + "clip_ratio": 0.0, + "completion_length": 84.48095321655273, + "epoch": 3.601503759398496, + "grad_norm": 6.59375, + "kl": 0.33725603520870207, + "learning_rate": 3.199248120300752e-06, + "loss": 0.0396, + "num_tokens": 8848920.0, + "reward": -1.6145397573709488, + "reward_std": 6.478304386138916, + "rewards/get_chromagram_reward": 0.6312460958957672, + "rewards/get_chromagram_reward_std": 0.1125837966799736, + "rewards/get_intelligibility_reward": -5.45187383890152, + "rewards/get_intelligibility_reward_std": 10.112965631484986, + "rewards/get_target_len_reward": -0.02299130242317915, + "rewards/get_target_len_reward_std": 0.06794755682349204, + "step": 4790 + }, + { + "advantages": -1.6589960027957318e-07, + "advantages_std": 1.6106690645217896, + "clip_ratio": 0.0, + "completion_length": 87.36369247436524, + "epoch": 3.6090225563909772, + "grad_norm": 6.09375, + "kl": 0.3092425674200058, + "learning_rate": 3.1954887218045117e-06, + "loss": 0.0315, + "num_tokens": 9156658.0, + "reward": -1.5559602946043014, + "reward_std": 6.706591558456421, + "rewards/get_chromagram_reward": 0.6154250383377076, + "rewards/get_chromagram_reward_std": 0.1058032289147377, + "rewards/get_intelligibility_reward": -5.264953482151031, + "rewards/get_intelligibility_reward_std": 10.521115493774413, + "rewards/get_target_len_reward": -0.018352086283266546, + "rewards/get_target_len_reward_std": 0.051684724539518355, + "step": 4800 + }, + { + "advantages": 5.599111375431676e-07, + "advantages_std": 1.5521262526512145, + "clip_ratio": 0.0, + "completion_length": 88.90714492797852, + "epoch": 3.6165413533834587, + "grad_norm": 7.25, + "kl": 0.9247394904494286, + "learning_rate": 3.191729323308271e-06, + "loss": 0.0967, + "num_tokens": 9468894.0, + "reward": -1.198024618625641, + "reward_std": 6.5147710800170895, + "rewards/get_chromagram_reward": 0.6325487613677978, + "rewards/get_chromagram_reward_std": 0.11576045975089073, + "rewards/get_intelligibility_reward": -4.203026843070984, + "rewards/get_intelligibility_reward_std": 10.598825645446777, + "rewards/get_target_len_reward": -0.02359545128419995, + "rewards/get_target_len_reward_std": 0.0703369751572609, + "step": 4810 + }, + { + "advantages": 1.8129747232364933e-07, + "advantages_std": 1.662482452392578, + "clip_ratio": 0.0, + "completion_length": 86.67857284545899, + "epoch": 3.6240601503759398, + "grad_norm": 8.875, + "kl": 0.3507300466299057, + "learning_rate": 3.1879699248120305e-06, + "loss": 0.0386, + "num_tokens": 9774695.0, + "reward": -1.4802094399929047, + "reward_std": 6.700056171417236, + "rewards/get_chromagram_reward": 0.6103806674480439, + "rewards/get_chromagram_reward_std": 0.11778130531311035, + "rewards/get_intelligibility_reward": -5.0308449268341064, + "rewards/get_intelligibility_reward_std": 10.702712249755859, + "rewards/get_target_len_reward": -0.02016364596784115, + "rewards/get_target_len_reward_std": 0.059031769074499606, + "step": 4820 + }, + { + "advantages": -4.316369796697472e-07, + "advantages_std": 1.531548523902893, + "clip_ratio": 0.0, + "completion_length": 87.11250228881836, + "epoch": 3.6315789473684212, + "grad_norm": 6.40625, + "kl": 0.3791714206337929, + "learning_rate": 3.1842105263157898e-06, + "loss": 0.0439, + "num_tokens": 10081945.0, + "reward": -1.2015679739415646, + "reward_std": 6.660047101974487, + "rewards/get_chromagram_reward": 0.6140450894832611, + "rewards/get_chromagram_reward_std": 0.11099514588713646, + "rewards/get_intelligibility_reward": -4.195827615261078, + "rewards/get_intelligibility_reward_std": 10.852205181121827, + "rewards/get_target_len_reward": -0.022921310737729073, + "rewards/get_target_len_reward_std": 0.07505319323390722, + "step": 4830 + }, + { + "advantages": -6.854534433387016e-08, + "advantages_std": 1.5445081114768981, + "clip_ratio": 0.0, + "completion_length": 84.11845397949219, + "epoch": 3.6390977443609023, + "grad_norm": 5.25, + "kl": 0.33220981657505033, + "learning_rate": 3.1804511278195494e-06, + "loss": 0.0388, + "num_tokens": 10381469.0, + "reward": -1.2563096657395363, + "reward_std": 6.40204119682312, + "rewards/get_chromagram_reward": 0.6238720178604126, + "rewards/get_chromagram_reward_std": 0.10978959575295448, + "rewards/get_intelligibility_reward": -4.369728851318359, + "rewards/get_intelligibility_reward_std": 10.306414556503295, + "rewards/get_target_len_reward": -0.023071921616792678, + "rewards/get_target_len_reward_std": 0.06400219611823559, + "step": 4840 + }, + { + "advantages": -5.635122782621238e-07, + "advantages_std": 1.5880194425582885, + "clip_ratio": 0.0, + "completion_length": 87.5827407836914, + "epoch": 3.6466165413533833, + "grad_norm": 8.5, + "kl": 2.8647284686565397, + "learning_rate": 3.1766917293233086e-06, + "loss": 0.2937, + "num_tokens": 10690013.0, + "reward": -1.343219232559204, + "reward_std": 6.797761154174805, + "rewards/get_chromagram_reward": 0.6102810621261596, + "rewards/get_chromagram_reward_std": 0.11765508279204369, + "rewards/get_intelligibility_reward": -4.6079377889633175, + "rewards/get_intelligibility_reward_std": 11.04647216796875, + "rewards/get_target_len_reward": -0.032000647950917484, + "rewards/get_target_len_reward_std": 0.09445926304906607, + "step": 4850 + }, + { + "advantages": 2.635022156027844e-07, + "advantages_std": 1.715597116947174, + "clip_ratio": 0.0, + "completion_length": 86.18869171142578, + "epoch": 3.654135338345865, + "grad_norm": 9.875, + "kl": 0.2896317094564438, + "learning_rate": 3.1729323308270683e-06, + "loss": 0.0364, + "num_tokens": 10995680.0, + "reward": -1.201186391711235, + "reward_std": 6.774267244338989, + "rewards/get_chromagram_reward": 0.6166905164718628, + "rewards/get_chromagram_reward_std": 0.1286042921245098, + "rewards/get_intelligibility_reward": -4.195585256814956, + "rewards/get_intelligibility_reward_std": 11.000476360321045, + "rewards/get_target_len_reward": -0.024664169922471047, + "rewards/get_target_len_reward_std": 0.0782824408262968, + "step": 4860 + }, + { + "advantages": -3.4297507873759513e-07, + "advantages_std": 1.5256575226783753, + "clip_ratio": 0.0, + "completion_length": 84.12143096923828, + "epoch": 3.661654135338346, + "grad_norm": 11.0625, + "kl": 0.31614808589220045, + "learning_rate": 3.1691729323308275e-06, + "loss": 0.0328, + "num_tokens": 11295271.0, + "reward": -1.4852625608444214, + "reward_std": 6.9996030807495115, + "rewards/get_chromagram_reward": 0.6112887680530548, + "rewards/get_chromagram_reward_std": 0.1129858560860157, + "rewards/get_intelligibility_reward": -5.048020737618208, + "rewards/get_intelligibility_reward_std": 11.152630519866943, + "rewards/get_target_len_reward": -0.019055381417274475, + "rewards/get_target_len_reward_std": 0.050889964960515496, + "step": 4870 + }, + { + "advantages": -1.9197663547743105e-07, + "advantages_std": 1.6212410807609559, + "clip_ratio": 0.0, + "completion_length": 87.62381134033203, + "epoch": 3.6691729323308273, + "grad_norm": 6.375, + "kl": 0.3076537221670151, + "learning_rate": 3.1654135338345863e-06, + "loss": 0.0396, + "num_tokens": 11604076.0, + "reward": -1.5547768741846084, + "reward_std": 6.924158525466919, + "rewards/get_chromagram_reward": 0.6123470544815064, + "rewards/get_chromagram_reward_std": 0.11614794582128525, + "rewards/get_intelligibility_reward": -5.257422703504562, + "rewards/get_intelligibility_reward_std": 10.931137180328369, + "rewards/get_target_len_reward": -0.019254606403410434, + "rewards/get_target_len_reward_std": 0.06625755876302719, + "step": 4880 + }, + { + "advantages": 3.3279261231200507e-07, + "advantages_std": 1.529116427898407, + "clip_ratio": 0.0, + "completion_length": 86.49702529907226, + "epoch": 3.6766917293233083, + "grad_norm": 5.78125, + "kl": 0.5446156710386276, + "learning_rate": 3.1616541353383464e-06, + "loss": 0.057, + "num_tokens": 11909235.0, + "reward": -1.4663063704967498, + "reward_std": 6.5054491519927975, + "rewards/get_chromagram_reward": 0.6186487138271332, + "rewards/get_chromagram_reward_std": 0.11071438938379288, + "rewards/get_intelligibility_reward": -4.994838905334473, + "rewards/get_intelligibility_reward_std": 10.342679977416992, + "rewards/get_target_len_reward": -0.022728720400482415, + "rewards/get_target_len_reward_std": 0.06532426942139864, + "step": 4890 + }, + { + "advantages": 1.2839835150657564e-07, + "advantages_std": 1.4912607192993164, + "clip_ratio": 0.0, + "completion_length": 87.38214416503907, + "epoch": 3.6842105263157894, + "grad_norm": 5.75, + "kl": 0.32485940903425214, + "learning_rate": 3.157894736842105e-06, + "loss": 0.0372, + "num_tokens": 12217138.0, + "reward": -1.553096640110016, + "reward_std": 6.913800048828125, + "rewards/get_chromagram_reward": 0.6143791019916535, + "rewards/get_chromagram_reward_std": 0.11344245597720146, + "rewards/get_intelligibility_reward": -5.252873635292053, + "rewards/get_intelligibility_reward_std": 11.019554710388183, + "rewards/get_target_len_reward": -0.020795133616775274, + "rewards/get_target_len_reward_std": 0.06670989170670509, + "step": 4900 + }, + { + "advantages": 9.685751578558666e-09, + "advantages_std": 1.5933101773262024, + "clip_ratio": 0.0, + "completion_length": 86.33809661865234, + "epoch": 3.6917293233082704, + "grad_norm": 8.125, + "kl": 0.3074103772640228, + "learning_rate": 3.1541353383458652e-06, + "loss": 0.0328, + "num_tokens": 12522530.0, + "reward": -1.1914991319179535, + "reward_std": 6.638744497299195, + "rewards/get_chromagram_reward": 0.6417280793190002, + "rewards/get_chromagram_reward_std": 0.11801392138004303, + "rewards/get_intelligibility_reward": -4.192426967620849, + "rewards/get_intelligibility_reward_std": 10.866980838775635, + "rewards/get_target_len_reward": -0.023798331245779993, + "rewards/get_target_len_reward_std": 0.058177833631634715, + "step": 4910 + }, + { + "advantages": -1.671413589976467e-07, + "advantages_std": 1.5453439235687256, + "clip_ratio": 0.0, + "completion_length": 84.20357360839844, + "epoch": 3.699248120300752, + "grad_norm": 6.34375, + "kl": 0.32516286969184877, + "learning_rate": 3.150375939849624e-06, + "loss": 0.0346, + "num_tokens": 12821557.0, + "reward": -1.3945641126483679, + "reward_std": 6.341374254226684, + "rewards/get_chromagram_reward": 0.6144876718521118, + "rewards/get_chromagram_reward_std": 0.10897763669490815, + "rewards/get_intelligibility_reward": -4.780710679292679, + "rewards/get_intelligibility_reward_std": 9.99721794128418, + "rewards/get_target_len_reward": -0.017468852270394564, + "rewards/get_target_len_reward_std": 0.04808750338852406, + "step": 4920 + }, + { + "advantages": 2.66482444999383e-07, + "advantages_std": 1.658053195476532, + "clip_ratio": 0.0, + "completion_length": 90.80357208251954, + "epoch": 3.706766917293233, + "grad_norm": 6.0, + "kl": 0.6095966547727585, + "learning_rate": 3.146616541353384e-06, + "loss": 0.0635, + "num_tokens": 13138313.0, + "reward": -1.650380975008011, + "reward_std": 6.63803768157959, + "rewards/get_chromagram_reward": 0.6273995757102966, + "rewards/get_chromagram_reward_std": 0.11100057512521744, + "rewards/get_intelligibility_reward": -5.559595322608947, + "rewards/get_intelligibility_reward_std": 10.292674160003662, + "rewards/get_target_len_reward": -0.018946948274970055, + "rewards/get_target_len_reward_std": 0.05267423167824745, + "step": 4930 + }, + { + "advantages": -4.592041420892201e-07, + "advantages_std": 1.539730954170227, + "clip_ratio": 0.0, + "completion_length": 88.36726379394531, + "epoch": 3.7142857142857144, + "grad_norm": 11.4375, + "kl": 0.3707980513572693, + "learning_rate": 3.142857142857143e-06, + "loss": 0.042, + "num_tokens": 13449400.0, + "reward": -1.102984681725502, + "reward_std": 6.229949712753296, + "rewards/get_chromagram_reward": 0.6193095803260803, + "rewards/get_chromagram_reward_std": 0.11251397728919983, + "rewards/get_intelligibility_reward": -3.908903980255127, + "rewards/get_intelligibility_reward_std": 10.1692476272583, + "rewards/get_target_len_reward": -0.01935954224318266, + "rewards/get_target_len_reward_std": 0.05940852351486683, + "step": 4940 + }, + { + "advantages": 3.3428273980007364e-07, + "advantages_std": 1.5412377834320068, + "clip_ratio": 0.0, + "completion_length": 85.96607284545898, + "epoch": 3.7218045112781954, + "grad_norm": 5.4375, + "kl": 0.36561394929885865, + "learning_rate": 3.139097744360903e-06, + "loss": 0.04, + "num_tokens": 13753869.0, + "reward": -1.2172423183918, + "reward_std": 6.248193788528442, + "rewards/get_chromagram_reward": 0.6351997315883636, + "rewards/get_chromagram_reward_std": 0.1150067277252674, + "rewards/get_intelligibility_reward": -4.263802683353424, + "rewards/get_intelligibility_reward_std": 10.08340663909912, + "rewards/get_target_len_reward": -0.02312365211546421, + "rewards/get_target_len_reward_std": 0.06662276312708855, + "step": 4950 + }, + { + "advantages": -3.019968872308709e-07, + "advantages_std": 1.5049192070961, + "clip_ratio": 0.0, + "completion_length": 85.79285888671875, + "epoch": 3.7293233082706765, + "grad_norm": 8.5, + "kl": 0.34319745302200316, + "learning_rate": 3.1353383458646618e-06, + "loss": 0.0405, + "num_tokens": 14057869.0, + "reward": -1.6160476624965667, + "reward_std": 6.543230485916138, + "rewards/get_chromagram_reward": 0.6266816258430481, + "rewards/get_chromagram_reward_std": 0.11776885390281677, + "rewards/get_intelligibility_reward": -5.453230166435242, + "rewards/get_intelligibility_reward_std": 10.165988731384278, + "rewards/get_target_len_reward": -0.021594143752008677, + "rewards/get_target_len_reward_std": 0.06191838830709458, + "step": 4960 + }, + { + "advantages": -2.3196142358017368e-07, + "advantages_std": 1.4668985962867738, + "clip_ratio": 0.0, + "completion_length": 86.55059661865235, + "epoch": 3.736842105263158, + "grad_norm": 7.125, + "kl": 0.32584773898124697, + "learning_rate": 3.131578947368421e-06, + "loss": 0.0348, + "num_tokens": 14363201.0, + "reward": -1.4537909626960754, + "reward_std": 6.800926256179809, + "rewards/get_chromagram_reward": 0.6184272468090057, + "rewards/get_chromagram_reward_std": 0.11034496873617172, + "rewards/get_intelligibility_reward": -4.961474227905273, + "rewards/get_intelligibility_reward_std": 10.864050674438477, + "rewards/get_target_len_reward": -0.018325691297650337, + "rewards/get_target_len_reward_std": 0.05187810454517603, + "step": 4970 + }, + { + "advantages": -4.0121377935520287e-07, + "advantages_std": 1.5062777817249298, + "clip_ratio": 0.0, + "completion_length": 81.4023826599121, + "epoch": 3.744360902255639, + "grad_norm": 44.0, + "kl": 0.4334402531385422, + "learning_rate": 3.1278195488721806e-06, + "loss": 0.0473, + "num_tokens": 14654849.0, + "reward": -1.7004128456115724, + "reward_std": 7.087176370620727, + "rewards/get_chromagram_reward": 0.6041354537010193, + "rewards/get_chromagram_reward_std": 0.1227414608001709, + "rewards/get_intelligibility_reward": -5.686217975616455, + "rewards/get_intelligibility_reward_std": 11.177903652191162, + "rewards/get_target_len_reward": -0.019155793637037278, + "rewards/get_target_len_reward_std": 0.054726789519190785, + "step": 4980 + }, + { + "advantages": -3.9388738315437877e-07, + "advantages_std": 1.6339298367500306, + "clip_ratio": 0.0, + "completion_length": 92.73988265991211, + "epoch": 3.7518796992481205, + "grad_norm": 9.25, + "kl": 0.3501076936721802, + "learning_rate": 3.12406015037594e-06, + "loss": 0.038, + "num_tokens": 14977890.0, + "reward": -1.289211356639862, + "reward_std": 6.95147590637207, + "rewards/get_chromagram_reward": 0.6183295786380768, + "rewards/get_chromagram_reward_std": 0.11973841786384583, + "rewards/get_intelligibility_reward": -4.466941666603089, + "rewards/get_intelligibility_reward_std": 11.286186218261719, + "rewards/get_target_len_reward": -0.01902186619117856, + "rewards/get_target_len_reward_std": 0.04339134152978659, + "step": 4990 + }, + { + "advantages": -2.359350901315338e-08, + "advantages_std": 1.6250181078910828, + "clip_ratio": 0.0, + "completion_length": 85.16071701049805, + "epoch": 3.7593984962406015, + "grad_norm": 5.125, + "kl": 0.3183282628655434, + "learning_rate": 3.1203007518796995e-06, + "loss": 0.0353, + "num_tokens": 15279955.0, + "reward": -1.4234922677278519, + "reward_std": 6.963759803771973, + "rewards/get_chromagram_reward": 0.6406114637851715, + "rewards/get_chromagram_reward_std": 0.11178898885846138, + "rewards/get_intelligibility_reward": -4.882576875388622, + "rewards/get_intelligibility_reward_std": 11.115171527862548, + "rewards/get_target_len_reward": -0.028511168900877237, + "rewards/get_target_len_reward_std": 0.08178133703768253, + "step": 5000 + }, + { + "advantages": -1.344829901661626e-07, + "advantages_std": 1.6288373589515686, + "clip_ratio": 0.0, + "completion_length": 85.88869171142578, + "epoch": 3.7669172932330826, + "grad_norm": 5.96875, + "kl": 0.3554558753967285, + "learning_rate": 3.1165413533834587e-06, + "loss": 0.0356, + "num_tokens": 303525.0, + "reward": -1.5104371786117554, + "reward_std": 6.379262018203735, + "rewards/get_chromagram_reward": 0.6029898881912231, + "rewards/get_chromagram_reward_std": 0.10567670539021493, + "rewards/get_intelligibility_reward": -5.115782928466797, + "rewards/get_intelligibility_reward_std": 10.068166732788086, + "rewards/get_target_len_reward": -0.018518290482461452, + "rewards/get_target_len_reward_std": 0.050624676048755646, + "step": 5010 + }, + { + "advantages": -9.26852234783837e-07, + "advantages_std": 1.5683493494987488, + "clip_ratio": 0.0, + "completion_length": 89.2023826599121, + "epoch": 3.774436090225564, + "grad_norm": 240.0, + "kl": 0.32017101496458056, + "learning_rate": 3.1127819548872184e-06, + "loss": 0.0361, + "num_tokens": 617139.0, + "reward": -1.1960038989782333, + "reward_std": 6.3604504585266115, + "rewards/get_chromagram_reward": 0.6310937643051148, + "rewards/get_chromagram_reward_std": 0.11309906244277954, + "rewards/get_intelligibility_reward": -4.197369801998138, + "rewards/get_intelligibility_reward_std": 10.345066165924072, + "rewards/get_target_len_reward": -0.021735391952097415, + "rewards/get_target_len_reward_std": 0.061658013984560965, + "step": 5020 + }, + { + "advantages": 6.482005375119115e-07, + "advantages_std": 1.5073270559310914, + "clip_ratio": 0.0, + "completion_length": 84.32083663940429, + "epoch": 3.781954887218045, + "grad_norm": 6.375, + "kl": 0.3702144831418991, + "learning_rate": 3.1090225563909776e-06, + "loss": 0.0369, + "num_tokens": 917088.0, + "reward": -1.6622222304344176, + "reward_std": 6.832606649398803, + "rewards/get_chromagram_reward": 0.6209613680839539, + "rewards/get_chromagram_reward_std": 0.11562097668647767, + "rewards/get_intelligibility_reward": -5.585749959945678, + "rewards/get_intelligibility_reward_std": 10.770749187469482, + "rewards/get_target_len_reward": -0.02187794419005513, + "rewards/get_target_len_reward_std": 0.05569152720272541, + "step": 5030 + }, + { + "advantages": 5.339583211139143e-08, + "advantages_std": 1.5658186316490172, + "clip_ratio": 0.0, + "completion_length": 87.19940643310547, + "epoch": 3.7894736842105265, + "grad_norm": 5.84375, + "kl": 0.29390337616205214, + "learning_rate": 3.1052631578947372e-06, + "loss": 0.0324, + "num_tokens": 1224793.0, + "reward": -1.600793306529522, + "reward_std": 7.168339109420776, + "rewards/get_chromagram_reward": 0.6240476608276367, + "rewards/get_chromagram_reward_std": 0.12075399681925773, + "rewards/get_intelligibility_reward": -5.402066552639008, + "rewards/get_intelligibility_reward_std": 11.336180496215821, + "rewards/get_target_len_reward": -0.02436084356158972, + "rewards/get_target_len_reward_std": 0.07069507241249084, + "step": 5040 + }, + { + "advantages": 3.2360356787553003e-07, + "advantages_std": 1.6109248757362367, + "clip_ratio": 0.0, + "completion_length": 86.20297698974609, + "epoch": 3.7969924812030076, + "grad_norm": 23.5, + "kl": 0.4897716358304024, + "learning_rate": 3.1015037593984964e-06, + "loss": 0.0565, + "num_tokens": 1530154.0, + "reward": -1.476647686958313, + "reward_std": 6.300374603271484, + "rewards/get_chromagram_reward": 0.6303463339805603, + "rewards/get_chromagram_reward_std": 0.11567277759313584, + "rewards/get_intelligibility_reward": -5.035875868797302, + "rewards/get_intelligibility_reward_std": 9.897969913482665, + "rewards/get_target_len_reward": -0.024413358047604562, + "rewards/get_target_len_reward_std": 0.0697399366647005, + "step": 5050 + }, + { + "advantages": 1.1672577286958585e-07, + "advantages_std": 1.5832074165344239, + "clip_ratio": 0.0, + "completion_length": 86.55000228881836, + "epoch": 3.8045112781954886, + "grad_norm": 9.3125, + "kl": 0.38453815281391146, + "learning_rate": 3.097744360902256e-06, + "loss": 0.0451, + "num_tokens": 1835452.0, + "reward": -1.5706634759902953, + "reward_std": 6.566565227508545, + "rewards/get_chromagram_reward": 0.6134556949138641, + "rewards/get_chromagram_reward_std": 0.11115473136305809, + "rewards/get_intelligibility_reward": -5.305679714679718, + "rewards/get_intelligibility_reward_std": 10.304097652435303, + "rewards/get_target_len_reward": -0.019766069017350674, + "rewards/get_target_len_reward_std": 0.06396161615848542, + "step": 5060 + }, + { + "advantages": 2.0898878716479885e-07, + "advantages_std": 1.5226559519767762, + "clip_ratio": 0.0, + "completion_length": 88.90893096923828, + "epoch": 3.8120300751879697, + "grad_norm": 26.5, + "kl": 0.4461306095123291, + "learning_rate": 3.0939849624060153e-06, + "loss": 0.0489, + "num_tokens": 2147803.0, + "reward": -1.4417099684476853, + "reward_std": 6.737149572372436, + "rewards/get_chromagram_reward": 0.6207392811775208, + "rewards/get_chromagram_reward_std": 0.10657211765646935, + "rewards/get_intelligibility_reward": -4.927727246284485, + "rewards/get_intelligibility_reward_std": 10.79146318435669, + "rewards/get_target_len_reward": -0.018141804076731206, + "rewards/get_target_len_reward_std": 0.0579329727217555, + "step": 5070 + }, + { + "advantages": -4.967044731074566e-09, + "advantages_std": 1.6526432633399963, + "clip_ratio": 0.0, + "completion_length": 88.12440567016601, + "epoch": 3.819548872180451, + "grad_norm": 29.375, + "kl": 0.3443562790751457, + "learning_rate": 3.0902255639097745e-06, + "loss": 0.0343, + "num_tokens": 2457928.0, + "reward": -1.4987624168395997, + "reward_std": 6.716041421890258, + "rewards/get_chromagram_reward": 0.6303575217723847, + "rewards/get_chromagram_reward_std": 0.12052299976348876, + "rewards/get_intelligibility_reward": -5.10470449924469, + "rewards/get_intelligibility_reward_std": 10.704877853393555, + "rewards/get_target_len_reward": -0.02193996049463749, + "rewards/get_target_len_reward_std": 0.049775147996842864, + "step": 5080 + }, + { + "advantages": -3.377596939913019e-08, + "advantages_std": 1.5974238514900208, + "clip_ratio": 0.0, + "completion_length": 87.68869171142578, + "epoch": 3.827067669172932, + "grad_norm": 6.90625, + "kl": 0.3182404175400734, + "learning_rate": 3.086466165413534e-06, + "loss": 0.0336, + "num_tokens": 2766111.0, + "reward": -1.2910590320825577, + "reward_std": 6.435832595825195, + "rewards/get_chromagram_reward": 0.6295618176460266, + "rewards/get_chromagram_reward_std": 0.1058080993592739, + "rewards/get_intelligibility_reward": -4.485034775733948, + "rewards/get_intelligibility_reward_std": 10.362379455566407, + "rewards/get_target_len_reward": -0.017704028356820344, + "rewards/get_target_len_reward_std": 0.052550424635410306, + "step": 5090 + }, + { + "advantages": 4.005928971650974e-07, + "advantages_std": 1.7196611046791077, + "clip_ratio": 0.0, + "completion_length": 87.6732162475586, + "epoch": 3.8345864661654137, + "grad_norm": 6.6875, + "kl": 0.3247728988528252, + "learning_rate": 3.0827067669172934e-06, + "loss": 0.042, + "num_tokens": 3074294.0, + "reward": -1.3364479541778564, + "reward_std": 6.6891755104064945, + "rewards/get_chromagram_reward": 0.6277061879634858, + "rewards/get_chromagram_reward_std": 0.11151268780231476, + "rewards/get_intelligibility_reward": -4.606927335262299, + "rewards/get_intelligibility_reward_std": 10.708397817611694, + "rewards/get_target_len_reward": -0.03012237846851349, + "rewards/get_target_len_reward_std": 0.09449879247695207, + "step": 5100 + }, + { + "advantages": -5.45382511063508e-07, + "advantages_std": 1.5904954671859741, + "clip_ratio": 0.0, + "completion_length": 87.38333358764649, + "epoch": 3.8421052631578947, + "grad_norm": 103.0, + "kl": 0.38762595504522324, + "learning_rate": 3.078947368421053e-06, + "loss": 0.0409, + "num_tokens": 3382576.0, + "reward": -1.5673535346984864, + "reward_std": 7.143572378158569, + "rewards/get_chromagram_reward": 0.6240078985691071, + "rewards/get_chromagram_reward_std": 0.12458935901522636, + "rewards/get_intelligibility_reward": -5.301816511154175, + "rewards/get_intelligibility_reward_std": 11.481878900527954, + "rewards/get_target_len_reward": -0.024251798167824745, + "rewards/get_target_len_reward_std": 0.05677758939564228, + "step": 5110 + }, + { + "advantages": -7.698935178268585e-09, + "advantages_std": 1.6819909691810608, + "clip_ratio": 0.0, + "completion_length": 88.9523826599121, + "epoch": 3.8496240601503757, + "grad_norm": 7.65625, + "kl": 0.3676734402775764, + "learning_rate": 3.0751879699248123e-06, + "loss": 0.0389, + "num_tokens": 3695398.0, + "reward": -1.2386444240808487, + "reward_std": 6.5803868770599365, + "rewards/get_chromagram_reward": 0.6292356431484223, + "rewards/get_chromagram_reward_std": 0.10431931540369987, + "rewards/get_intelligibility_reward": -4.326017516851425, + "rewards/get_intelligibility_reward_std": 10.606090307235718, + "rewards/get_target_len_reward": -0.019151047244668006, + "rewards/get_target_len_reward_std": 0.05265425220131874, + "step": 5120 + }, + { + "advantages": 2.2624931172998686e-07, + "advantages_std": 1.6426711320877074, + "clip_ratio": 0.0, + "completion_length": 86.9380973815918, + "epoch": 3.857142857142857, + "grad_norm": 9.8125, + "kl": 0.4054348558187485, + "learning_rate": 3.071428571428572e-06, + "loss": 0.0429, + "num_tokens": 4001074.0, + "reward": -1.4268037647008895, + "reward_std": 6.356537961959839, + "rewards/get_chromagram_reward": 0.6203414976596833, + "rewards/get_chromagram_reward_std": 0.1233817383646965, + "rewards/get_intelligibility_reward": -4.875811457633972, + "rewards/get_intelligibility_reward_std": 10.090334129333495, + "rewards/get_target_len_reward": -0.024940951261669397, + "rewards/get_target_len_reward_std": 0.07505465373396873, + "step": 5130 + }, + { + "advantages": -2.1855025522654614e-08, + "advantages_std": 1.6684409856796265, + "clip_ratio": 0.0, + "completion_length": 87.81845397949219, + "epoch": 3.8646616541353382, + "grad_norm": 49.5, + "kl": 2.80745629966259, + "learning_rate": 3.067669172932331e-06, + "loss": 0.2834, + "num_tokens": 4310100.0, + "reward": -1.5709318161010741, + "reward_std": 6.355284643173218, + "rewards/get_chromagram_reward": 0.6249613583087921, + "rewards/get_chromagram_reward_std": 0.11457760408520698, + "rewards/get_intelligibility_reward": -5.31957859992981, + "rewards/get_intelligibility_reward_std": 9.948035335540771, + "rewards/get_target_len_reward": -0.018177997972816228, + "rewards/get_target_len_reward_std": 0.04518711529672146, + "step": 5140 + }, + { + "advantages": -4.976987952431955e-07, + "advantages_std": 1.5554571747779846, + "clip_ratio": 0.0, + "completion_length": 88.6583351135254, + "epoch": 3.8721804511278197, + "grad_norm": 13.0625, + "kl": 0.36514002084732056, + "learning_rate": 3.0639097744360908e-06, + "loss": 0.041, + "num_tokens": 4621810.0, + "reward": -1.5353900849819184, + "reward_std": 7.019238233566284, + "rewards/get_chromagram_reward": 0.6143237709999084, + "rewards/get_chromagram_reward_std": 0.11802843660116195, + "rewards/get_intelligibility_reward": -5.199806427955627, + "rewards/get_intelligibility_reward_std": 11.24576940536499, + "rewards/get_target_len_reward": -0.020687189139425754, + "rewards/get_target_len_reward_std": 0.06613438948988914, + "step": 5150 + }, + { + "advantages": -1.0952354614346405e-07, + "advantages_std": 1.5083892703056336, + "clip_ratio": 0.0, + "completion_length": 84.21547775268554, + "epoch": 3.8796992481203008, + "grad_norm": 6.46875, + "kl": 0.37466873079538343, + "learning_rate": 3.06015037593985e-06, + "loss": 0.0473, + "num_tokens": 4920387.0, + "reward": -1.8744422435760497, + "reward_std": 6.897575569152832, + "rewards/get_chromagram_reward": 0.6116405963897705, + "rewards/get_chromagram_reward_std": 0.11380776911973953, + "rewards/get_intelligibility_reward": -6.2126370668411255, + "rewards/get_intelligibility_reward_std": 10.630679559707641, + "rewards/get_target_len_reward": -0.022330059483647347, + "rewards/get_target_len_reward_std": 0.06991768572479487, + "step": 5160 + }, + { + "advantages": 5.540748404087026e-07, + "advantages_std": 1.548570156097412, + "clip_ratio": 0.0, + "completion_length": 89.57083587646484, + "epoch": 3.887218045112782, + "grad_norm": 5.03125, + "kl": 0.37331474870443343, + "learning_rate": 3.0563909774436092e-06, + "loss": 0.0406, + "num_tokens": 5234446.0, + "reward": -1.1399612367153167, + "reward_std": 6.340262699127197, + "rewards/get_chromagram_reward": 0.6268186211585999, + "rewards/get_chromagram_reward_std": 0.11353924125432968, + "rewards/get_intelligibility_reward": -4.029066967964172, + "rewards/get_intelligibility_reward_std": 10.31572060585022, + "rewards/get_target_len_reward": -0.017635060101747514, + "rewards/get_target_len_reward_std": 0.04803536366671324, + "step": 5170 + }, + { + "advantages": 1.4801821208720866e-07, + "advantages_std": 1.6175037503242493, + "clip_ratio": 0.0, + "completion_length": 86.76726379394532, + "epoch": 3.8947368421052633, + "grad_norm": 5.9375, + "kl": 0.3426645964384079, + "learning_rate": 3.052631578947369e-06, + "loss": 0.034, + "num_tokens": 5542037.0, + "reward": -0.9634460397064686, + "reward_std": 6.810406589508057, + "rewards/get_chromagram_reward": 0.6277351975440979, + "rewards/get_chromagram_reward_std": 0.1226750746369362, + "rewards/get_intelligibility_reward": -3.495256319642067, + "rewards/get_intelligibility_reward_std": 11.23453130722046, + "rewards/get_target_len_reward": -0.022816949151456357, + "rewards/get_target_len_reward_std": 0.059058988466858864, + "step": 5180 + }, + { + "advantages": 6.829697838384163e-08, + "advantages_std": 1.60453599691391, + "clip_ratio": 0.0, + "completion_length": 85.44404907226563, + "epoch": 3.9022556390977443, + "grad_norm": 5.6875, + "kl": 0.29529436230659484, + "learning_rate": 3.048872180451128e-06, + "loss": 0.0377, + "num_tokens": 5844093.0, + "reward": -1.7944436550140381, + "reward_std": 7.4176818370819095, + "rewards/get_chromagram_reward": 0.6188875913619996, + "rewards/get_chromagram_reward_std": 0.11493304148316383, + "rewards/get_intelligibility_reward": -5.977695441246032, + "rewards/get_intelligibility_reward_std": 11.72701063156128, + "rewards/get_target_len_reward": -0.024522352125495674, + "rewards/get_target_len_reward_std": 0.07692326549440623, + "step": 5190 + }, + { + "advantages": 1.5000502742168465e-07, + "advantages_std": 1.6334968209266663, + "clip_ratio": 0.0, + "completion_length": 87.25774002075195, + "epoch": 3.909774436090226, + "grad_norm": 7.96875, + "kl": 0.317596735060215, + "learning_rate": 3.0451127819548877e-06, + "loss": 0.0316, + "num_tokens": 6152382.0, + "reward": -1.4345587491989136, + "reward_std": 7.033844709396362, + "rewards/get_chromagram_reward": 0.6279944956302643, + "rewards/get_chromagram_reward_std": 0.11789287552237511, + "rewards/get_intelligibility_reward": -4.9106168985366825, + "rewards/get_intelligibility_reward_std": 11.35363712310791, + "rewards/get_target_len_reward": -0.021053369250148536, + "rewards/get_target_len_reward_std": 0.05207511857151985, + "step": 5200 + }, + { + "advantages": 2.2264819250494837e-07, + "advantages_std": 1.5829517245292664, + "clip_ratio": 0.0, + "completion_length": 84.24107284545899, + "epoch": 3.917293233082707, + "grad_norm": 10.75, + "kl": 2.4750936955213545, + "learning_rate": 3.0413533834586465e-06, + "loss": 0.2519, + "num_tokens": 6452238.0, + "reward": -1.3005388617515563, + "reward_std": 6.142632246017456, + "rewards/get_chromagram_reward": 0.6174070298671722, + "rewards/get_chromagram_reward_std": 0.115874382853508, + "rewards/get_intelligibility_reward": -4.4966700077056885, + "rewards/get_intelligibility_reward_std": 9.827916622161865, + "rewards/get_target_len_reward": -0.022353346459567547, + "rewards/get_target_len_reward_std": 0.06505865342915058, + "step": 5210 + }, + { + "advantages": -1.3584891953577482e-07, + "advantages_std": 1.6031673908233643, + "clip_ratio": 0.0, + "completion_length": 85.95059661865234, + "epoch": 3.924812030075188, + "grad_norm": 6.28125, + "kl": 0.3633052855730057, + "learning_rate": 3.0375939849624066e-06, + "loss": 0.0391, + "num_tokens": 6756142.0, + "reward": -1.3448066473007203, + "reward_std": 6.417456722259521, + "rewards/get_chromagram_reward": 0.6151858687400817, + "rewards/get_chromagram_reward_std": 0.1155214361846447, + "rewards/get_intelligibility_reward": -4.626984453201294, + "rewards/get_intelligibility_reward_std": 10.28032922744751, + "rewards/get_target_len_reward": -0.022621163725852968, + "rewards/get_target_len_reward_std": 0.0578670272603631, + "step": 5220 + }, + { + "advantages": -1.2144447225637123e-07, + "advantages_std": 1.6513906240463256, + "clip_ratio": 0.0, + "completion_length": 90.49821624755859, + "epoch": 3.932330827067669, + "grad_norm": 7.625, + "kl": 0.3150393143296242, + "learning_rate": 3.0338345864661654e-06, + "loss": 0.0356, + "num_tokens": 7073060.0, + "reward": -1.3395723063498735, + "reward_std": 6.531378984451294, + "rewards/get_chromagram_reward": 0.6276501834392547, + "rewards/get_chromagram_reward_std": 0.11458624824881554, + "rewards/get_intelligibility_reward": -4.622202610969543, + "rewards/get_intelligibility_reward_std": 10.413419675827026, + "rewards/get_target_len_reward": -0.02416415549814701, + "rewards/get_target_len_reward_std": 0.06485766638070345, + "step": 5230 + }, + { + "advantages": -5.2154058760578435e-08, + "advantages_std": 1.5587757110595704, + "clip_ratio": 0.0, + "completion_length": 90.5952392578125, + "epoch": 3.9398496240601504, + "grad_norm": 6.6875, + "kl": 0.3194952175021172, + "learning_rate": 3.0300751879699255e-06, + "loss": 0.0384, + "num_tokens": 7389197.0, + "reward": -1.1797830283641815, + "reward_std": 6.843469095230103, + "rewards/get_chromagram_reward": 0.6258788108825684, + "rewards/get_chromagram_reward_std": 0.11164259016513825, + "rewards/get_intelligibility_reward": -4.134021139144897, + "rewards/get_intelligibility_reward_std": 11.231058502197266, + "rewards/get_target_len_reward": -0.031206544488668442, + "rewards/get_target_len_reward_std": 0.08871262595057487, + "step": 5240 + }, + { + "advantages": -1.1151036183321139e-07, + "advantages_std": 1.6567686200141907, + "clip_ratio": 0.0, + "completion_length": 89.1255973815918, + "epoch": 3.9473684210526314, + "grad_norm": 34.0, + "kl": 0.3676748931407928, + "learning_rate": 3.0263157894736843e-06, + "loss": 0.0399, + "num_tokens": 7702461.0, + "reward": -1.3696911913342773, + "reward_std": 6.905460023880005, + "rewards/get_chromagram_reward": 0.6132007837295532, + "rewards/get_chromagram_reward_std": 0.11636709868907928, + "rewards/get_intelligibility_reward": -4.701757583022117, + "rewards/get_intelligibility_reward_std": 11.0114595413208, + "rewards/get_target_len_reward": -0.020516569539904595, + "rewards/get_target_len_reward_std": 0.05645679645240307, + "step": 5250 + }, + { + "advantages": 1.514020262050053e-07, + "advantages_std": 1.469593095779419, + "clip_ratio": 0.0, + "completion_length": 88.95714492797852, + "epoch": 3.954887218045113, + "grad_norm": 10.4375, + "kl": 0.32012175023555756, + "learning_rate": 3.0225563909774443e-06, + "loss": 0.0339, + "num_tokens": 8014001.0, + "reward": -1.283732157945633, + "reward_std": 6.034783267974854, + "rewards/get_chromagram_reward": 0.6218043804168701, + "rewards/get_chromagram_reward_std": 0.11377415880560875, + "rewards/get_intelligibility_reward": -4.453184795379639, + "rewards/get_intelligibility_reward_std": 9.587284135818482, + "rewards/get_target_len_reward": -0.01981568681076169, + "rewards/get_target_len_reward_std": 0.05037029702216387, + "step": 5260 + }, + { + "advantages": -5.8636070718876e-07, + "advantages_std": 1.4802544116973877, + "clip_ratio": 0.0, + "completion_length": 88.19524002075195, + "epoch": 3.962406015037594, + "grad_norm": 8.125, + "kl": 0.31927538812160494, + "learning_rate": 3.018796992481203e-06, + "loss": 0.0385, + "num_tokens": 8324684.0, + "reward": -1.6413162469863891, + "reward_std": 7.5248010635375975, + "rewards/get_chromagram_reward": 0.6196599304676056, + "rewards/get_chromagram_reward_std": 0.09920540302991868, + "rewards/get_intelligibility_reward": -5.523752021789551, + "rewards/get_intelligibility_reward_std": 12.014527988433837, + "rewards/get_target_len_reward": -0.01985640712082386, + "rewards/get_target_len_reward_std": 0.07063727751374245, + "step": 5270 + }, + { + "advantages": -4.023314090773056e-08, + "advantages_std": 1.621880567073822, + "clip_ratio": 0.0, + "completion_length": 83.53392944335937, + "epoch": 3.969924812030075, + "grad_norm": 10.8125, + "kl": 0.3786572337150574, + "learning_rate": 3.0150375939849623e-06, + "loss": 0.0384, + "num_tokens": 8623832.0, + "reward": -1.1323532313108444, + "reward_std": 6.37744836807251, + "rewards/get_chromagram_reward": 0.6239664614200592, + "rewards/get_chromagram_reward_std": 0.11369733661413192, + "rewards/get_intelligibility_reward": -4.0000452876091, + "rewards/get_intelligibility_reward_std": 10.421091842651368, + "rewards/get_target_len_reward": -0.02098073624074459, + "rewards/get_target_len_reward_std": 0.050571346655488014, + "step": 5280 + }, + { + "advantages": 6.544092250315714e-08, + "advantages_std": 1.570748794078827, + "clip_ratio": 0.0, + "completion_length": 90.32381134033203, + "epoch": 3.9774436090225564, + "grad_norm": 29.25, + "kl": 0.30713569074869157, + "learning_rate": 3.011278195488722e-06, + "loss": 0.0364, + "num_tokens": 8940216.0, + "reward": -1.531138226389885, + "reward_std": 6.566509771347046, + "rewards/get_chromagram_reward": 0.6204177737236023, + "rewards/get_chromagram_reward_std": 0.11796076446771622, + "rewards/get_intelligibility_reward": -5.190147817134857, + "rewards/get_intelligibility_reward_std": 10.306473445892333, + "rewards/get_target_len_reward": -0.023684403765946625, + "rewards/get_target_len_reward_std": 0.06714439634233713, + "step": 5290 + }, + { + "advantages": 5.191813119864718e-07, + "advantages_std": 1.5766889333724976, + "clip_ratio": 0.0, + "completion_length": 83.86428680419922, + "epoch": 3.9849624060150375, + "grad_norm": 6.75, + "kl": 0.34546895027160646, + "learning_rate": 3.007518796992481e-06, + "loss": 0.0395, + "num_tokens": 9238461.0, + "reward": -1.8154918551445007, + "reward_std": 6.851765584945679, + "rewards/get_chromagram_reward": 0.6145689308643341, + "rewards/get_chromagram_reward_std": 0.11415692195296287, + "rewards/get_intelligibility_reward": -6.03966007232666, + "rewards/get_intelligibility_reward_std": 10.629767608642577, + "rewards/get_target_len_reward": -0.02138425037264824, + "rewards/get_target_len_reward_std": 0.07321446239948273, + "step": 5300 + }, + { + "advantages": -7.209678862807323e-07, + "advantages_std": 1.627264392375946, + "clip_ratio": 0.0, + "completion_length": 90.0910743713379, + "epoch": 3.992481203007519, + "grad_norm": 5.25, + "kl": 0.3158954918384552, + "learning_rate": 3.003759398496241e-06, + "loss": 0.036, + "num_tokens": 9553891.0, + "reward": -0.9919865518808365, + "reward_std": 6.70737476348877, + "rewards/get_chromagram_reward": 0.6357428431510925, + "rewards/get_chromagram_reward_std": 0.11332304775714874, + "rewards/get_intelligibility_reward": -3.587333357334137, + "rewards/get_intelligibility_reward_std": 11.042314529418945, + "rewards/get_target_len_reward": -0.02436893656849861, + "rewards/get_target_len_reward_std": 0.07317685410380363, + "step": 5310 + }, + { + "advantages": 2.3655593013316433e-07, + "advantages_std": 1.5000331044197082, + "clip_ratio": 0.0, + "completion_length": 85.18512115478515, + "epoch": 4.000751879699248, + "grad_norm": 10.0, + "kl": 0.31935170739889146, + "learning_rate": 3e-06, + "loss": 0.0323, + "num_tokens": 9856035.0, + "reward": -1.2852010980248452, + "reward_std": 6.327541875839233, + "rewards/get_chromagram_reward": 0.613725996017456, + "rewards/get_chromagram_reward_std": 0.11773486211895942, + "rewards/get_intelligibility_reward": -4.447852373123169, + "rewards/get_intelligibility_reward_std": 10.121273040771484, + "rewards/get_target_len_reward": -0.02147660292685032, + "rewards/get_target_len_reward_std": 0.05728430114686489, + "step": 5320 + }, + { + "advantages": -1.1920929736675135e-07, + "advantages_std": 1.4331650376319884, + "clip_ratio": 0.0, + "completion_length": 89.57559738159179, + "epoch": 4.00827067669173, + "grad_norm": 9.3125, + "kl": 0.3469295933842659, + "learning_rate": 2.9962406015037597e-06, + "loss": 0.0379, + "num_tokens": 10170468.0, + "reward": -0.9236498028039932, + "reward_std": 6.302606296539307, + "rewards/get_chromagram_reward": 0.6118164241313935, + "rewards/get_chromagram_reward_std": 0.1398451879620552, + "rewards/get_intelligibility_reward": -3.3608950555324553, + "rewards/get_intelligibility_reward_std": 10.378479623794556, + "rewards/get_target_len_reward": -0.021870562620460988, + "rewards/get_target_len_reward_std": 0.05407197326421738, + "step": 5330 + }, + { + "advantages": 2.4115046244332915e-07, + "advantages_std": 1.4898594737052917, + "clip_ratio": 0.0, + "completion_length": 88.45476379394532, + "epoch": 4.015789473684211, + "grad_norm": 6.5, + "kl": 0.3065282255411148, + "learning_rate": 2.992481203007519e-06, + "loss": 0.0307, + "num_tokens": 10481673.0, + "reward": -1.1274623550474643, + "reward_std": 6.374505424499512, + "rewards/get_chromagram_reward": 0.6194785416126252, + "rewards/get_chromagram_reward_std": 0.10782580673694611, + "rewards/get_intelligibility_reward": -3.984870785474777, + "rewards/get_intelligibility_reward_std": 10.420499992370605, + "rewards/get_target_len_reward": -0.016994608193635942, + "rewards/get_target_len_reward_std": 0.04643943645060063, + "step": 5340 + }, + { + "advantages": -6.996095393674296e-07, + "advantages_std": 1.5091347455978394, + "clip_ratio": 0.0, + "completion_length": 87.61726303100586, + "epoch": 4.023308270676692, + "grad_norm": 6.40625, + "kl": 0.318861123919487, + "learning_rate": 2.9887218045112786e-06, + "loss": 0.0322, + "num_tokens": 10790785.0, + "reward": -1.4366509914398193, + "reward_std": 6.499625730514526, + "rewards/get_chromagram_reward": 0.6275740385055542, + "rewards/get_chromagram_reward_std": 0.11150631085038185, + "rewards/get_intelligibility_reward": -4.918511700630188, + "rewards/get_intelligibility_reward_std": 10.360735988616943, + "rewards/get_target_len_reward": -0.0190150436013937, + "rewards/get_target_len_reward_std": 0.046726927347481254, + "step": 5350 + }, + { + "advantages": 2.533197374532392e-07, + "advantages_std": 1.444432508945465, + "clip_ratio": 0.0, + "completion_length": 89.69166946411133, + "epoch": 4.030827067669173, + "grad_norm": 464.0, + "kl": 0.40669417977333067, + "learning_rate": 2.984962406015038e-06, + "loss": 0.0499, + "num_tokens": 11105345.0, + "reward": -0.9948464393615722, + "reward_std": 6.640193319320678, + "rewards/get_chromagram_reward": 0.605463171005249, + "rewards/get_chromagram_reward_std": 0.12197316065430641, + "rewards/get_intelligibility_reward": -3.5692497849464417, + "rewards/get_intelligibility_reward_std": 10.988998126983642, + "rewards/get_target_len_reward": -0.020752519182860852, + "rewards/get_target_len_reward_std": 0.07180812451988458, + "step": 5360 + }, + { + "advantages": -4.1847427603158847e-07, + "advantages_std": 1.4834824800491333, + "clip_ratio": 0.0, + "completion_length": 86.3851203918457, + "epoch": 4.038345864661654, + "grad_norm": 8.6875, + "kl": 0.3193838641047478, + "learning_rate": 2.981203007518797e-06, + "loss": 0.0337, + "num_tokens": 11410833.0, + "reward": -1.4203301072120667, + "reward_std": 6.485980701446533, + "rewards/get_chromagram_reward": 0.6130522310733795, + "rewards/get_chromagram_reward_std": 0.1125810906291008, + "rewards/get_intelligibility_reward": -4.852126169204712, + "rewards/get_intelligibility_reward_std": 10.270601749420166, + "rewards/get_target_len_reward": -0.021915959380567073, + "rewards/get_target_len_reward_std": 0.07191921528428794, + "step": 5370 + }, + { + "advantages": -8.195636169716636e-09, + "advantages_std": 1.409821331501007, + "clip_ratio": 0.0, + "completion_length": 86.14285888671876, + "epoch": 4.045864661654135, + "grad_norm": 15.5, + "kl": 0.4851821750402451, + "learning_rate": 2.9774436090225567e-06, + "loss": 0.0523, + "num_tokens": 11714740.0, + "reward": -1.6428377270698546, + "reward_std": 6.631846380233765, + "rewards/get_chromagram_reward": 0.6138918578624726, + "rewards/get_chromagram_reward_std": 0.11235097497701645, + "rewards/get_intelligibility_reward": -5.515289831161499, + "rewards/get_intelligibility_reward_std": 10.434710884094239, + "rewards/get_target_len_reward": -0.02711508497595787, + "rewards/get_target_len_reward_std": 0.08662580009549856, + "step": 5380 + }, + { + "advantages": -6.115685017960005e-07, + "advantages_std": 1.613796353340149, + "clip_ratio": 0.0, + "completion_length": 88.29404907226562, + "epoch": 4.053383458646617, + "grad_norm": 5.1875, + "kl": 0.290841107070446, + "learning_rate": 2.973684210526316e-06, + "loss": 0.0296, + "num_tokens": 12025384.0, + "reward": -1.0370060920715332, + "reward_std": 6.411907720565796, + "rewards/get_chromagram_reward": 0.6243631541728973, + "rewards/get_chromagram_reward_std": 0.1036013200879097, + "rewards/get_intelligibility_reward": -3.718669390678406, + "rewards/get_intelligibility_reward_std": 10.511144828796386, + "rewards/get_target_len_reward": -0.016711823269724847, + "rewards/get_target_len_reward_std": 0.0399149265140295, + "step": 5390 + }, + { + "advantages": 7.460514893864456e-07, + "advantages_std": 1.6174774885177612, + "clip_ratio": 0.0, + "completion_length": 88.3398826599121, + "epoch": 4.060902255639098, + "grad_norm": 7.1875, + "kl": 0.34107607007026675, + "learning_rate": 2.9699248120300755e-06, + "loss": 0.0348, + "num_tokens": 12336297.0, + "reward": -1.747072759270668, + "reward_std": 7.106879091262817, + "rewards/get_chromagram_reward": 0.6148701965808868, + "rewards/get_chromagram_reward_std": 0.11414720416069031, + "rewards/get_intelligibility_reward": -5.834241986274719, + "rewards/get_intelligibility_reward_std": 11.133132362365723, + "rewards/get_target_len_reward": -0.021846203505992888, + "rewards/get_target_len_reward_std": 0.056982779502868654, + "step": 5400 + }, + { + "advantages": -1.835326429500128e-07, + "advantages_std": 1.6123695611953734, + "clip_ratio": 0.0, + "completion_length": 89.4154769897461, + "epoch": 4.068421052631579, + "grad_norm": 10.0625, + "kl": 0.5835892543196678, + "learning_rate": 2.9661654135338348e-06, + "loss": 0.0606, + "num_tokens": 12650398.0, + "reward": -1.3808288365602492, + "reward_std": 6.739897966384888, + "rewards/get_chromagram_reward": 0.6306729674339294, + "rewards/get_chromagram_reward_std": 0.10669080466032028, + "rewards/get_intelligibility_reward": -4.7529010534286495, + "rewards/get_intelligibility_reward_std": 10.854820346832275, + "rewards/get_target_len_reward": -0.02025802955031395, + "rewards/get_target_len_reward_std": 0.05213299170136452, + "step": 5410 + }, + { + "advantages": 3.25590386296426e-07, + "advantages_std": 1.5314580202102661, + "clip_ratio": 0.0, + "completion_length": 87.20892868041992, + "epoch": 4.07593984962406, + "grad_norm": 79.0, + "kl": 0.5323092341423035, + "learning_rate": 2.9624060150375944e-06, + "loss": 0.0581, + "num_tokens": 12957528.0, + "reward": -1.5797663807868958, + "reward_std": 6.701803588867188, + "rewards/get_chromagram_reward": 0.6151145219802856, + "rewards/get_chromagram_reward_std": 0.10405527502298355, + "rewards/get_intelligibility_reward": -5.333010649681091, + "rewards/get_intelligibility_reward_std": 10.452555561065674, + "rewards/get_target_len_reward": -0.021402441896498203, + "rewards/get_target_len_reward_std": 0.06293704155832529, + "step": 5420 + }, + { + "advantages": -1.4801821208720866e-07, + "advantages_std": 1.6071329593658448, + "clip_ratio": 0.0, + "completion_length": 85.10119171142578, + "epoch": 4.083458646616541, + "grad_norm": 6.5, + "kl": 0.2981695577502251, + "learning_rate": 2.9586466165413536e-06, + "loss": 0.0306, + "num_tokens": 13259052.0, + "reward": -1.3751710176467895, + "reward_std": 6.559592008590698, + "rewards/get_chromagram_reward": 0.6217553555965424, + "rewards/get_chromagram_reward_std": 0.11424238607287407, + "rewards/get_intelligibility_reward": -4.729965303838253, + "rewards/get_intelligibility_reward_std": 10.498331451416016, + "rewards/get_target_len_reward": -0.017302784696221353, + "rewards/get_target_len_reward_std": 0.049371255189180376, + "step": 5430 + }, + { + "advantages": 5.831321175264747e-07, + "advantages_std": 1.4913076996803283, + "clip_ratio": 0.0, + "completion_length": 91.3130973815918, + "epoch": 4.090977443609023, + "grad_norm": 12352.0, + "kl": 1.394778886437416, + "learning_rate": 2.9548872180451133e-06, + "loss": 0.141, + "num_tokens": 13577797.0, + "reward": -1.4640146307647228, + "reward_std": 7.061498022079467, + "rewards/get_chromagram_reward": 0.6307594776153564, + "rewards/get_chromagram_reward_std": 0.10615155696868897, + "rewards/get_intelligibility_reward": -5.001734495162964, + "rewards/get_intelligibility_reward_std": 11.286694431304932, + "rewards/get_target_len_reward": -0.021068642288446425, + "rewards/get_target_len_reward_std": 0.0496134627610445, + "step": 5440 + }, + { + "advantages": 4.137555919214719e-07, + "advantages_std": 1.691400933265686, + "clip_ratio": 0.0, + "completion_length": 89.92916641235351, + "epoch": 4.098496240601504, + "grad_norm": 6.375, + "kl": 0.3118233859539032, + "learning_rate": 2.9511278195488725e-06, + "loss": 0.036, + "num_tokens": 13893208.0, + "reward": -1.2735981404781342, + "reward_std": 6.645020818710327, + "rewards/get_chromagram_reward": 0.6332329690456391, + "rewards/get_chromagram_reward_std": 0.11299219503998756, + "rewards/get_intelligibility_reward": -4.42809339761734, + "rewards/get_intelligibility_reward_std": 10.679800605773925, + "rewards/get_target_len_reward": -0.02593356678262353, + "rewards/get_target_len_reward_std": 0.07397942505776882, + "step": 5450 + }, + { + "advantages": 5.35696766945648e-07, + "advantages_std": 1.7403199791908264, + "clip_ratio": 0.0, + "completion_length": 87.48869247436524, + "epoch": 4.106015037593985, + "grad_norm": 7.65625, + "kl": 0.3418915793299675, + "learning_rate": 2.9473684210526317e-06, + "loss": 0.0369, + "num_tokens": 14201166.0, + "reward": -1.415563040971756, + "reward_std": 6.7440876960754395, + "rewards/get_chromagram_reward": 0.6164253532886506, + "rewards/get_chromagram_reward_std": 0.11742549315094948, + "rewards/get_intelligibility_reward": -4.8431459903717045, + "rewards/get_intelligibility_reward_std": 10.860113525390625, + "rewards/get_target_len_reward": -0.019968316610902547, + "rewards/get_target_len_reward_std": 0.05547744482755661, + "step": 5460 + }, + { + "advantages": -2.1656354647348054e-07, + "advantages_std": 1.6330742359161377, + "clip_ratio": 0.0, + "completion_length": 87.73690719604492, + "epoch": 4.113533834586466, + "grad_norm": 6.28125, + "kl": 0.3976124137639999, + "learning_rate": 2.9436090225563914e-06, + "loss": 0.0383, + "num_tokens": 14510130.0, + "reward": -0.9823278225958347, + "reward_std": 6.470097780227661, + "rewards/get_chromagram_reward": 0.6272465288639069, + "rewards/get_chromagram_reward_std": 0.12040900364518166, + "rewards/get_intelligibility_reward": -3.553873872756958, + "rewards/get_intelligibility_reward_std": 10.69575605392456, + "rewards/get_target_len_reward": -0.020355920772999525, + "rewards/get_target_len_reward_std": 0.04151589758694172, + "step": 5470 + }, + { + "advantages": -5.078812620240569e-07, + "advantages_std": 1.4891107320785522, + "clip_ratio": 0.0, + "completion_length": 90.52381210327148, + "epoch": 4.121052631578947, + "grad_norm": 12.4375, + "kl": 0.33563627153635023, + "learning_rate": 2.9398496240601506e-06, + "loss": 0.0344, + "num_tokens": 14826580.0, + "reward": -1.1729493260383606, + "reward_std": 6.722590684890747, + "rewards/get_chromagram_reward": 0.6255541265010833, + "rewards/get_chromagram_reward_std": 0.1094115249812603, + "rewards/get_intelligibility_reward": -4.125180602073669, + "rewards/get_intelligibility_reward_std": 11.05735092163086, + "rewards/get_target_len_reward": -0.01922136452049017, + "rewards/get_target_len_reward_std": 0.05560791753232479, + "step": 5480 + }, + { + "advantages": 2.928078387753885e-07, + "advantages_std": 1.6454651832580567, + "clip_ratio": 0.0, + "completion_length": 88.87262115478515, + "epoch": 4.128571428571428, + "grad_norm": 7.28125, + "kl": 0.39739508330821993, + "learning_rate": 2.9360902255639102e-06, + "loss": 0.0451, + "num_tokens": 15138643.0, + "reward": -1.1072691828012466, + "reward_std": 6.511484289169312, + "rewards/get_chromagram_reward": 0.6285755276679993, + "rewards/get_chromagram_reward_std": 0.11080964356660843, + "rewards/get_intelligibility_reward": -3.926626533269882, + "rewards/get_intelligibility_reward_std": 10.474292373657226, + "rewards/get_target_len_reward": -0.023756447620689868, + "rewards/get_target_len_reward_std": 0.06292850002646447, + "step": 5490 + }, + { + "advantages": 1.8253922888789021e-07, + "advantages_std": 1.596780240535736, + "clip_ratio": 0.0, + "completion_length": 87.02202529907227, + "epoch": 4.13609022556391, + "grad_norm": 8.125, + "kl": 0.2828808709979057, + "learning_rate": 2.9323308270676694e-06, + "loss": 0.0319, + "num_tokens": 15445656.0, + "reward": -1.3161565911024808, + "reward_std": 6.487222576141358, + "rewards/get_chromagram_reward": 0.6143887758255004, + "rewards/get_chromagram_reward_std": 0.1173232764005661, + "rewards/get_intelligibility_reward": -4.541189068555832, + "rewards/get_intelligibility_reward_std": 10.356179904937743, + "rewards/get_target_len_reward": -0.02166922325268388, + "rewards/get_target_len_reward_std": 0.06364731937646866, + "step": 5500 + }, + { + "advantages": -1.914799199198569e-07, + "advantages_std": 1.5466854929924012, + "clip_ratio": 0.0, + "completion_length": 86.2648826599121, + "epoch": 4.143609022556391, + "grad_norm": 6.53125, + "kl": 0.344246631860733, + "learning_rate": 2.928571428571429e-06, + "loss": 0.0368, + "num_tokens": 15750267.0, + "reward": -1.5448312044143677, + "reward_std": 6.672153091430664, + "rewards/get_chromagram_reward": 0.6186954915523529, + "rewards/get_chromagram_reward_std": 0.11632555276155472, + "rewards/get_intelligibility_reward": -5.229874622821808, + "rewards/get_intelligibility_reward_std": 10.553904008865356, + "rewards/get_target_len_reward": -0.02331397421658039, + "rewards/get_target_len_reward_std": 0.06315616220235824, + "step": 5510 + }, + { + "advantages": 1.713633551503335e-07, + "advantages_std": 1.539741826057434, + "clip_ratio": 0.0, + "completion_length": 87.74524002075195, + "epoch": 4.151127819548872, + "grad_norm": 9.3125, + "kl": 0.3175946220755577, + "learning_rate": 2.9248120300751883e-06, + "loss": 0.0348, + "num_tokens": 16059223.0, + "reward": -1.3458143293857574, + "reward_std": 6.4293193340301515, + "rewards/get_chromagram_reward": 0.6102846920490265, + "rewards/get_chromagram_reward_std": 0.11779590845108032, + "rewards/get_intelligibility_reward": -4.625285410881043, + "rewards/get_intelligibility_reward_std": 10.368893718719482, + "rewards/get_target_len_reward": -0.022442126646637917, + "rewards/get_target_len_reward_std": 0.06540702283382416, + "step": 5520 + }, + { + "advantages": 4.1226550706596755e-08, + "advantages_std": 1.5556996703147887, + "clip_ratio": 0.0, + "completion_length": 85.48750152587891, + "epoch": 4.158646616541353, + "grad_norm": 8.75, + "kl": 0.3429344639182091, + "learning_rate": 2.921052631578948e-06, + "loss": 0.0391, + "num_tokens": 16361772.0, + "reward": -1.7375137686729432, + "reward_std": 6.66321702003479, + "rewards/get_chromagram_reward": 0.6258727490901947, + "rewards/get_chromagram_reward_std": 0.11534877270460128, + "rewards/get_intelligibility_reward": -5.815235280990601, + "rewards/get_intelligibility_reward_std": 10.311593818664551, + "rewards/get_target_len_reward": -0.023178601637482643, + "rewards/get_target_len_reward_std": 0.060010458901524545, + "step": 5530 + }, + { + "advantages": 1.2442469881079888e-07, + "advantages_std": 1.5329967498779298, + "clip_ratio": 0.0, + "completion_length": 86.05119247436524, + "epoch": 4.166165413533834, + "grad_norm": 6.1875, + "kl": 0.3746921971440315, + "learning_rate": 2.9172932330827068e-06, + "loss": 0.0507, + "num_tokens": 16665461.0, + "reward": -1.7003209590911865, + "reward_std": 6.663384437561035, + "rewards/get_chromagram_reward": 0.6338975608348847, + "rewards/get_chromagram_reward_std": 0.11849100887775421, + "rewards/get_intelligibility_reward": -5.70644314289093, + "rewards/get_intelligibility_reward_std": 10.250672149658204, + "rewards/get_target_len_reward": -0.02841706983745098, + "rewards/get_target_len_reward_std": 0.08157789278775454, + "step": 5540 + }, + { + "advantages": 2.5990108838414016e-07, + "advantages_std": 1.5833655834197997, + "clip_ratio": 0.0, + "completion_length": 85.0321434020996, + "epoch": 4.173684210526316, + "grad_norm": 7.1875, + "kl": 0.3090577393770218, + "learning_rate": 2.913533834586467e-06, + "loss": 0.0338, + "num_tokens": 16966534.0, + "reward": -1.4059063911437988, + "reward_std": 6.507475471496582, + "rewards/get_chromagram_reward": 0.61700838804245, + "rewards/get_chromagram_reward_std": 0.12314107269048691, + "rewards/get_intelligibility_reward": -4.815485262870789, + "rewards/get_intelligibility_reward_std": 10.44538254737854, + "rewards/get_target_len_reward": -0.019241928216069938, + "rewards/get_target_len_reward_std": 0.054523023031651976, + "step": 5550 + }, + { + "advantages": 4.5100848637957825e-07, + "advantages_std": 1.6410022377967834, + "clip_ratio": 0.0, + "completion_length": 87.3773811340332, + "epoch": 4.181203007518797, + "grad_norm": 7.125, + "kl": 0.3442951112985611, + "learning_rate": 2.9097744360902256e-06, + "loss": 0.0377, + "num_tokens": 17274196.0, + "reward": -1.3003638498485088, + "reward_std": 6.568748521804809, + "rewards/get_chromagram_reward": 0.6183872222900391, + "rewards/get_chromagram_reward_std": 0.11773400530219078, + "rewards/get_intelligibility_reward": -4.495910170674324, + "rewards/get_intelligibility_reward_std": 10.582419872283936, + "rewards/get_target_len_reward": -0.02356829959899187, + "rewards/get_target_len_reward_std": 0.06113504599779844, + "step": 5560 + }, + { + "advantages": -8.667507920279149e-08, + "advantages_std": 1.654595386981964, + "clip_ratio": 0.0, + "completion_length": 87.95535888671876, + "epoch": 4.188721804511278, + "grad_norm": 6.125, + "kl": 0.2908389538526535, + "learning_rate": 2.906015037593985e-06, + "loss": 0.0346, + "num_tokens": 17583336.0, + "reward": -1.6878258869051934, + "reward_std": 6.9916908740997314, + "rewards/get_chromagram_reward": 0.6317319989204406, + "rewards/get_chromagram_reward_std": 0.11053061783313751, + "rewards/get_intelligibility_reward": -5.674407863616944, + "rewards/get_intelligibility_reward_std": 10.983782196044922, + "rewards/get_target_len_reward": -0.020801611058413983, + "rewards/get_target_len_reward_std": 0.06081876866519451, + "step": 5570 + }, + { + "advantages": 3.578762374445432e-07, + "advantages_std": 1.6615803837776184, + "clip_ratio": 0.0, + "completion_length": 88.32202453613282, + "epoch": 4.196240601503759, + "grad_norm": 16.375, + "kl": 0.44996539801359176, + "learning_rate": 2.9022556390977445e-06, + "loss": 0.0478, + "num_tokens": 17894230.0, + "reward": -1.3394523441791535, + "reward_std": 6.421341896057129, + "rewards/get_chromagram_reward": 0.6383058249950408, + "rewards/get_chromagram_reward_std": 0.11125587001442909, + "rewards/get_intelligibility_reward": -4.6326796293258665, + "rewards/get_intelligibility_reward_std": 10.272837495803833, + "rewards/get_target_len_reward": -0.023982838820666076, + "rewards/get_target_len_reward_std": 0.05926213786005974, + "step": 5580 + }, + { + "advantages": -7.5052185337654e-07, + "advantages_std": 1.6332746505737306, + "clip_ratio": 0.0, + "completion_length": 88.58750076293946, + "epoch": 4.20375939849624, + "grad_norm": 6.15625, + "kl": 0.3342150181531906, + "learning_rate": 2.8984962406015037e-06, + "loss": 0.0402, + "num_tokens": 18206003.0, + "reward": -1.4808701485395432, + "reward_std": 7.0243466854095455, + "rewards/get_chromagram_reward": 0.6189306735992431, + "rewards/get_chromagram_reward_std": 0.11732441484928131, + "rewards/get_intelligibility_reward": -5.037647825479508, + "rewards/get_intelligibility_reward_std": 11.133300590515137, + "rewards/get_target_len_reward": -0.02389297802001238, + "rewards/get_target_len_reward_std": 0.07138866055756807, + "step": 5590 + }, + { + "advantages": 5.985300020938667e-07, + "advantages_std": 1.5033831596374512, + "clip_ratio": 0.0, + "completion_length": 83.35476379394531, + "epoch": 4.211278195488722, + "grad_norm": 44.5, + "kl": 0.37868370711803434, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.0388, + "num_tokens": 18503732.0, + "reward": -1.8184113264083863, + "reward_std": 6.87455472946167, + "rewards/get_chromagram_reward": 0.6338491857051849, + "rewards/get_chromagram_reward_std": 0.11686685383319854, + "rewards/get_intelligibility_reward": -6.063382339477539, + "rewards/get_intelligibility_reward_std": 10.610133171081543, + "rewards/get_target_len_reward": -0.025700561329722406, + "rewards/get_target_len_reward_std": 0.06520087532699108, + "step": 5600 + }, + { + "advantages": -2.515812752790225e-07, + "advantages_std": 1.4947779536247254, + "clip_ratio": 0.0, + "completion_length": 87.99464492797851, + "epoch": 4.218796992481203, + "grad_norm": 12.0625, + "kl": 0.4214991435408592, + "learning_rate": 2.8909774436090226e-06, + "loss": 0.0469, + "num_tokens": 18814136.0, + "reward": -1.0186500787734984, + "reward_std": 6.064566564559937, + "rewards/get_chromagram_reward": 0.6329184353351593, + "rewards/get_chromagram_reward_std": 0.10823804661631584, + "rewards/get_intelligibility_reward": -3.66917080283165, + "rewards/get_intelligibility_reward_std": 9.890348434448242, + "rewards/get_target_len_reward": -0.019697726145386694, + "rewards/get_target_len_reward_std": 0.05908492133021355, + "step": 5610 + }, + { + "advantages": -2.615153995577657e-07, + "advantages_std": 1.5261252880096436, + "clip_ratio": 0.0, + "completion_length": 89.44642944335938, + "epoch": 4.226315789473684, + "grad_norm": 50.0, + "kl": 0.9879576608538627, + "learning_rate": 2.8872180451127822e-06, + "loss": 0.1026, + "num_tokens": 19128465.0, + "reward": -1.367058303952217, + "reward_std": 6.9173722743988035, + "rewards/get_chromagram_reward": 0.6275076985359191, + "rewards/get_chromagram_reward_std": 0.10938590541481971, + "rewards/get_intelligibility_reward": -4.710054632276297, + "rewards/get_intelligibility_reward_std": 11.131224727630615, + "rewards/get_target_len_reward": -0.018627769872546197, + "rewards/get_target_len_reward_std": 0.0545194873586297, + "step": 5620 + }, + { + "advantages": -9.983778284095025e-08, + "advantages_std": 1.6729804039001466, + "clip_ratio": 0.0, + "completion_length": 87.58035736083984, + "epoch": 4.233834586466165, + "grad_norm": 6.6875, + "kl": 3.7320891961455347, + "learning_rate": 2.8834586466165414e-06, + "loss": 0.3752, + "num_tokens": 19437278.0, + "reward": -1.323493231832981, + "reward_std": 6.493629074096679, + "rewards/get_chromagram_reward": 0.6278137683868408, + "rewards/get_chromagram_reward_std": 0.12112269923090935, + "rewards/get_intelligibility_reward": -4.576982426643371, + "rewards/get_intelligibility_reward_std": 10.355177402496338, + "rewards/get_target_len_reward": -0.021310653630644084, + "rewards/get_target_len_reward_std": 0.05675790887326002, + "step": 5630 + }, + { + "advantages": -1.630435328081603e-07, + "advantages_std": 1.5453658819198608, + "clip_ratio": 0.0, + "completion_length": 87.91547927856445, + "epoch": 4.241353383458646, + "grad_norm": 9.75, + "kl": 0.38267752528190613, + "learning_rate": 2.879699248120301e-06, + "loss": 0.0406, + "num_tokens": 19747203.0, + "reward": -1.4079192280769348, + "reward_std": 6.9246522903442385, + "rewards/get_chromagram_reward": 0.6170772731304168, + "rewards/get_chromagram_reward_std": 0.11740869060158729, + "rewards/get_intelligibility_reward": -4.8179065704345705, + "rewards/get_intelligibility_reward_std": 11.177426147460938, + "rewards/get_target_len_reward": -0.022928063943982124, + "rewards/get_target_len_reward_std": 0.061421534046530724, + "step": 5640 + }, + { + "advantages": 4.1338305045712787e-07, + "advantages_std": 1.6221726179122924, + "clip_ratio": 0.0, + "completion_length": 85.67976303100586, + "epoch": 4.248872180451128, + "grad_norm": 6.9375, + "kl": 5.435917441546917, + "learning_rate": 2.8759398496240603e-06, + "loss": 0.5473, + "num_tokens": 20050422.0, + "reward": -2.090642374753952, + "reward_std": 7.292528247833252, + "rewards/get_chromagram_reward": 0.6181860446929932, + "rewards/get_chromagram_reward_std": 0.11201700642704963, + "rewards/get_intelligibility_reward": -6.86620078086853, + "rewards/get_intelligibility_reward_std": 11.104904079437256, + "rewards/get_target_len_reward": -0.02391198929399252, + "rewards/get_target_len_reward_std": 0.07691430859267712, + "step": 5650 + }, + { + "advantages": 1.1871259175677551e-07, + "advantages_std": 1.589626944065094, + "clip_ratio": 0.0, + "completion_length": 85.58928680419922, + "epoch": 4.256390977443609, + "grad_norm": 82.5, + "kl": 0.360989385843277, + "learning_rate": 2.8721804511278195e-06, + "loss": 0.0408, + "num_tokens": 20353824.0, + "reward": -1.523736972361803, + "reward_std": 6.698141288757324, + "rewards/get_chromagram_reward": 0.611247593164444, + "rewards/get_chromagram_reward_std": 0.1271521754562855, + "rewards/get_intelligibility_reward": -5.157619923353195, + "rewards/get_intelligibility_reward_std": 10.54830846786499, + "rewards/get_target_len_reward": -0.024838272668421268, + "rewards/get_target_len_reward_std": 0.0677162567153573, + "step": 5660 + }, + { + "advantages": -4.0257972386825715e-07, + "advantages_std": 1.6105037808418274, + "clip_ratio": 0.0, + "completion_length": 90.30833435058594, + "epoch": 4.26390977443609, + "grad_norm": 6.75, + "kl": 0.3523553296923637, + "learning_rate": 2.868421052631579e-06, + "loss": 0.0402, + "num_tokens": 20669973.0, + "reward": -1.3020567789673805, + "reward_std": 6.694693613052368, + "rewards/get_chromagram_reward": 0.6207749962806701, + "rewards/get_chromagram_reward_std": 0.11944424584507943, + "rewards/get_intelligibility_reward": -4.507850003242493, + "rewards/get_intelligibility_reward_std": 10.755848407745361, + "rewards/get_target_len_reward": -0.01909503461793065, + "rewards/get_target_len_reward_std": 0.05830676984041929, + "step": 5670 + }, + { + "advantages": 3.429750753625171e-07, + "advantages_std": 1.5029768705368043, + "clip_ratio": 0.0, + "completion_length": 81.91309661865235, + "epoch": 4.271428571428571, + "grad_norm": 6.875, + "kl": 0.32668228149414064, + "learning_rate": 2.8646616541353384e-06, + "loss": 0.0355, + "num_tokens": 20963273.0, + "reward": -1.5594689965248107, + "reward_std": 6.404059028625488, + "rewards/get_chromagram_reward": 0.6218547642230987, + "rewards/get_chromagram_reward_std": 0.1123454861342907, + "rewards/get_intelligibility_reward": -5.275913119316101, + "rewards/get_intelligibility_reward_std": 10.039891481399536, + "rewards/get_target_len_reward": -0.024348314758390187, + "rewards/get_target_len_reward_std": 0.07023859769105911, + "step": 5680 + }, + { + "advantages": 3.874301484074749e-08, + "advantages_std": 1.4815411686897277, + "clip_ratio": 0.0, + "completion_length": 88.20654907226563, + "epoch": 4.278947368421052, + "grad_norm": 8.75, + "kl": 0.3460480824112892, + "learning_rate": 2.860902255639098e-06, + "loss": 0.0406, + "num_tokens": 21273796.0, + "reward": -1.302916806936264, + "reward_std": 7.124443292617798, + "rewards/get_chromagram_reward": 0.6314963459968567, + "rewards/get_chromagram_reward_std": 0.1167108178138733, + "rewards/get_intelligibility_reward": -4.514073705673217, + "rewards/get_intelligibility_reward_std": 11.613936042785644, + "rewards/get_target_len_reward": -0.026172821037471296, + "rewards/get_target_len_reward_std": 0.08674208335578441, + "step": 5690 + }, + { + "advantages": 4.0456654346598955e-07, + "advantages_std": 1.5474624276161193, + "clip_ratio": 0.0, + "completion_length": 87.81726379394532, + "epoch": 4.286466165413533, + "grad_norm": 10.6875, + "kl": 0.30418709963560103, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0319, + "num_tokens": 21582695.0, + "reward": -1.9298257410526276, + "reward_std": 6.67116961479187, + "rewards/get_chromagram_reward": 0.6151466488838195, + "rewards/get_chromagram_reward_std": 0.1229398138821125, + "rewards/get_intelligibility_reward": -6.38664014339447, + "rewards/get_intelligibility_reward_std": 10.024635791778564, + "rewards/get_target_len_reward": -0.017983187455683947, + "rewards/get_target_len_reward_std": 0.046716343890875575, + "step": 5700 + }, + { + "advantages": 6.929040683445464e-08, + "advantages_std": 1.5471153259277344, + "clip_ratio": 0.0, + "completion_length": 84.2125015258789, + "epoch": 4.293984962406015, + "grad_norm": 22.0, + "kl": 0.31379797756671907, + "learning_rate": 2.853383458646617e-06, + "loss": 0.0397, + "num_tokens": 21881815.0, + "reward": -1.4730405285954475, + "reward_std": 6.675574111938476, + "rewards/get_chromagram_reward": 0.6127762138843537, + "rewards/get_chromagram_reward_std": 0.11312752440571786, + "rewards/get_intelligibility_reward": -5.006561887264252, + "rewards/get_intelligibility_reward_std": 10.592585468292237, + "rewards/get_target_len_reward": -0.02533548539504409, + "rewards/get_target_len_reward_std": 0.08211482018232345, + "step": 5710 + }, + { + "advantages": 2.2575259350787746e-07, + "advantages_std": 1.615368866920471, + "clip_ratio": 0.0, + "completion_length": 86.06250228881837, + "epoch": 4.301503759398496, + "grad_norm": 9.8125, + "kl": 0.36106809973716736, + "learning_rate": 2.849624060150376e-06, + "loss": 0.0412, + "num_tokens": 22185867.0, + "reward": -1.664530771970749, + "reward_std": 6.669352960586548, + "rewards/get_chromagram_reward": 0.6247674703598023, + "rewards/get_chromagram_reward_std": 0.10432815700769424, + "rewards/get_intelligibility_reward": -5.594386541843415, + "rewards/get_intelligibility_reward_std": 10.220803928375243, + "rewards/get_target_len_reward": -0.023972975183278324, + "rewards/get_target_len_reward_std": 0.06604787111282348, + "step": 5720 + }, + { + "advantages": -2.2351742465787083e-07, + "advantages_std": 1.6799475312232972, + "clip_ratio": 0.0, + "completion_length": 86.37381057739258, + "epoch": 4.309022556390977, + "grad_norm": 6.125, + "kl": 0.3318557575345039, + "learning_rate": 2.8458646616541358e-06, + "loss": 0.0321, + "num_tokens": 22491298.0, + "reward": -1.5887473464012145, + "reward_std": 6.799813938140869, + "rewards/get_chromagram_reward": 0.6177287518978118, + "rewards/get_chromagram_reward_std": 0.12023614346981049, + "rewards/get_intelligibility_reward": -5.365889692306519, + "rewards/get_intelligibility_reward_std": 10.740260219573974, + "rewards/get_target_len_reward": -0.018080861307680607, + "rewards/get_target_len_reward_std": 0.04215618222951889, + "step": 5730 + }, + { + "advantages": -2.6077032799776134e-07, + "advantages_std": 1.5404383897781373, + "clip_ratio": 0.0, + "completion_length": 86.54404983520507, + "epoch": 4.3165413533834585, + "grad_norm": 9.4375, + "kl": 0.30639515817165375, + "learning_rate": 2.842105263157895e-06, + "loss": 0.0359, + "num_tokens": 22796677.0, + "reward": -1.5064129531383514, + "reward_std": 6.709846925735474, + "rewards/get_chromagram_reward": 0.625343632698059, + "rewards/get_chromagram_reward_std": 0.11143359690904617, + "rewards/get_intelligibility_reward": -5.122057247161865, + "rewards/get_intelligibility_reward_std": 10.567590045928956, + "rewards/get_target_len_reward": -0.022525263484567404, + "rewards/get_target_len_reward_std": 0.07170127313584089, + "step": 5740 + }, + { + "advantages": -7.40587732428466e-07, + "advantages_std": 1.668913996219635, + "clip_ratio": 0.0, + "completion_length": 84.5898826599121, + "epoch": 4.3240601503759395, + "grad_norm": 23.25, + "kl": 0.5153587549924851, + "learning_rate": 2.8383458646616546e-06, + "loss": 0.061, + "num_tokens": 23096548.0, + "reward": -2.0747710138559343, + "reward_std": 7.238304138183594, + "rewards/get_chromagram_reward": 0.6240490555763245, + "rewards/get_chromagram_reward_std": 0.11386263146996498, + "rewards/get_intelligibility_reward": -6.8224663734436035, + "rewards/get_intelligibility_reward_std": 10.980531883239745, + "rewards/get_target_len_reward": -0.025895378738641738, + "rewards/get_target_len_reward_std": 0.078858133405447, + "step": 5750 + }, + { + "advantages": 8.49366280419872e-08, + "advantages_std": 1.5249725341796876, + "clip_ratio": 0.0, + "completion_length": 85.34047927856446, + "epoch": 4.331578947368421, + "grad_norm": 7.0, + "kl": 0.33452349007129667, + "learning_rate": 2.834586466165414e-06, + "loss": 0.042, + "num_tokens": 23398591.0, + "reward": -1.6896941900253295, + "reward_std": 6.341463661193847, + "rewards/get_chromagram_reward": 0.6132433891296387, + "rewards/get_chromagram_reward_std": 0.11532968133687974, + "rewards/get_intelligibility_reward": -5.659934663772583, + "rewards/get_intelligibility_reward_std": 9.797483968734742, + "rewards/get_target_len_reward": -0.022390842065215112, + "rewards/get_target_len_reward_std": 0.07444148659706115, + "step": 5760 + }, + { + "advantages": -9.158005980225425e-08, + "advantages_std": 1.5952982187271119, + "clip_ratio": 0.0, + "completion_length": 84.70654983520508, + "epoch": 4.3390977443609025, + "grad_norm": 10.4375, + "kl": 0.4010193169116974, + "learning_rate": 2.830827067669173e-06, + "loss": 0.0402, + "num_tokens": 23699966.0, + "reward": -1.2389403194189073, + "reward_std": 6.306393384933472, + "rewards/get_chromagram_reward": 0.6315956294536591, + "rewards/get_chromagram_reward_std": 0.12252237275242805, + "rewards/get_intelligibility_reward": -4.322550284862518, + "rewards/get_intelligibility_reward_std": 10.139668178558349, + "rewards/get_target_len_reward": -0.02586617963388562, + "rewards/get_target_len_reward_std": 0.060991795361042024, + "step": 5770 + }, + { + "advantages": 2.7641655195864703e-07, + "advantages_std": 1.6074973464012146, + "clip_ratio": 0.0, + "completion_length": 86.55714492797851, + "epoch": 4.3466165413533835, + "grad_norm": 6.53125, + "kl": 0.34225198477506635, + "learning_rate": 2.8270676691729327e-06, + "loss": 0.0425, + "num_tokens": 24005654.0, + "reward": -1.5039369583129882, + "reward_std": 6.346541547775269, + "rewards/get_chromagram_reward": 0.6210005640983581, + "rewards/get_chromagram_reward_std": 0.11820514425635338, + "rewards/get_intelligibility_reward": -5.111461114883423, + "rewards/get_intelligibility_reward_std": 9.993875980377197, + "rewards/get_target_len_reward": -0.021349991485476495, + "rewards/get_target_len_reward_std": 0.06595882065594197, + "step": 5780 + }, + { + "advantages": -2.7815503500505656e-07, + "advantages_std": 1.572441029548645, + "clip_ratio": 0.0, + "completion_length": 84.38393020629883, + "epoch": 4.3541353383458645, + "grad_norm": 12.5625, + "kl": 0.35131756067276, + "learning_rate": 2.823308270676692e-06, + "loss": 0.0393, + "num_tokens": 24305185.0, + "reward": -1.598071312904358, + "reward_std": 6.891125774383545, + "rewards/get_chromagram_reward": 0.6182050764560699, + "rewards/get_chromagram_reward_std": 0.12791308984160424, + "rewards/get_intelligibility_reward": -5.390382766723633, + "rewards/get_intelligibility_reward_std": 10.94007167816162, + "rewards/get_target_len_reward": -0.022035928349941968, + "rewards/get_target_len_reward_std": 0.05774991624057293, + "step": 5790 + }, + { + "advantages": 6.737808433854297e-07, + "advantages_std": 1.6271725296974182, + "clip_ratio": 0.0, + "completion_length": 86.93452529907226, + "epoch": 4.361654135338346, + "grad_norm": 6.8125, + "kl": 0.5817287877202034, + "learning_rate": 2.8195488721804516e-06, + "loss": 0.0607, + "num_tokens": 24612401.0, + "reward": -1.4664264440536499, + "reward_std": 6.983836317062378, + "rewards/get_chromagram_reward": 0.6144366025924682, + "rewards/get_chromagram_reward_std": 0.1035338170826435, + "rewards/get_intelligibility_reward": -4.996148389577866, + "rewards/get_intelligibility_reward_std": 10.968209648132325, + "rewards/get_target_len_reward": -0.017567448690533637, + "rewards/get_target_len_reward_std": 0.05456386059522629, + "step": 5800 + }, + { + "advantages": 4.3933590703204574e-07, + "advantages_std": 1.5751075744628906, + "clip_ratio": 0.0, + "completion_length": 84.38928756713867, + "epoch": 4.369172932330827, + "grad_norm": 5.46875, + "kl": 0.3394057586789131, + "learning_rate": 2.815789473684211e-06, + "loss": 0.0349, + "num_tokens": 24912174.0, + "reward": -1.1729264855384827, + "reward_std": 6.212002897262574, + "rewards/get_chromagram_reward": 0.6229874014854431, + "rewards/get_chromagram_reward_std": 0.11585783213376999, + "rewards/get_intelligibility_reward": -4.122948503494262, + "rewards/get_intelligibility_reward_std": 10.047537708282471, + "rewards/get_target_len_reward": -0.018818165455013514, + "rewards/get_target_len_reward_std": 0.056252822279930115, + "step": 5810 + }, + { + "advantages": 5.04652664323757e-07, + "advantages_std": 1.6236942529678344, + "clip_ratio": 0.0, + "completion_length": 89.79702529907226, + "epoch": 4.3766917293233085, + "grad_norm": 5.21875, + "kl": 0.30837641209363936, + "learning_rate": 2.8120300751879705e-06, + "loss": 0.0327, + "num_tokens": 25225844.0, + "reward": -1.4446959674358368, + "reward_std": 6.88113808631897, + "rewards/get_chromagram_reward": 0.616160649061203, + "rewards/get_chromagram_reward_std": 0.11032316386699677, + "rewards/get_intelligibility_reward": -4.931406950950622, + "rewards/get_intelligibility_reward_std": 10.971225118637085, + "rewards/get_target_len_reward": -0.018841464072465897, + "rewards/get_target_len_reward_std": 0.05351784508675337, + "step": 5820 + }, + { + "advantages": -1.0083119761361559e-07, + "advantages_std": 1.6048307299613953, + "clip_ratio": 0.0, + "completion_length": 88.46666870117187, + "epoch": 4.38421052631579, + "grad_norm": 11.75, + "kl": 5.939772760868072, + "learning_rate": 2.8082706766917297e-06, + "loss": 0.6032, + "num_tokens": 25536047.0, + "reward": -1.379792395234108, + "reward_std": 6.840978622436523, + "rewards/get_chromagram_reward": 0.6219044208526612, + "rewards/get_chromagram_reward_std": 0.1071255199611187, + "rewards/get_intelligibility_reward": -4.737642979621887, + "rewards/get_intelligibility_reward_std": 11.058589458465576, + "rewards/get_target_len_reward": -0.023638481460511684, + "rewards/get_target_len_reward_std": 0.08313055895268917, + "step": 5830 + }, + { + "advantages": 8.421639904554467e-07, + "advantages_std": 1.7479196310043335, + "clip_ratio": 0.0, + "completion_length": 89.10416946411132, + "epoch": 4.391729323308271, + "grad_norm": 5.875, + "kl": 0.3234842225909233, + "learning_rate": 2.8045112781954893e-06, + "loss": 0.0352, + "num_tokens": 25849416.0, + "reward": -1.7320613265037537, + "reward_std": 6.80491828918457, + "rewards/get_chromagram_reward": 0.6146539449691772, + "rewards/get_chromagram_reward_std": 0.11938665956258773, + "rewards/get_intelligibility_reward": -5.792370629310608, + "rewards/get_intelligibility_reward_std": 10.65215892791748, + "rewards/get_target_len_reward": -0.01846689051017165, + "rewards/get_target_len_reward_std": 0.04976162984967232, + "step": 5840 + }, + { + "advantages": -6.41991583449908e-08, + "advantages_std": 1.493267583847046, + "clip_ratio": 0.0, + "completion_length": 86.42857284545899, + "epoch": 4.399248120300752, + "grad_norm": 11.3125, + "kl": 0.39334663897752764, + "learning_rate": 2.8007518796992485e-06, + "loss": 0.0486, + "num_tokens": 26154685.0, + "reward": -1.2574773922562599, + "reward_std": 7.319183826446533, + "rewards/get_chromagram_reward": 0.6342748403549194, + "rewards/get_chromagram_reward_std": 0.11602752506732941, + "rewards/get_intelligibility_reward": -4.3805874206125734, + "rewards/get_intelligibility_reward_std": 11.956838607788086, + "rewards/get_target_len_reward": -0.026119467988610267, + "rewards/get_target_len_reward_std": 0.0734117228537798, + "step": 5850 + }, + { + "advantages": -6.218751522624189e-07, + "advantages_std": 1.4529019594192505, + "clip_ratio": 0.0, + "completion_length": 87.32619247436523, + "epoch": 4.406766917293233, + "grad_norm": 12.625, + "kl": 0.3028931975364685, + "learning_rate": 2.7969924812030073e-06, + "loss": 0.037, + "num_tokens": 26463154.0, + "reward": -1.484662154316902, + "reward_std": 6.946614217758179, + "rewards/get_chromagram_reward": 0.6176621794700623, + "rewards/get_chromagram_reward_std": 0.11052877753973007, + "rewards/get_intelligibility_reward": -5.0487511396408085, + "rewards/get_intelligibility_reward_std": 11.156039571762085, + "rewards/get_target_len_reward": -0.022897268738597633, + "rewards/get_target_len_reward_std": 0.07214595098048449, + "step": 5860 + }, + { + "advantages": -2.3742517116787098e-07, + "advantages_std": 1.6307622075080872, + "clip_ratio": 0.0, + "completion_length": 84.73988265991211, + "epoch": 4.414285714285715, + "grad_norm": 17.0, + "kl": 0.3400808498263359, + "learning_rate": 2.793233082706767e-06, + "loss": 0.0393, + "num_tokens": 26763952.0, + "reward": -1.5526393622159957, + "reward_std": 7.049071836471557, + "rewards/get_chromagram_reward": 0.6250545501708984, + "rewards/get_chromagram_reward_std": 0.11878361329436302, + "rewards/get_intelligibility_reward": -5.262066769599914, + "rewards/get_intelligibility_reward_std": 11.234350681304932, + "rewards/get_target_len_reward": -0.020905664563179015, + "rewards/get_target_len_reward_std": 0.061207803711295125, + "step": 5870 + }, + { + "advantages": 1.2715656794171082e-07, + "advantages_std": 1.5928070425987244, + "clip_ratio": 0.0, + "completion_length": 86.91369247436523, + "epoch": 4.421804511278196, + "grad_norm": 6.875, + "kl": 0.3369604110717773, + "learning_rate": 2.789473684210526e-06, + "loss": 0.0325, + "num_tokens": 27070512.0, + "reward": -1.6088357031345368, + "reward_std": 6.812682485580444, + "rewards/get_chromagram_reward": 0.6178681910037994, + "rewards/get_chromagram_reward_std": 0.11353974491357803, + "rewards/get_intelligibility_reward": -5.426232194900512, + "rewards/get_intelligibility_reward_std": 10.729567432403565, + "rewards/get_target_len_reward": -0.018142933025956154, + "rewards/get_target_len_reward_std": 0.04331641979515553, + "step": 5880 + }, + { + "advantages": 3.601114144657913e-08, + "advantages_std": 1.674160885810852, + "clip_ratio": 0.0, + "completion_length": 88.12619323730469, + "epoch": 4.429323308270677, + "grad_norm": 6.21875, + "kl": 0.39946324825286866, + "learning_rate": 2.785714285714286e-06, + "loss": 0.0444, + "num_tokens": 27380269.0, + "reward": -1.5472537845373153, + "reward_std": 6.7644494533538815, + "rewards/get_chromagram_reward": 0.6225440621376037, + "rewards/get_chromagram_reward_std": 0.11376380547881126, + "rewards/get_intelligibility_reward": -5.241733276844025, + "rewards/get_intelligibility_reward_std": 10.670757675170899, + "rewards/get_target_len_reward": -0.022571913711726666, + "rewards/get_target_len_reward_std": 0.06656058058142662, + "step": 5890 + }, + { + "advantages": -1.8551947178480077e-07, + "advantages_std": 1.478583037853241, + "clip_ratio": 0.0, + "completion_length": 88.22381057739258, + "epoch": 4.436842105263158, + "grad_norm": 3104.0, + "kl": 0.5128657639026641, + "learning_rate": 2.781954887218045e-06, + "loss": 0.0545, + "num_tokens": 27690235.0, + "reward": -1.3492045670747757, + "reward_std": 6.4950531959533695, + "rewards/get_chromagram_reward": 0.640613716840744, + "rewards/get_chromagram_reward_std": 0.10715582817792893, + "rewards/get_intelligibility_reward": -4.665818774700165, + "rewards/get_intelligibility_reward_std": 10.406336688995362, + "rewards/get_target_len_reward": -0.02240826766937971, + "rewards/get_target_len_reward_std": 0.05692440681159496, + "step": 5900 + }, + { + "advantages": -8.891026226365284e-08, + "advantages_std": 1.6906216025352478, + "clip_ratio": 0.0, + "completion_length": 84.81190719604493, + "epoch": 4.444360902255639, + "grad_norm": 6.875, + "kl": 0.37526106983423235, + "learning_rate": 2.7781954887218047e-06, + "loss": 0.037, + "num_tokens": 27991745.0, + "reward": -1.7090884655714036, + "reward_std": 6.80400128364563, + "rewards/get_chromagram_reward": 0.6135359227657318, + "rewards/get_chromagram_reward_std": 0.11744362115859985, + "rewards/get_intelligibility_reward": -5.7205850839614865, + "rewards/get_intelligibility_reward_std": 10.553275012969971, + "rewards/get_target_len_reward": -0.020216171443462373, + "rewards/get_target_len_reward_std": 0.051091530919075014, + "step": 5910 + }, + { + "advantages": 3.427265564681647e-08, + "advantages_std": 1.5322723031044005, + "clip_ratio": 0.0, + "completion_length": 87.55000152587891, + "epoch": 4.451879699248121, + "grad_norm": 182.0, + "kl": 0.3043200358748436, + "learning_rate": 2.774436090225564e-06, + "loss": 0.0305, + "num_tokens": 28300332.0, + "reward": -1.1945252060890197, + "reward_std": 6.519233894348145, + "rewards/get_chromagram_reward": 0.6296118319034576, + "rewards/get_chromagram_reward_std": 0.11107028499245644, + "rewards/get_intelligibility_reward": -4.194262075424194, + "rewards/get_intelligibility_reward_std": 10.339669322967529, + "rewards/get_target_len_reward": -0.01892517600208521, + "rewards/get_target_len_reward_std": 0.05237778052687645, + "step": 5920 + }, + { + "advantages": -3.881752730805488e-07, + "advantages_std": 1.521967101097107, + "clip_ratio": 0.0, + "completion_length": 86.82916870117188, + "epoch": 4.459398496240602, + "grad_norm": 58.75, + "kl": 0.26121333986520767, + "learning_rate": 2.7706766917293236e-06, + "loss": 0.0365, + "num_tokens": 28607212.0, + "reward": -1.2213778406381608, + "reward_std": 6.433756160736084, + "rewards/get_chromagram_reward": 0.6160942673683166, + "rewards/get_chromagram_reward_std": 0.11823522076010703, + "rewards/get_intelligibility_reward": -4.2590295061469075, + "rewards/get_intelligibility_reward_std": 10.392317485809325, + "rewards/get_target_len_reward": -0.021197985392063856, + "rewards/get_target_len_reward_std": 0.07294883448630571, + "step": 5930 + }, + { + "advantages": 1.671413798476351e-07, + "advantages_std": 1.613970112800598, + "clip_ratio": 0.0, + "completion_length": 87.19940643310547, + "epoch": 4.466917293233083, + "grad_norm": 19.5, + "kl": 0.2971110358834267, + "learning_rate": 2.766917293233083e-06, + "loss": 0.037, + "num_tokens": 28914533.0, + "reward": -1.7813652843236922, + "reward_std": 7.045272636413574, + "rewards/get_chromagram_reward": 0.6196362137794494, + "rewards/get_chromagram_reward_std": 0.112163445353508, + "rewards/get_intelligibility_reward": -5.94050749540329, + "rewards/get_intelligibility_reward_std": 11.014069366455079, + "rewards/get_target_len_reward": -0.02322419285774231, + "rewards/get_target_len_reward_std": 0.07357236295938492, + "step": 5940 + }, + { + "advantages": -4.954636096954346e-08, + "advantages_std": 1.5950510263442994, + "clip_ratio": 0.0, + "completion_length": 86.2148826599121, + "epoch": 4.474436090225564, + "grad_norm": 12.9375, + "kl": 0.33923769295215606, + "learning_rate": 2.7631578947368424e-06, + "loss": 0.0401, + "num_tokens": 29219031.0, + "reward": -1.6365281403064729, + "reward_std": 6.434076976776123, + "rewards/get_chromagram_reward": 0.6088695049285888, + "rewards/get_chromagram_reward_std": 0.12716799676418306, + "rewards/get_intelligibility_reward": -5.496372726559639, + "rewards/get_intelligibility_reward_std": 9.85527925491333, + "rewards/get_target_len_reward": -0.02208081311546266, + "rewards/get_target_len_reward_std": 0.05841316077858209, + "step": 5950 + }, + { + "advantages": -9.988745290456791e-07, + "advantages_std": 1.6721760392189027, + "clip_ratio": 0.0, + "completion_length": 87.43869171142578, + "epoch": 4.481954887218045, + "grad_norm": 205.0, + "kl": 0.35931061804294584, + "learning_rate": 2.7593984962406017e-06, + "loss": 0.0415, + "num_tokens": 29526634.0, + "reward": -1.6175777792930603, + "reward_std": 7.441208934783935, + "rewards/get_chromagram_reward": 0.6047444045543671, + "rewards/get_chromagram_reward_std": 0.11898068115115165, + "rewards/get_intelligibility_reward": -5.432984733581543, + "rewards/get_intelligibility_reward_std": 11.966805171966552, + "rewards/get_target_len_reward": -0.02449268251657486, + "rewards/get_target_len_reward_std": 0.07319775484502315, + "step": 5960 + }, + { + "advantages": 4.3138864107561404e-07, + "advantages_std": 1.5195866227149963, + "clip_ratio": 0.0, + "completion_length": 86.16488342285156, + "epoch": 4.489473684210527, + "grad_norm": 7.09375, + "kl": 0.3500840783119202, + "learning_rate": 2.755639097744361e-06, + "loss": 0.038, + "num_tokens": 29831733.0, + "reward": -1.5222080022096633, + "reward_std": 6.935330963134765, + "rewards/get_chromagram_reward": 0.6276670038700104, + "rewards/get_chromagram_reward_std": 0.11789945214986801, + "rewards/get_intelligibility_reward": -5.172858917713166, + "rewards/get_intelligibility_reward_std": 10.956598567962647, + "rewards/get_target_len_reward": -0.021431863773614168, + "rewards/get_target_len_reward_std": 0.05911620147526264, + "step": 5970 + }, + { + "advantages": -3.2161674425879026e-07, + "advantages_std": 1.5618364930152893, + "clip_ratio": 0.0, + "completion_length": 85.67976303100586, + "epoch": 4.496992481203008, + "grad_norm": 22.875, + "kl": 0.3210230380296707, + "learning_rate": 2.7518796992481205e-06, + "loss": 0.0369, + "num_tokens": 30135432.0, + "reward": -1.274936705827713, + "reward_std": 6.508914232254028, + "rewards/get_chromagram_reward": 0.6171582043170929, + "rewards/get_chromagram_reward_std": 0.12334928214550019, + "rewards/get_intelligibility_reward": -4.41784838438034, + "rewards/get_intelligibility_reward_std": 10.500092363357544, + "rewards/get_target_len_reward": -0.02411962877959013, + "rewards/get_target_len_reward_std": 0.06979301236569882, + "step": 5980 + }, + { + "advantages": 3.8544338281099045e-07, + "advantages_std": 1.5411863803863526, + "clip_ratio": 0.0, + "completion_length": 86.49702606201171, + "epoch": 4.504511278195489, + "grad_norm": 7.53125, + "kl": 0.333082078397274, + "learning_rate": 2.7481203007518798e-06, + "loss": 0.0345, + "num_tokens": 30440601.0, + "reward": -1.6373848259449004, + "reward_std": 6.695670700073242, + "rewards/get_chromagram_reward": 0.6192308783531189, + "rewards/get_chromagram_reward_std": 0.11601671576499939, + "rewards/get_intelligibility_reward": -5.509091401100159, + "rewards/get_intelligibility_reward_std": 10.489329147338868, + "rewards/get_target_len_reward": -0.02229350171983242, + "rewards/get_target_len_reward_std": 0.06566942296922207, + "step": 5990 + }, + { + "advantages": 4.122653649574204e-08, + "advantages_std": 1.57593115568161, + "clip_ratio": 0.0, + "completion_length": 91.12321472167969, + "epoch": 4.51203007518797, + "grad_norm": 6.6875, + "kl": 0.3484980553388596, + "learning_rate": 2.7443609022556394e-06, + "loss": 0.0396, + "num_tokens": 30758623.0, + "reward": -1.3039451286196708, + "reward_std": 6.552640724182129, + "rewards/get_chromagram_reward": 0.6223575115203858, + "rewards/get_chromagram_reward_std": 0.10774782225489617, + "rewards/get_intelligibility_reward": -4.510845673084259, + "rewards/get_intelligibility_reward_std": 10.53021697998047, + "rewards/get_target_len_reward": -0.023346944618970154, + "rewards/get_target_len_reward_std": 0.06934394463896751, + "step": 6000 + }, + { + "advantages": 1.7446777462737372e-07, + "advantages_std": 1.4582236886024476, + "clip_ratio": 0.0, + "completion_length": 86.5726203918457, + "epoch": 4.519548872180451, + "grad_norm": 49.5, + "kl": 0.45119605511426925, + "learning_rate": 2.7406015037593986e-06, + "loss": 0.0486, + "num_tokens": 31064652.0, + "reward": -1.4632755875587464, + "reward_std": 7.048706197738648, + "rewards/get_chromagram_reward": 0.6153858244419098, + "rewards/get_chromagram_reward_std": 0.12012667879462242, + "rewards/get_intelligibility_reward": -4.986211991310119, + "rewards/get_intelligibility_reward_std": 11.311538219451904, + "rewards/get_target_len_reward": -0.019000414945185184, + "rewards/get_target_len_reward_std": 0.05883214082568884, + "step": 6010 + }, + { + "advantages": 4.943460442774495e-07, + "advantages_std": 1.5501924872398376, + "clip_ratio": 0.0, + "completion_length": 88.29702529907226, + "epoch": 4.527067669172933, + "grad_norm": 13.1875, + "kl": 0.3232778489589691, + "learning_rate": 2.7368421052631583e-06, + "loss": 0.0332, + "num_tokens": 31375492.0, + "reward": -1.3596054553985595, + "reward_std": 6.917938184738159, + "rewards/get_chromagram_reward": 0.6279544234275818, + "rewards/get_chromagram_reward_std": 0.11212313920259476, + "rewards/get_intelligibility_reward": -4.684613796649501, + "rewards/get_intelligibility_reward_std": 11.098008298873902, + "rewards/get_target_len_reward": -0.02215675003826618, + "rewards/get_target_len_reward_std": 0.06159700192511082, + "step": 6020 + }, + { + "advantages": 3.1466286944947795e-07, + "advantages_std": 1.5159233927726745, + "clip_ratio": 0.0, + "completion_length": 85.98511962890625, + "epoch": 4.534586466165414, + "grad_norm": 8.25, + "kl": 6.474253372848034, + "learning_rate": 2.7330827067669175e-06, + "loss": 0.6535, + "num_tokens": 31679996.0, + "reward": -1.3839904189109802, + "reward_std": 6.673179626464844, + "rewards/get_chromagram_reward": 0.6101464450359344, + "rewards/get_chromagram_reward_std": 0.1205007255077362, + "rewards/get_intelligibility_reward": -4.739246428012848, + "rewards/get_intelligibility_reward_std": 10.690727710723877, + "rewards/get_target_len_reward": -0.02287101689726114, + "rewards/get_target_len_reward_std": 0.07206516806036234, + "step": 6030 + }, + { + "advantages": -6.804864085552254e-08, + "advantages_std": 1.549793303012848, + "clip_ratio": 0.0, + "completion_length": 83.12321548461914, + "epoch": 4.542105263157895, + "grad_norm": 155.0, + "kl": 0.35599401146173476, + "learning_rate": 2.729323308270677e-06, + "loss": 0.04, + "num_tokens": 31976212.0, + "reward": -1.6936394423246384, + "reward_std": 6.708538150787353, + "rewards/get_chromagram_reward": 0.6269149005413055, + "rewards/get_chromagram_reward_std": 0.11131934896111488, + "rewards/get_intelligibility_reward": -5.6877417176961895, + "rewards/get_intelligibility_reward_std": 10.37338047027588, + "rewards/get_target_len_reward": -0.02009119251742959, + "rewards/get_target_len_reward_std": 0.056248923763632774, + "step": 6040 + }, + { + "advantages": 3.6309162680936425e-07, + "advantages_std": 1.6578883528709412, + "clip_ratio": 0.0, + "completion_length": 87.18095397949219, + "epoch": 4.549624060150376, + "grad_norm": 9.5625, + "kl": 0.311375567317009, + "learning_rate": 2.7255639097744363e-06, + "loss": 0.0328, + "num_tokens": 32283730.0, + "reward": -1.406548136472702, + "reward_std": 6.439837408065796, + "rewards/get_chromagram_reward": 0.616950273513794, + "rewards/get_chromagram_reward_std": 0.11176617294549943, + "rewards/get_intelligibility_reward": -4.8177523732185366, + "rewards/get_intelligibility_reward_std": 10.18739709854126, + "rewards/get_target_len_reward": -0.01884202305227518, + "rewards/get_target_len_reward_std": 0.05634100623428821, + "step": 6050 + }, + { + "advantages": 4.1847434539832305e-08, + "advantages_std": 1.6048791885375977, + "clip_ratio": 0.0, + "completion_length": 88.63393096923828, + "epoch": 4.557142857142857, + "grad_norm": 6.59375, + "kl": 0.30048550814390185, + "learning_rate": 2.7218045112781956e-06, + "loss": 0.0307, + "num_tokens": 32595796.0, + "reward": -1.2904392518103123, + "reward_std": 6.2870903491973875, + "rewards/get_chromagram_reward": 0.6291167497634887, + "rewards/get_chromagram_reward_std": 0.11226154044270516, + "rewards/get_intelligibility_reward": -4.4783551633358005, + "rewards/get_intelligibility_reward_std": 10.016265392303467, + "rewards/get_target_len_reward": -0.02207921463996172, + "rewards/get_target_len_reward_std": 0.06249589528888464, + "step": 6060 + }, + { + "advantages": -4.0456654346598955e-07, + "advantages_std": 1.509422194957733, + "clip_ratio": 0.0, + "completion_length": 86.4000015258789, + "epoch": 4.564661654135338, + "grad_norm": 6.1875, + "kl": 0.3658337786793709, + "learning_rate": 2.7180451127819552e-06, + "loss": 0.0457, + "num_tokens": 32901569.0, + "reward": -1.341392619907856, + "reward_std": 6.766161584854126, + "rewards/get_chromagram_reward": 0.6196685910224915, + "rewards/get_chromagram_reward_std": 0.114710883051157, + "rewards/get_intelligibility_reward": -4.619689786434174, + "rewards/get_intelligibility_reward_std": 10.862145328521729, + "rewards/get_target_len_reward": -0.024156391993165015, + "rewards/get_target_len_reward_std": 0.07279833741486072, + "step": 6070 + }, + { + "advantages": 2.572933830435886e-07, + "advantages_std": 1.5198671460151671, + "clip_ratio": 0.0, + "completion_length": 86.99702529907226, + "epoch": 4.57218045112782, + "grad_norm": 536.0, + "kl": 0.42363296151161195, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.042, + "num_tokens": 33208757.0, + "reward": -1.2752907037734986, + "reward_std": 6.531996488571167, + "rewards/get_chromagram_reward": 0.6229283511638641, + "rewards/get_chromagram_reward_std": 0.09791875258088112, + "rewards/get_intelligibility_reward": -4.431623411178589, + "rewards/get_intelligibility_reward_std": 10.481385231018066, + "rewards/get_target_len_reward": -0.017176955845206975, + "rewards/get_target_len_reward_std": 0.05317041240632534, + "step": 6080 + }, + { + "advantages": 4.5324364350562975e-08, + "advantages_std": 1.5965524196624756, + "clip_ratio": 0.0, + "completion_length": 88.30119247436524, + "epoch": 4.579699248120301, + "grad_norm": 4.71875, + "kl": 0.8488052666187287, + "learning_rate": 2.710526315789474e-06, + "loss": 0.088, + "num_tokens": 33518832.0, + "reward": -1.3742637276649474, + "reward_std": 6.573501634597778, + "rewards/get_chromagram_reward": 0.6275453746318818, + "rewards/get_chromagram_reward_std": 0.10700947791337967, + "rewards/get_intelligibility_reward": -4.729785847663879, + "rewards/get_intelligibility_reward_std": 10.549948787689209, + "rewards/get_target_len_reward": -0.020550532080233098, + "rewards/get_target_len_reward_std": 0.058156965486705306, + "step": 6090 + }, + { + "advantages": 2.0836791065903525e-07, + "advantages_std": 1.679631507396698, + "clip_ratio": 0.0, + "completion_length": 86.75595321655274, + "epoch": 4.587218045112782, + "grad_norm": 9.0, + "kl": 0.3424948573112488, + "learning_rate": 2.7067669172932333e-06, + "loss": 0.0378, + "num_tokens": 33825338.0, + "reward": -1.4295830607414246, + "reward_std": 6.916474342346191, + "rewards/get_chromagram_reward": 0.6332552254199981, + "rewards/get_chromagram_reward_std": 0.11635537669062615, + "rewards/get_intelligibility_reward": -4.903472948074341, + "rewards/get_intelligibility_reward_std": 11.094056224822998, + "rewards/get_target_len_reward": -0.018531178031116723, + "rewards/get_target_len_reward_std": 0.05028481315821409, + "step": 6100 + }, + { + "advantages": 4.855295596684073e-08, + "advantages_std": 1.5576176047325134, + "clip_ratio": 0.0, + "completion_length": 85.89881134033203, + "epoch": 4.594736842105263, + "grad_norm": 7.71875, + "kl": 0.3236625760793686, + "learning_rate": 2.703007518796993e-06, + "loss": 0.0392, + "num_tokens": 34128529.0, + "reward": -1.6338713705539702, + "reward_std": 6.288772249221802, + "rewards/get_chromagram_reward": 0.6248431921005249, + "rewards/get_chromagram_reward_std": 0.10880339443683625, + "rewards/get_intelligibility_reward": -5.505643081665039, + "rewards/get_intelligibility_reward_std": 9.724865436553955, + "rewards/get_target_len_reward": -0.020813790801912545, + "rewards/get_target_len_reward_std": 0.06379029210656881, + "step": 6110 + }, + { + "advantages": -1.3758738646174606e-07, + "advantages_std": 1.5842162370681763, + "clip_ratio": 0.0, + "completion_length": 88.39940719604492, + "epoch": 4.602255639097744, + "grad_norm": 8.5625, + "kl": 0.3194336831569672, + "learning_rate": 2.699248120300752e-06, + "loss": 0.0432, + "num_tokens": 34439010.0, + "reward": -1.4361367881298066, + "reward_std": 6.373421096801758, + "rewards/get_chromagram_reward": 0.6246614634990693, + "rewards/get_chromagram_reward_std": 0.11062444038689137, + "rewards/get_intelligibility_reward": -4.907348370552063, + "rewards/get_intelligibility_reward_std": 10.147270202636719, + "rewards/get_target_len_reward": -0.025723107066005467, + "rewards/get_target_len_reward_std": 0.08910752348601818, + "step": 6120 + }, + { + "advantages": 3.765026921342951e-07, + "advantages_std": 1.5026398301124573, + "clip_ratio": 0.0, + "completion_length": 90.22500152587891, + "epoch": 4.609774436090225, + "grad_norm": 10.625, + "kl": 0.31463173031806946, + "learning_rate": 2.695488721804512e-06, + "loss": 0.0356, + "num_tokens": 34754401.0, + "reward": -1.5472688972949982, + "reward_std": 6.971684503555298, + "rewards/get_chromagram_reward": 0.6299036264419555, + "rewards/get_chromagram_reward_std": 0.11574003919959068, + "rewards/get_intelligibility_reward": -5.250876641273498, + "rewards/get_intelligibility_reward_std": 11.020786952972411, + "rewards/get_target_len_reward": -0.020833592116832732, + "rewards/get_target_len_reward_std": 0.060291562043130395, + "step": 6130 + }, + { + "advantages": -3.4719706292207776e-07, + "advantages_std": 1.5989922523498534, + "clip_ratio": 0.0, + "completion_length": 86.1327407836914, + "epoch": 4.617293233082707, + "grad_norm": 8.1875, + "kl": 0.3493267551064491, + "learning_rate": 2.691729323308271e-06, + "loss": 0.0382, + "num_tokens": 35059092.0, + "reward": -1.512378105521202, + "reward_std": 6.478836536407471, + "rewards/get_chromagram_reward": 0.6267882108688354, + "rewards/get_chromagram_reward_std": 0.1290317542850971, + "rewards/get_intelligibility_reward": -5.140925347805023, + "rewards/get_intelligibility_reward_std": 10.085025024414062, + "rewards/get_target_len_reward": -0.022996900044381617, + "rewards/get_target_len_reward_std": 0.06523961815983056, + "step": 6140 + }, + { + "advantages": -5.173186604423563e-07, + "advantages_std": 1.6256378650665284, + "clip_ratio": 0.0, + "completion_length": 85.16726303100586, + "epoch": 4.624812030075188, + "grad_norm": 4.875, + "kl": 1.8388477712869644, + "learning_rate": 2.6879699248120307e-06, + "loss": 0.1896, + "num_tokens": 35360995.0, + "reward": -1.585798019170761, + "reward_std": 6.935663890838623, + "rewards/get_chromagram_reward": 0.6336425006389618, + "rewards/get_chromagram_reward_std": 0.11773469522595406, + "rewards/get_intelligibility_reward": -5.3694363832473755, + "rewards/get_intelligibility_reward_std": 10.936893081665039, + "rewards/get_target_len_reward": -0.02159978710114956, + "rewards/get_target_len_reward_std": 0.051993397623300554, + "step": 6150 + }, + { + "advantages": 3.2732882999653156e-07, + "advantages_std": 1.5918954968452455, + "clip_ratio": 0.0, + "completion_length": 84.12916870117188, + "epoch": 4.632330827067669, + "grad_norm": 7.65625, + "kl": 0.3650706380605698, + "learning_rate": 2.68421052631579e-06, + "loss": 0.0391, + "num_tokens": 35660326.0, + "reward": -1.4566324774175883, + "reward_std": 6.758801698684692, + "rewards/get_chromagram_reward": 0.6240825772285461, + "rewards/get_chromagram_reward_std": 0.12061264365911484, + "rewards/get_intelligibility_reward": -4.972877359390258, + "rewards/get_intelligibility_reward_std": 10.724132823944093, + "rewards/get_target_len_reward": -0.021102200075984002, + "rewards/get_target_len_reward_std": 0.0565574087202549, + "step": 6160 + }, + { + "advantages": 1.2367963790893556e-07, + "advantages_std": 1.581169807910919, + "clip_ratio": 0.0, + "completion_length": 86.17619171142579, + "epoch": 4.63984962406015, + "grad_norm": 16.5, + "kl": 5.859141428768635, + "learning_rate": 2.6804511278195487e-06, + "loss": 0.5881, + "num_tokens": 35964356.0, + "reward": -1.3016383200883865, + "reward_std": 6.410171842575073, + "rewards/get_chromagram_reward": 0.6194089889526367, + "rewards/get_chromagram_reward_std": 0.12417329400777817, + "rewards/get_intelligibility_reward": -4.501974666118622, + "rewards/get_intelligibility_reward_std": 10.251604223251343, + "rewards/get_target_len_reward": -0.022349087800830603, + "rewards/get_target_len_reward_std": 0.05504023898392916, + "step": 6170 + }, + { + "advantages": 3.129243957289418e-07, + "advantages_std": 1.5140856862068177, + "clip_ratio": 0.0, + "completion_length": 90.33214416503907, + "epoch": 4.647368421052631, + "grad_norm": 68.0, + "kl": 0.35507272034883497, + "learning_rate": 2.6766917293233088e-06, + "loss": 0.0364, + "num_tokens": 36279969.0, + "reward": -1.4424680143594741, + "reward_std": 6.719956970214843, + "rewards/get_chromagram_reward": 0.6270939588546753, + "rewards/get_chromagram_reward_std": 0.11368402689695359, + "rewards/get_intelligibility_reward": -4.934299838542938, + "rewards/get_intelligibility_reward_std": 10.716573667526244, + "rewards/get_target_len_reward": -0.020197873562574388, + "rewards/get_target_len_reward_std": 0.04989261887967587, + "step": 6180 + }, + { + "advantages": -2.2873283924695898e-07, + "advantages_std": 1.5742897629737853, + "clip_ratio": 0.0, + "completion_length": 84.81488265991212, + "epoch": 4.654887218045113, + "grad_norm": 16.875, + "kl": 0.28883529752492904, + "learning_rate": 2.6729323308270676e-06, + "loss": 0.0323, + "num_tokens": 36581099.0, + "reward": -1.557493907213211, + "reward_std": 6.904000568389892, + "rewards/get_chromagram_reward": 0.6277475774288177, + "rewards/get_chromagram_reward_std": 0.10811701565980911, + "rewards/get_intelligibility_reward": -5.280087733268738, + "rewards/get_intelligibility_reward_std": 11.021557521820068, + "rewards/get_target_len_reward": -0.02014122884720564, + "rewards/get_target_len_reward_std": 0.05904182381927967, + "step": 6190 + }, + { + "advantages": 1.4007091770906753e-07, + "advantages_std": 1.486948847770691, + "clip_ratio": 0.0, + "completion_length": 87.18452606201171, + "epoch": 4.662406015037594, + "grad_norm": 5.53125, + "kl": 0.35808763206005095, + "learning_rate": 2.669172932330827e-06, + "loss": 0.0396, + "num_tokens": 36888822.0, + "reward": -1.5488356798887253, + "reward_std": 6.3901426792144775, + "rewards/get_chromagram_reward": 0.6087078809738159, + "rewards/get_chromagram_reward_std": 0.1133020430803299, + "rewards/get_intelligibility_reward": -5.235795629024506, + "rewards/get_intelligibility_reward_std": 10.033557605743407, + "rewards/get_target_len_reward": -0.01941908346489072, + "rewards/get_target_len_reward_std": 0.05598939694464207, + "step": 6200 + }, + { + "advantages": 1.9818544743088752e-07, + "advantages_std": 1.7082177996635437, + "clip_ratio": 0.0, + "completion_length": 83.97797775268555, + "epoch": 4.669924812030075, + "grad_norm": 6.34375, + "kl": 0.35583060383796694, + "learning_rate": 2.6654135338345864e-06, + "loss": 0.0364, + "num_tokens": 37186803.0, + "reward": -1.5818522050976753, + "reward_std": 6.408353328704834, + "rewards/get_chromagram_reward": 0.6205709517002106, + "rewards/get_chromagram_reward_std": 0.11420291811227798, + "rewards/get_intelligibility_reward": -5.345265340805054, + "rewards/get_intelligibility_reward_std": 10.000323915481568, + "rewards/get_target_len_reward": -0.02086184676736593, + "rewards/get_target_len_reward_std": 0.04689359571784735, + "step": 6210 + }, + { + "advantages": -1.7484029939396352e-07, + "advantages_std": 1.533245360851288, + "clip_ratio": 0.0, + "completion_length": 87.19404983520508, + "epoch": 4.677443609022556, + "grad_norm": 5.75, + "kl": 0.3654496863484383, + "learning_rate": 2.661654135338346e-06, + "loss": 0.037, + "num_tokens": 37494393.0, + "reward": -1.2985429098829626, + "reward_std": 6.674247598648071, + "rewards/get_chromagram_reward": 0.6185528457164764, + "rewards/get_chromagram_reward_std": 0.11509222164750099, + "rewards/get_intelligibility_reward": -4.494230937957764, + "rewards/get_intelligibility_reward_std": 10.719346714019775, + "rewards/get_target_len_reward": -0.019950428698211908, + "rewards/get_target_len_reward_std": 0.04983298964798451, + "step": 6220 + }, + { + "advantages": -4.867712846134964e-07, + "advantages_std": 1.6637269616127015, + "clip_ratio": 0.0, + "completion_length": 84.39404983520508, + "epoch": 4.684962406015037, + "grad_norm": 13.0625, + "kl": 0.40054383873939514, + "learning_rate": 2.6578947368421053e-06, + "loss": 0.0464, + "num_tokens": 37793558.0, + "reward": -1.656310772895813, + "reward_std": 6.542108106613159, + "rewards/get_chromagram_reward": 0.6114983022212982, + "rewards/get_chromagram_reward_std": 0.10526356026530266, + "rewards/get_intelligibility_reward": -5.55853419303894, + "rewards/get_intelligibility_reward_std": 10.181151390075684, + "rewards/get_target_len_reward": -0.021896019019186495, + "rewards/get_target_len_reward_std": 0.06076169461011886, + "step": 6230 + }, + { + "advantages": -2.510845732217604e-07, + "advantages_std": 1.5965062618255614, + "clip_ratio": 0.0, + "completion_length": 89.59166793823242, + "epoch": 4.692481203007519, + "grad_norm": 8.875, + "kl": 0.3339507460594177, + "learning_rate": 2.654135338345865e-06, + "loss": 0.0326, + "num_tokens": 38107248.0, + "reward": -1.2790717422962188, + "reward_std": 6.208766460418701, + "rewards/get_chromagram_reward": 0.6216834604740142, + "rewards/get_chromagram_reward_std": 0.11403061151504516, + "rewards/get_intelligibility_reward": -4.439843034744262, + "rewards/get_intelligibility_reward_std": 9.951672554016113, + "rewards/get_target_len_reward": -0.019055431988090277, + "rewards/get_target_len_reward_std": 0.04317720346152783, + "step": 6240 + }, + { + "advantages": 7.944802661086215e-07, + "advantages_std": 1.623925805091858, + "clip_ratio": 0.0, + "completion_length": 82.06845397949219, + "epoch": 4.7, + "grad_norm": 37.25, + "kl": 0.3709164083003998, + "learning_rate": 2.650375939849624e-06, + "loss": 0.0414, + "num_tokens": 38400768.0, + "reward": -1.6665802896022797, + "reward_std": 6.8437591075897215, + "rewards/get_chromagram_reward": 0.6265310943126678, + "rewards/get_chromagram_reward_std": 0.11978982761502266, + "rewards/get_intelligibility_reward": -5.603765249252319, + "rewards/get_intelligibility_reward_std": 10.746640014648438, + "rewards/get_target_len_reward": -0.022506364062428473, + "rewards/get_target_len_reward_std": 0.05831002295017242, + "step": 6250 + }, + { + "advantages": 1.7372271798876682e-07, + "advantages_std": 1.5418607473373414, + "clip_ratio": 0.0, + "completion_length": 89.39166870117188, + "epoch": 4.707518796992481, + "grad_norm": 4416.0, + "kl": 0.570494931936264, + "learning_rate": 2.6466165413533834e-06, + "loss": 0.0596, + "num_tokens": 38714090.0, + "reward": -1.370600515604019, + "reward_std": 6.201184606552124, + "rewards/get_chromagram_reward": 0.6187590003013611, + "rewards/get_chromagram_reward_std": 0.10438089221715927, + "rewards/get_intelligibility_reward": -4.716014695167542, + "rewards/get_intelligibility_reward_std": 9.873149538040161, + "rewards/get_target_len_reward": -0.014545533526688813, + "rewards/get_target_len_reward_std": 0.04638975989073515, + "step": 6260 + }, + { + "advantages": 2.384185914472425e-07, + "advantages_std": 1.533749508857727, + "clip_ratio": 0.0, + "completion_length": 85.84940719604492, + "epoch": 4.715037593984962, + "grad_norm": 80.5, + "kl": 1.3595893025398254, + "learning_rate": 2.642857142857143e-06, + "loss": 0.1384, + "num_tokens": 39018557.0, + "reward": -1.3898645401000977, + "reward_std": 6.5797117233276365, + "rewards/get_chromagram_reward": 0.6220493018627167, + "rewards/get_chromagram_reward_std": 0.1071101889014244, + "rewards/get_intelligibility_reward": -4.771307897567749, + "rewards/get_intelligibility_reward_std": 10.541376209259033, + "rewards/get_target_len_reward": -0.020334663148969413, + "rewards/get_target_len_reward_std": 0.05680564884096384, + "step": 6270 + }, + { + "advantages": -1.1126200334388159e-07, + "advantages_std": 1.5531922578811646, + "clip_ratio": 0.0, + "completion_length": 84.42619400024414, + "epoch": 4.722556390977443, + "grad_norm": 17.25, + "kl": 0.35954761505126953, + "learning_rate": 2.6390977443609022e-06, + "loss": 0.0373, + "num_tokens": 39318340.0, + "reward": -1.4835880193859339, + "reward_std": 6.443195438385009, + "rewards/get_chromagram_reward": 0.6273998200893403, + "rewards/get_chromagram_reward_std": 0.11976640075445175, + "rewards/get_intelligibility_reward": -5.056619435548782, + "rewards/get_intelligibility_reward_std": 9.997103500366212, + "rewards/get_target_len_reward": -0.02154421918094158, + "rewards/get_target_len_reward_std": 0.053043334558606145, + "step": 6280 + }, + { + "advantages": -2.135833483407623e-08, + "advantages_std": 1.5955830335617065, + "clip_ratio": 0.0, + "completion_length": 90.60833511352538, + "epoch": 4.730075187969925, + "grad_norm": 5.3125, + "kl": 0.3239411249756813, + "learning_rate": 2.635338345864662e-06, + "loss": 0.0357, + "num_tokens": 39635641.0, + "reward": -0.942909163236618, + "reward_std": 6.630526304244995, + "rewards/get_chromagram_reward": 0.6229256749153137, + "rewards/get_chromagram_reward_std": 0.10916498303413391, + "rewards/get_intelligibility_reward": -3.4299890637397765, + "rewards/get_intelligibility_reward_std": 11.004190635681152, + "rewards/get_target_len_reward": -0.021664107311517, + "rewards/get_target_len_reward_std": 0.06791071593761444, + "step": 6290 + }, + { + "advantages": 1.9532938999589078e-07, + "advantages_std": 1.6577616095542909, + "clip_ratio": 0.0, + "completion_length": 85.73452377319336, + "epoch": 4.737593984962406, + "grad_norm": 6.90625, + "kl": 0.37113538682460784, + "learning_rate": 2.631578947368421e-06, + "loss": 0.0406, + "num_tokens": 39938723.0, + "reward": -1.636850079894066, + "reward_std": 6.650091171264648, + "rewards/get_chromagram_reward": 0.6210036218166352, + "rewards/get_chromagram_reward_std": 0.11724439710378647, + "rewards/get_intelligibility_reward": -5.510291111469269, + "rewards/get_intelligibility_reward_std": 10.393040084838868, + "rewards/get_target_len_reward": -0.021262429282069208, + "rewards/get_target_len_reward_std": 0.05347590520977974, + "step": 6300 + }, + { + "advantages": 1.6291936191237256e-07, + "advantages_std": 1.6182387113571166, + "clip_ratio": 0.0, + "completion_length": 87.6976219177246, + "epoch": 4.745112781954887, + "grad_norm": 7968.0, + "kl": 1.5957046091556548, + "learning_rate": 2.6278195488721808e-06, + "loss": 0.1626, + "num_tokens": 40247825.0, + "reward": -1.2495031118392945, + "reward_std": 6.350716972351075, + "rewards/get_chromagram_reward": 0.606559020280838, + "rewards/get_chromagram_reward_std": 0.12095492407679558, + "rewards/get_intelligibility_reward": -4.336573672294617, + "rewards/get_intelligibility_reward_std": 10.243553924560548, + "rewards/get_target_len_reward": -0.018494523130357265, + "rewards/get_target_len_reward_std": 0.04909849762916565, + "step": 6310 + }, + { + "advantages": 3.113101016083419e-07, + "advantages_std": 1.5516540169715882, + "clip_ratio": 0.0, + "completion_length": 86.06964416503907, + "epoch": 4.752631578947368, + "grad_norm": 6.625, + "kl": 0.9928636848926544, + "learning_rate": 2.62406015037594e-06, + "loss": 0.1056, + "num_tokens": 40551708.0, + "reward": -1.4838507711887359, + "reward_std": 6.722097969055175, + "rewards/get_chromagram_reward": 0.6187690258026123, + "rewards/get_chromagram_reward_std": 0.11046081259846688, + "rewards/get_intelligibility_reward": -5.0466511964797975, + "rewards/get_intelligibility_reward_std": 10.710439920425415, + "rewards/get_target_len_reward": -0.023669831547886135, + "rewards/get_target_len_reward_std": 0.06956328004598618, + "step": 6320 + }, + { + "advantages": -2.3916364231268974e-07, + "advantages_std": 1.5447567343711852, + "clip_ratio": 0.0, + "completion_length": 84.32024002075195, + "epoch": 4.760150375939849, + "grad_norm": 7.53125, + "kl": 0.31202267557382585, + "learning_rate": 2.6203007518796996e-06, + "loss": 0.0355, + "num_tokens": 40850743.0, + "reward": -1.411372572183609, + "reward_std": 6.221960210800171, + "rewards/get_chromagram_reward": 0.6327088832855224, + "rewards/get_chromagram_reward_std": 0.11209949627518653, + "rewards/get_intelligibility_reward": -4.8436802387237545, + "rewards/get_intelligibility_reward_std": 9.854103612899781, + "rewards/get_target_len_reward": -0.023146109841763973, + "rewards/get_target_len_reward_std": 0.06779935285449028, + "step": 6330 + }, + { + "advantages": -4.929800956343655e-07, + "advantages_std": 1.4955716013908387, + "clip_ratio": 0.0, + "completion_length": 85.45416793823242, + "epoch": 4.767669172932331, + "grad_norm": 6.1875, + "kl": 0.3646396204829216, + "learning_rate": 2.616541353383459e-06, + "loss": 0.0451, + "num_tokens": 41152837.0, + "reward": -1.8766667008399964, + "reward_std": 6.958877801895142, + "rewards/get_chromagram_reward": 0.6213249683380127, + "rewards/get_chromagram_reward_std": 0.1146389216184616, + "rewards/get_intelligibility_reward": -6.225960445404053, + "rewards/get_intelligibility_reward_std": 10.729834365844727, + "rewards/get_target_len_reward": -0.02536428887397051, + "rewards/get_target_len_reward_std": 0.07496323771774768, + "step": 6340 + }, + { + "advantages": 4.221998040065955e-09, + "advantages_std": 1.5300183176994324, + "clip_ratio": 0.0, + "completion_length": 85.9511932373047, + "epoch": 4.775187969924812, + "grad_norm": 23.5, + "kl": 1.3359659627079963, + "learning_rate": 2.6127819548872185e-06, + "loss": 0.1377, + "num_tokens": 41456485.0, + "reward": -1.5328714907169343, + "reward_std": 6.626535129547119, + "rewards/get_chromagram_reward": 0.615748256444931, + "rewards/get_chromagram_reward_std": 0.11465367525815964, + "rewards/get_intelligibility_reward": -5.193982696533203, + "rewards/get_intelligibility_reward_std": 10.48976697921753, + "rewards/get_target_len_reward": -0.020379604259505867, + "rewards/get_target_len_reward_std": 0.05834213700145483, + "step": 6350 + }, + { + "advantages": 1.7310182585106304e-07, + "advantages_std": 1.5485412716865539, + "clip_ratio": 0.0, + "completion_length": 89.55119171142579, + "epoch": 4.782706766917293, + "grad_norm": 6.75, + "kl": 0.31952681094408036, + "learning_rate": 2.6090225563909777e-06, + "loss": 0.0336, + "num_tokens": 41770739.0, + "reward": -1.2756139397621156, + "reward_std": 6.977913093566895, + "rewards/get_chromagram_reward": 0.6330669701099396, + "rewards/get_chromagram_reward_std": 0.10896242782473564, + "rewards/get_intelligibility_reward": -4.439062762260437, + "rewards/get_intelligibility_reward_std": 11.30651569366455, + "rewards/get_target_len_reward": -0.020845694560557605, + "rewards/get_target_len_reward_std": 0.058593994937837124, + "step": 6360 + }, + { + "advantages": 3.568828176980787e-07, + "advantages_std": 1.5717073440551759, + "clip_ratio": 0.0, + "completion_length": 88.12797775268555, + "epoch": 4.790225563909774, + "grad_norm": 5.46875, + "kl": 0.27634538114070895, + "learning_rate": 2.605263157894737e-06, + "loss": 0.0345, + "num_tokens": 42079458.0, + "reward": -1.7293182492256165, + "reward_std": 6.731516885757446, + "rewards/get_chromagram_reward": 0.6031282305717468, + "rewards/get_chromagram_reward_std": 0.10961822122335434, + "rewards/get_intelligibility_reward": -5.771774411201477, + "rewards/get_intelligibility_reward_std": 10.530457973480225, + "rewards/get_target_len_reward": -0.019308150000870227, + "rewards/get_target_len_reward_std": 0.06329579185694456, + "step": 6370 + }, + { + "advantages": 5.245208821058611e-07, + "advantages_std": 1.6197218537330627, + "clip_ratio": 0.0, + "completion_length": 88.53095321655273, + "epoch": 4.797744360902255, + "grad_norm": 5.78125, + "kl": 0.29962356984615324, + "learning_rate": 2.6015037593984966e-06, + "loss": 0.0278, + "num_tokens": 42390738.0, + "reward": -1.6248762607574463, + "reward_std": 6.814621114730835, + "rewards/get_chromagram_reward": 0.6148935854434967, + "rewards/get_chromagram_reward_std": 0.11921465694904328, + "rewards/get_intelligibility_reward": -5.47128050327301, + "rewards/get_intelligibility_reward_std": 10.650694179534913, + "rewards/get_target_len_reward": -0.018241762649267912, + "rewards/get_target_len_reward_std": 0.05343018397688866, + "step": 6380 + }, + { + "advantages": 3.583729402123481e-07, + "advantages_std": 1.600301730632782, + "clip_ratio": 0.0, + "completion_length": 86.42619323730469, + "epoch": 4.8052631578947365, + "grad_norm": 8.5625, + "kl": 6.743069607019424, + "learning_rate": 2.597744360902256e-06, + "loss": 0.674, + "num_tokens": 42696242.0, + "reward": -1.2496988654136658, + "reward_std": 6.139151859283447, + "rewards/get_chromagram_reward": 0.6119321584701538, + "rewards/get_chromagram_reward_std": 0.11752462461590767, + "rewards/get_intelligibility_reward": -4.340754376351834, + "rewards/get_intelligibility_reward_std": 9.796479940414429, + "rewards/get_target_len_reward": -0.020274097472429274, + "rewards/get_target_len_reward_std": 0.051021433994174005, + "step": 6390 + }, + { + "advantages": 5.687276654953166e-07, + "advantages_std": 1.5225232601165772, + "clip_ratio": 0.0, + "completion_length": 86.62500076293945, + "epoch": 4.812781954887218, + "grad_norm": 5.53125, + "kl": 0.3688139796257019, + "learning_rate": 2.5939849624060154e-06, + "loss": 0.0437, + "num_tokens": 43002051.0, + "reward": -1.2026754826307298, + "reward_std": 6.638499164581299, + "rewards/get_chromagram_reward": 0.6256828069686889, + "rewards/get_chromagram_reward_std": 0.11374538168311119, + "rewards/get_intelligibility_reward": -4.210258966684341, + "rewards/get_intelligibility_reward_std": 10.757501363754272, + "rewards/get_target_len_reward": -0.023450003052130342, + "rewards/get_target_len_reward_std": 0.07306296471506357, + "step": 6400 + }, + { + "advantages": 1.3063351693709536e-07, + "advantages_std": 1.5896078944206238, + "clip_ratio": 0.0, + "completion_length": 84.06428756713868, + "epoch": 4.820300751879699, + "grad_norm": 8.0, + "kl": 0.2945214152336121, + "learning_rate": 2.5902255639097747e-06, + "loss": 0.029, + "num_tokens": 43301153.0, + "reward": -1.2195492424070835, + "reward_std": 6.1925302028656, + "rewards/get_chromagram_reward": 0.6336007118225098, + "rewards/get_chromagram_reward_std": 0.11149605363607407, + "rewards/get_intelligibility_reward": -4.271958157420158, + "rewards/get_intelligibility_reward_std": 9.869881725311279, + "rewards/get_target_len_reward": -0.02029011370614171, + "rewards/get_target_len_reward_std": 0.048033690080046654, + "step": 6410 + }, + { + "advantages": -3.386288867091025e-07, + "advantages_std": 1.5211575150489807, + "clip_ratio": 0.0, + "completion_length": 85.46607513427735, + "epoch": 4.8278195488721805, + "grad_norm": 8.3125, + "kl": 0.5205657571554184, + "learning_rate": 2.5864661654135343e-06, + "loss": 0.0572, + "num_tokens": 43604022.0, + "reward": -1.1797817513346671, + "reward_std": 6.35502986907959, + "rewards/get_chromagram_reward": 0.6147702217102051, + "rewards/get_chromagram_reward_std": 0.10805823653936386, + "rewards/get_intelligibility_reward": -4.129573428630829, + "rewards/get_intelligibility_reward_std": 10.311836242675781, + "rewards/get_target_len_reward": -0.024541809875518083, + "rewards/get_target_len_reward_std": 0.08319154866039753, + "step": 6420 + }, + { + "advantages": 3.0547380305279146e-07, + "advantages_std": 1.6374112010002135, + "clip_ratio": 0.0, + "completion_length": 86.75059661865234, + "epoch": 4.8353383458646615, + "grad_norm": 7.34375, + "kl": 0.42518229633569715, + "learning_rate": 2.5827067669172935e-06, + "loss": 0.0461, + "num_tokens": 43909645.0, + "reward": -1.500042522698641, + "reward_std": 6.227635717391967, + "rewards/get_chromagram_reward": 0.6166976928710938, + "rewards/get_chromagram_reward_std": 0.10093749687075615, + "rewards/get_intelligibility_reward": -5.097601294517517, + "rewards/get_intelligibility_reward_std": 9.62869644165039, + "rewards/get_target_len_reward": -0.019223684445023537, + "rewards/get_target_len_reward_std": 0.054870061576366425, + "step": 6430 + }, + { + "advantages": -2.2687019125555706e-07, + "advantages_std": 1.6094887852668762, + "clip_ratio": 0.0, + "completion_length": 86.8827392578125, + "epoch": 4.8428571428571425, + "grad_norm": 5.875, + "kl": 0.30442375540733335, + "learning_rate": 2.578947368421053e-06, + "loss": 0.0338, + "num_tokens": 44216428.0, + "reward": -1.6521910965442657, + "reward_std": 7.041847658157349, + "rewards/get_chromagram_reward": 0.6428345263004303, + "rewards/get_chromagram_reward_std": 0.11256081908941269, + "rewards/get_intelligibility_reward": -5.579641795158386, + "rewards/get_intelligibility_reward_std": 11.189670085906982, + "rewards/get_target_len_reward": -0.019765730388462542, + "rewards/get_target_len_reward_std": 0.052372989058494565, + "step": 6440 + }, + { + "advantages": -4.147489249817227e-08, + "advantages_std": 1.584288239479065, + "clip_ratio": 0.0, + "completion_length": 84.63809661865234, + "epoch": 4.850375939849624, + "grad_norm": 9.8125, + "kl": 0.3660952433943748, + "learning_rate": 2.5751879699248124e-06, + "loss": 0.0423, + "num_tokens": 44516628.0, + "reward": -1.6733587265014649, + "reward_std": 6.637929773330688, + "rewards/get_chromagram_reward": 0.6172063827514649, + "rewards/get_chromagram_reward_std": 0.11280516609549522, + "rewards/get_intelligibility_reward": -5.615116000175476, + "rewards/get_intelligibility_reward_std": 10.391260719299316, + "rewards/get_target_len_reward": -0.022166123893111945, + "rewards/get_target_len_reward_std": 0.0620707118883729, + "step": 6450 + }, + { + "advantages": 2.9553966918172138e-08, + "advantages_std": 1.5499458074569703, + "clip_ratio": 0.0, + "completion_length": 83.31964416503907, + "epoch": 4.8578947368421055, + "grad_norm": 5.5, + "kl": 0.32719179838895796, + "learning_rate": 2.571428571428571e-06, + "loss": 0.0333, + "num_tokens": 44814027.0, + "reward": -1.6808424234390258, + "reward_std": 6.966293144226074, + "rewards/get_chromagram_reward": 0.6218828916549682, + "rewards/get_chromagram_reward_std": 0.10610488280653954, + "rewards/get_intelligibility_reward": -5.644246053695679, + "rewards/get_intelligibility_reward_std": 11.015114688873291, + "rewards/get_target_len_reward": -0.020163825061172248, + "rewards/get_target_len_reward_std": 0.05375215411186218, + "step": 6460 + }, + { + "advantages": 3.8867194760427994e-07, + "advantages_std": 1.5340965390205383, + "clip_ratio": 0.0, + "completion_length": 83.85119171142578, + "epoch": 4.8654135338345865, + "grad_norm": 5.6875, + "kl": 0.48933424055576324, + "learning_rate": 2.5676691729323313e-06, + "loss": 0.0527, + "num_tokens": 45112541.0, + "reward": -1.62738236784935, + "reward_std": 6.789244747161865, + "rewards/get_chromagram_reward": 0.6253014147281647, + "rewards/get_chromagram_reward_std": 0.12089017108082771, + "rewards/get_intelligibility_reward": -5.4844811916351315, + "rewards/get_intelligibility_reward_std": 10.714595985412597, + "rewards/get_target_len_reward": -0.02296700868755579, + "rewards/get_target_len_reward_std": 0.0656088100746274, + "step": 6470 + }, + { + "advantages": -1.1821588685734241e-07, + "advantages_std": 1.5755035519599914, + "clip_ratio": 0.0, + "completion_length": 88.55357360839844, + "epoch": 4.872932330827068, + "grad_norm": 5.65625, + "kl": 0.33493589907884597, + "learning_rate": 2.56390977443609e-06, + "loss": 0.0415, + "num_tokens": 45424425.0, + "reward": -1.141952557489276, + "reward_std": 6.468104410171509, + "rewards/get_chromagram_reward": 0.6155132055282593, + "rewards/get_chromagram_reward_std": 0.09994912594556808, + "rewards/get_intelligibility_reward": -4.016626697778702, + "rewards/get_intelligibility_reward_std": 10.548732280731201, + "rewards/get_target_len_reward": -0.024744043592363596, + "rewards/get_target_len_reward_std": 0.07314223255962134, + "step": 6480 + }, + { + "advantages": -8.195643630415362e-09, + "advantages_std": 1.5248024225234986, + "clip_ratio": 0.0, + "completion_length": 87.0875015258789, + "epoch": 4.880451127819549, + "grad_norm": 5.6875, + "kl": 3.428301727771759, + "learning_rate": 2.56015037593985e-06, + "loss": 0.3461, + "num_tokens": 45731578.0, + "reward": -1.7025025725364684, + "reward_std": 7.153896188735962, + "rewards/get_chromagram_reward": 0.6351093053817749, + "rewards/get_chromagram_reward_std": 0.10930986404418945, + "rewards/get_intelligibility_reward": -5.719335460662842, + "rewards/get_intelligibility_reward_std": 11.252530097961426, + "rewards/get_target_len_reward": -0.023281274922192098, + "rewards/get_target_len_reward_std": 0.07412473894655705, + "step": 6490 + }, + { + "advantages": 1.7036997057573444e-07, + "advantages_std": 1.540906298160553, + "clip_ratio": 0.0, + "completion_length": 87.96785888671874, + "epoch": 4.88796992481203, + "grad_norm": 6.84375, + "kl": 0.3502262085676193, + "learning_rate": 2.556390977443609e-06, + "loss": 0.0407, + "num_tokens": 46040680.0, + "reward": -1.0871734350919724, + "reward_std": 5.983257627487182, + "rewards/get_chromagram_reward": 0.6318377196788788, + "rewards/get_chromagram_reward_std": 0.11660416722297669, + "rewards/get_intelligibility_reward": -3.868463712930679, + "rewards/get_intelligibility_reward_std": 9.709487009048463, + "rewards/get_target_len_reward": -0.024894051626324652, + "rewards/get_target_len_reward_std": 0.0769047923386097, + "step": 6500 + }, + { + "advantages": 6.072223367681317e-07, + "advantages_std": 1.7283748388290405, + "clip_ratio": 0.0, + "completion_length": 88.71309661865234, + "epoch": 4.895488721804512, + "grad_norm": 6.5625, + "kl": 0.29507723152637483, + "learning_rate": 2.552631578947369e-06, + "loss": 0.0317, + "num_tokens": 46351916.0, + "reward": -1.631992408633232, + "reward_std": 6.526308012008667, + "rewards/get_chromagram_reward": 0.6183791100978852, + "rewards/get_chromagram_reward_std": 0.10845671147108078, + "rewards/get_intelligibility_reward": -5.496023435890675, + "rewards/get_intelligibility_reward_std": 10.073901176452637, + "rewards/get_target_len_reward": -0.018332591652870177, + "rewards/get_target_len_reward_std": 0.05578165017068386, + "step": 6510 + }, + { + "advantages": -3.129243992816555e-08, + "advantages_std": 1.5458887219429016, + "clip_ratio": 0.0, + "completion_length": 88.31904983520508, + "epoch": 4.903007518796993, + "grad_norm": 6.5, + "kl": 0.3002905026078224, + "learning_rate": 2.548872180451128e-06, + "loss": 0.0293, + "num_tokens": 46661545.0, + "reward": -1.633839136362076, + "reward_std": 6.5747246742248535, + "rewards/get_chromagram_reward": 0.620064640045166, + "rewards/get_chromagram_reward_std": 0.10467863827943802, + "rewards/get_intelligibility_reward": -5.506745052337647, + "rewards/get_intelligibility_reward_std": 10.277530574798584, + "rewards/get_target_len_reward": -0.01483667390421033, + "rewards/get_target_len_reward_std": 0.03543837685137987, + "step": 6520 + }, + { + "advantages": 3.6607186615356113e-07, + "advantages_std": 1.5753893375396728, + "clip_ratio": 0.0, + "completion_length": 84.10476226806641, + "epoch": 4.910526315789474, + "grad_norm": 13.1875, + "kl": 0.32883389592170714, + "learning_rate": 2.545112781954888e-06, + "loss": 0.0391, + "num_tokens": 46960276.0, + "reward": -1.2893819987773896, + "reward_std": 6.348278760910034, + "rewards/get_chromagram_reward": 0.6218626320362091, + "rewards/get_chromagram_reward_std": 0.10212240219116211, + "rewards/get_intelligibility_reward": -4.469036054611206, + "rewards/get_intelligibility_reward_std": 10.204508018493652, + "rewards/get_target_len_reward": -0.020972410589456557, + "rewards/get_target_len_reward_std": 0.06330780945718288, + "step": 6530 + }, + { + "advantages": 1.6006330909590362e-07, + "advantages_std": 1.5176787972450256, + "clip_ratio": 0.0, + "completion_length": 89.59047622680664, + "epoch": 4.918045112781955, + "grad_norm": 7.4375, + "kl": 0.3287327170372009, + "learning_rate": 2.5413533834586467e-06, + "loss": 0.0348, + "num_tokens": 47274035.0, + "reward": -1.4367490768432618, + "reward_std": 6.943380355834961, + "rewards/get_chromagram_reward": 0.6169800817966461, + "rewards/get_chromagram_reward_std": 0.10801626220345498, + "rewards/get_intelligibility_reward": -4.91004763841629, + "rewards/get_intelligibility_reward_std": 11.124363040924072, + "rewards/get_target_len_reward": -0.017179496679455043, + "rewards/get_target_len_reward_std": 0.047147853672504424, + "step": 6540 + }, + { + "advantages": -1.2964010949190196e-07, + "advantages_std": 1.4592286229133606, + "clip_ratio": 0.0, + "completion_length": 87.46904907226562, + "epoch": 4.925563909774436, + "grad_norm": 77.0, + "kl": 1.5469397068023683, + "learning_rate": 2.5375939849624063e-06, + "loss": 0.158, + "num_tokens": 47582513.0, + "reward": -1.3547814309597015, + "reward_std": 6.633420515060425, + "rewards/get_chromagram_reward": 0.62143474817276, + "rewards/get_chromagram_reward_std": 0.11483238711953163, + "rewards/get_intelligibility_reward": -4.66466805934906, + "rewards/get_intelligibility_reward_std": 10.740064477920532, + "rewards/get_target_len_reward": -0.02111075660213828, + "rewards/get_target_len_reward_std": 0.05489732790738344, + "step": 6550 + }, + { + "advantages": -9.43740351644351e-08, + "advantages_std": 1.5623414754867553, + "clip_ratio": 0.0, + "completion_length": 86.35833511352538, + "epoch": 4.933082706766918, + "grad_norm": 6.09375, + "kl": 0.2813438355922699, + "learning_rate": 2.5338345864661655e-06, + "loss": 0.0347, + "num_tokens": 47887515.0, + "reward": -1.7280932068824768, + "reward_std": 6.756200218200684, + "rewards/get_chromagram_reward": 0.6194582223892212, + "rewards/get_chromagram_reward_std": 0.11464768573641777, + "rewards/get_intelligibility_reward": -5.78103654384613, + "rewards/get_intelligibility_reward_std": 10.545590496063232, + "rewards/get_target_len_reward": -0.022701161913573742, + "rewards/get_target_len_reward_std": 0.08312043901532888, + "step": 6560 + }, + { + "advantages": 1.3187527301283807e-07, + "advantages_std": 1.5597585439682007, + "clip_ratio": 0.0, + "completion_length": 88.42857284545899, + "epoch": 4.940601503759399, + "grad_norm": 26.0, + "kl": 0.3721115231513977, + "learning_rate": 2.5300751879699247e-06, + "loss": 0.0402, + "num_tokens": 48198543.0, + "reward": -1.2894165456295013, + "reward_std": 6.483512449264526, + "rewards/get_chromagram_reward": 0.6240399837493896, + "rewards/get_chromagram_reward_std": 0.12066565677523614, + "rewards/get_intelligibility_reward": -4.470979905128479, + "rewards/get_intelligibility_reward_std": 10.422832012176514, + "rewards/get_target_len_reward": -0.02130947196856141, + "rewards/get_target_len_reward_std": 0.058992895483970645, + "step": 6570 + }, + { + "advantages": -7.182359979651665e-07, + "advantages_std": 1.524202859401703, + "clip_ratio": 0.0, + "completion_length": 86.0982162475586, + "epoch": 4.94812030075188, + "grad_norm": 5.75, + "kl": 0.32063417583703996, + "learning_rate": 2.5263157894736844e-06, + "loss": 0.0388, + "num_tokens": 48502433.0, + "reward": -1.8312727063894272, + "reward_std": 7.22638783454895, + "rewards/get_chromagram_reward": 0.6144508063793183, + "rewards/get_chromagram_reward_std": 0.10415496379137039, + "rewards/get_intelligibility_reward": -6.086835551261902, + "rewards/get_intelligibility_reward_std": 11.28345012664795, + "rewards/get_target_len_reward": -0.02143293796107173, + "rewards/get_target_len_reward_std": 0.07228237017989159, + "step": 6580 + }, + { + "advantages": 3.88796137418268e-07, + "advantages_std": 1.619350051879883, + "clip_ratio": 0.0, + "completion_length": 85.75714340209962, + "epoch": 4.955639097744361, + "grad_norm": 78.0, + "kl": 0.3856597736477852, + "learning_rate": 2.5225563909774436e-06, + "loss": 0.0452, + "num_tokens": 48805915.0, + "reward": -1.8449522614479066, + "reward_std": 6.886161613464355, + "rewards/get_chromagram_reward": 0.6164387345314026, + "rewards/get_chromagram_reward_std": 0.11670946702361107, + "rewards/get_intelligibility_reward": -6.128629541397094, + "rewards/get_intelligibility_reward_std": 10.64172887802124, + "rewards/get_target_len_reward": -0.022665658872574566, + "rewards/get_target_len_reward_std": 0.06206500325351953, + "step": 6590 + }, + { + "advantages": -1.877546353057369e-07, + "advantages_std": 1.624682652950287, + "clip_ratio": 0.0, + "completion_length": 85.98928680419922, + "epoch": 4.963157894736842, + "grad_norm": 87.0, + "kl": 0.3426424890756607, + "learning_rate": 2.5187969924812033e-06, + "loss": 0.038, + "num_tokens": 49110315.0, + "reward": -1.4495089689269662, + "reward_std": 6.569057416915894, + "rewards/get_chromagram_reward": 0.6227473258972168, + "rewards/get_chromagram_reward_std": 0.1206977717578411, + "rewards/get_intelligibility_reward": -4.950021553039551, + "rewards/get_intelligibility_reward_std": 10.386702346801759, + "rewards/get_target_len_reward": -0.02125244690105319, + "rewards/get_target_len_reward_std": 0.06872284896671772, + "step": 6600 + }, + { + "advantages": -1.185884169530027e-07, + "advantages_std": 1.5967671275138855, + "clip_ratio": 0.0, + "completion_length": 82.43988265991212, + "epoch": 4.970676691729324, + "grad_norm": 7.15625, + "kl": 0.7244855403900147, + "learning_rate": 2.5150375939849625e-06, + "loss": 0.0808, + "num_tokens": 49404204.0, + "reward": -1.7379024147987365, + "reward_std": 6.669083547592163, + "rewards/get_chromagram_reward": 0.6282551884651184, + "rewards/get_chromagram_reward_std": 0.11871347054839135, + "rewards/get_intelligibility_reward": -5.818054795265198, + "rewards/get_intelligibility_reward_std": 10.382137298583984, + "rewards/get_target_len_reward": -0.023907260969281197, + "rewards/get_target_len_reward_std": 0.07197411060333252, + "step": 6610 + }, + { + "advantages": -4.0841601247620927e-07, + "advantages_std": 1.6466867446899414, + "clip_ratio": 0.0, + "completion_length": 88.15833435058593, + "epoch": 4.978195488721805, + "grad_norm": 23.5, + "kl": 0.29043773263692857, + "learning_rate": 2.511278195488722e-06, + "loss": 0.0308, + "num_tokens": 49714079.0, + "reward": -1.126624122262001, + "reward_std": 6.541045331954956, + "rewards/get_chromagram_reward": 0.6227805554866791, + "rewards/get_chromagram_reward_std": 0.11063579246401786, + "rewards/get_intelligibility_reward": -3.984594986587763, + "rewards/get_intelligibility_reward_std": 10.58214750289917, + "rewards/get_target_len_reward": -0.018057726556435227, + "rewards/get_target_len_reward_std": 0.0481023607775569, + "step": 6620 + }, + { + "advantages": -2.9206275371507217e-07, + "advantages_std": 1.5873522877693176, + "clip_ratio": 0.0, + "completion_length": 91.16012115478516, + "epoch": 4.985714285714286, + "grad_norm": 8.5, + "kl": 0.36675150841474535, + "learning_rate": 2.5075187969924813e-06, + "loss": 0.039, + "num_tokens": 50032559.0, + "reward": -1.2497528120875359, + "reward_std": 6.907018804550171, + "rewards/get_chromagram_reward": 0.6320128381252289, + "rewards/get_chromagram_reward_std": 0.10616158470511436, + "rewards/get_intelligibility_reward": -4.356342947483062, + "rewards/get_intelligibility_reward_std": 11.155338287353516, + "rewards/get_target_len_reward": -0.02492810133844614, + "rewards/get_target_len_reward_std": 0.07248065434396267, + "step": 6630 + }, + { + "advantages": -2.669791513199016e-07, + "advantages_std": 1.5647882103919983, + "clip_ratio": 0.0, + "completion_length": 89.2732162475586, + "epoch": 4.993233082706767, + "grad_norm": 10.75, + "kl": 0.2904526948928833, + "learning_rate": 2.503759398496241e-06, + "loss": 0.0348, + "num_tokens": 50345461.0, + "reward": -1.317164820432663, + "reward_std": 6.558288669586181, + "rewards/get_chromagram_reward": 0.6253573298454285, + "rewards/get_chromagram_reward_std": 0.10857684016227723, + "rewards/get_intelligibility_reward": -4.557005780935287, + "rewards/get_intelligibility_reward_std": 10.478938770294189, + "rewards/get_target_len_reward": -0.019845707342028618, + "rewards/get_target_len_reward_std": 0.06356870625168085, + "step": 6640 + }, + { + "advantages": -5.700936128505419e-07, + "advantages_std": 1.5285292983055114, + "clip_ratio": 0.0, + "completion_length": 84.10797882080078, + "epoch": 5.001503759398497, + "grad_norm": 306.0, + "kl": 0.5828635230660438, + "learning_rate": 2.5e-06, + "loss": 0.0609, + "num_tokens": 50648514.0, + "reward": -1.8416775107383727, + "reward_std": 7.091966581344605, + "rewards/get_chromagram_reward": 0.6169986069202423, + "rewards/get_chromagram_reward_std": 0.11487487629055977, + "rewards/get_intelligibility_reward": -6.121673631668091, + "rewards/get_intelligibility_reward_std": 10.917494773864746, + "rewards/get_target_len_reward": -0.020357232261449098, + "rewards/get_target_len_reward_std": 0.05068073961883783, + "step": 6650 + }, + { + "advantages": 4.743536479168142e-08, + "advantages_std": 1.649160099029541, + "clip_ratio": 0.0, + "completion_length": 83.18393020629883, + "epoch": 5.009022556390978, + "grad_norm": 8.6875, + "kl": 0.39235475957393645, + "learning_rate": 2.4962406015037594e-06, + "loss": 0.0453, + "num_tokens": 50944468.0, + "reward": -1.8302626073360444, + "reward_std": 6.798886489868164, + "rewards/get_chromagram_reward": 0.6179421901702881, + "rewards/get_chromagram_reward_std": 0.1180720493197441, + "rewards/get_intelligibility_reward": -6.085371446609497, + "rewards/get_intelligibility_reward_std": 10.489519023895264, + "rewards/get_target_len_reward": -0.023358290363103152, + "rewards/get_target_len_reward_std": 0.07059708088636399, + "step": 6660 + }, + { + "advantages": -2.121552803657778e-07, + "advantages_std": 1.5431331992149353, + "clip_ratio": 0.0, + "completion_length": 84.96071548461914, + "epoch": 5.016541353383459, + "grad_norm": 6.75, + "kl": 0.2815273180603981, + "learning_rate": 2.492481203007519e-06, + "loss": 0.0269, + "num_tokens": 51246029.0, + "reward": -1.4928331673145294, + "reward_std": 6.165701866149902, + "rewards/get_chromagram_reward": 0.627123236656189, + "rewards/get_chromagram_reward_std": 0.10546824783086776, + "rewards/get_intelligibility_reward": -5.086357855796814, + "rewards/get_intelligibility_reward_std": 9.6989670753479, + "rewards/get_target_len_reward": -0.019264612533152103, + "rewards/get_target_len_reward_std": 0.049389274418354036, + "step": 6670 + }, + { + "advantages": 5.016724191619915e-07, + "advantages_std": 1.626193630695343, + "clip_ratio": 0.0, + "completion_length": 86.72797775268555, + "epoch": 5.02406015037594, + "grad_norm": 7.03125, + "kl": 0.3750910758972168, + "learning_rate": 2.4887218045112783e-06, + "loss": 0.0397, + "num_tokens": 51552511.0, + "reward": -1.5638932228088378, + "reward_std": 6.7187182903289795, + "rewards/get_chromagram_reward": 0.6333837032318115, + "rewards/get_chromagram_reward_std": 0.12098120152950287, + "rewards/get_intelligibility_reward": -5.302863430976868, + "rewards/get_intelligibility_reward_std": 10.55617914199829, + "rewards/get_target_len_reward": -0.02219974249601364, + "rewards/get_target_len_reward_std": 0.05710374694317579, + "step": 6680 + }, + { + "advantages": 3.1342108570697745e-07, + "advantages_std": 1.5879101514816285, + "clip_ratio": 0.0, + "completion_length": 82.8029769897461, + "epoch": 5.031578947368421, + "grad_norm": 9.0625, + "kl": 0.5160459518432617, + "learning_rate": 2.484962406015038e-06, + "loss": 0.0557, + "num_tokens": 51848678.0, + "reward": -1.152049209177494, + "reward_std": 6.030084133148193, + "rewards/get_chromagram_reward": 0.6250806391239166, + "rewards/get_chromagram_reward_std": 0.12028426826000213, + "rewards/get_intelligibility_reward": -4.054237425327301, + "rewards/get_intelligibility_reward_std": 9.741949558258057, + "rewards/get_target_len_reward": -0.02699065739288926, + "rewards/get_target_len_reward_std": 0.07074901163578033, + "step": 6690 + }, + { + "advantages": -5.6376058399365546e-08, + "advantages_std": 1.5316031932830811, + "clip_ratio": 0.0, + "completion_length": 86.11011962890625, + "epoch": 5.039097744360903, + "grad_norm": 6.21875, + "kl": 0.3129287138581276, + "learning_rate": 2.481203007518797e-06, + "loss": 0.0345, + "num_tokens": 52152743.0, + "reward": -1.6599902629852294, + "reward_std": 6.832069444656372, + "rewards/get_chromagram_reward": 0.6020946443080902, + "rewards/get_chromagram_reward_std": 0.12764331623911856, + "rewards/get_intelligibility_reward": -5.563377809524536, + "rewards/get_intelligibility_reward_std": 10.737880897521972, + "rewards/get_target_len_reward": -0.018687471002340316, + "rewards/get_target_len_reward_std": 0.05601380094885826, + "step": 6700 + }, + { + "advantages": 2.5232635039174054e-07, + "advantages_std": 1.609774398803711, + "clip_ratio": 0.0, + "completion_length": 87.4648826599121, + "epoch": 5.046616541353384, + "grad_norm": 7.96875, + "kl": 0.34581609070301056, + "learning_rate": 2.4774436090225564e-06, + "loss": 0.0382, + "num_tokens": 52461229.0, + "reward": -1.466331911087036, + "reward_std": 6.7329998970031735, + "rewards/get_chromagram_reward": 0.6073729395866394, + "rewards/get_chromagram_reward_std": 0.1180558256804943, + "rewards/get_intelligibility_reward": -4.988929557800293, + "rewards/get_intelligibility_reward_std": 10.727745819091798, + "rewards/get_target_len_reward": -0.017438811622560023, + "rewards/get_target_len_reward_std": 0.04986635074019432, + "step": 6710 + }, + { + "advantages": -7.363657630321541e-08, + "advantages_std": 1.5829906702041625, + "clip_ratio": 0.0, + "completion_length": 81.79226303100586, + "epoch": 5.054135338345865, + "grad_norm": 6.875, + "kl": 0.4496503293514252, + "learning_rate": 2.473684210526316e-06, + "loss": 0.0492, + "num_tokens": 52754446.0, + "reward": -1.3893569886684418, + "reward_std": 6.619420766830444, + "rewards/get_chromagram_reward": 0.6111753046512604, + "rewards/get_chromagram_reward_std": 0.12401786893606186, + "rewards/get_intelligibility_reward": -4.755018162727356, + "rewards/get_intelligibility_reward_std": 10.626482391357422, + "rewards/get_target_len_reward": -0.024227831698954105, + "rewards/get_target_len_reward_std": 0.06835556291043758, + "step": 6720 + }, + { + "advantages": -1.763303600910149e-08, + "advantages_std": 1.6146543741226196, + "clip_ratio": 0.0, + "completion_length": 91.1583351135254, + "epoch": 5.061654135338346, + "grad_norm": 1392.0, + "kl": 0.45773075222969056, + "learning_rate": 2.4699248120300752e-06, + "loss": 0.047, + "num_tokens": 53072797.0, + "reward": -1.0947829756885767, + "reward_std": 6.209275341033935, + "rewards/get_chromagram_reward": 0.6310640692710876, + "rewards/get_chromagram_reward_std": 0.10584187656641006, + "rewards/get_intelligibility_reward": -3.8928360402584077, + "rewards/get_intelligibility_reward_std": 10.099512815475464, + "rewards/get_target_len_reward": -0.022576854890212418, + "rewards/get_target_len_reward_std": 0.07170646525919437, + "step": 6730 + }, + { + "advantages": -8.13106710850775e-07, + "advantages_std": 1.6019715070724487, + "clip_ratio": 0.0, + "completion_length": 84.63809661865234, + "epoch": 5.069172932330827, + "grad_norm": 7.65625, + "kl": 0.3156878113746643, + "learning_rate": 2.466165413533835e-06, + "loss": 0.0365, + "num_tokens": 53373173.0, + "reward": -1.7333171367645264, + "reward_std": 6.950545787811279, + "rewards/get_chromagram_reward": 0.6180140256881714, + "rewards/get_chromagram_reward_std": 0.11169339194893838, + "rewards/get_intelligibility_reward": -5.79854383468628, + "rewards/get_intelligibility_reward_std": 10.903221225738525, + "rewards/get_target_len_reward": -0.019421275705099106, + "rewards/get_target_len_reward_std": 0.06264973357319832, + "step": 6740 + }, + { + "advantages": -1.368423205860836e-07, + "advantages_std": 1.5889723181724549, + "clip_ratio": 0.0, + "completion_length": 85.70952529907227, + "epoch": 5.076691729323309, + "grad_norm": 7.09375, + "kl": 0.3136765375733376, + "learning_rate": 2.462406015037594e-06, + "loss": 0.0334, + "num_tokens": 53676787.0, + "reward": -1.5442959815263748, + "reward_std": 7.033763265609741, + "rewards/get_chromagram_reward": 0.6433658242225647, + "rewards/get_chromagram_reward_std": 0.1127834253013134, + "rewards/get_intelligibility_reward": -5.251438069343567, + "rewards/get_intelligibility_reward_std": 11.205382347106934, + "rewards/get_target_len_reward": -0.024815433658659458, + "rewards/get_target_len_reward_std": 0.061740655824542044, + "step": 6750 + }, + { + "advantages": 4.728635475181875e-07, + "advantages_std": 1.6415120124816895, + "clip_ratio": 0.0, + "completion_length": 86.85595397949218, + "epoch": 5.08421052631579, + "grad_norm": 6.96875, + "kl": 0.3616165786981583, + "learning_rate": 2.4586466165413538e-06, + "loss": 0.0388, + "num_tokens": 53982610.0, + "reward": -1.7239043295383454, + "reward_std": 6.90065188407898, + "rewards/get_chromagram_reward": 0.6219595074653625, + "rewards/get_chromagram_reward_std": 0.10513537898659706, + "rewards/get_intelligibility_reward": -5.775780349969864, + "rewards/get_intelligibility_reward_std": 10.657470417022704, + "rewards/get_target_len_reward": -0.017891742382198574, + "rewards/get_target_len_reward_std": 0.05384985618293285, + "step": 6760 + }, + { + "advantages": 2.967814694443405e-07, + "advantages_std": 1.7612375378608705, + "clip_ratio": 0.0, + "completion_length": 87.82857208251953, + "epoch": 5.091729323308271, + "grad_norm": 7.78125, + "kl": 0.37060387432575226, + "learning_rate": 2.454887218045113e-06, + "loss": 0.0418, + "num_tokens": 54291505.0, + "reward": -1.7024814426898955, + "reward_std": 7.071507167816162, + "rewards/get_chromagram_reward": 0.6242042541503906, + "rewards/get_chromagram_reward_std": 0.11118239611387253, + "rewards/get_intelligibility_reward": -5.710920715332032, + "rewards/get_intelligibility_reward_std": 11.158560276031494, + "rewards/get_target_len_reward": -0.020727519784122704, + "rewards/get_target_len_reward_std": 0.06277465522289276, + "step": 6770 + }, + { + "advantages": -5.712108475108835e-09, + "advantages_std": 1.6215251684188843, + "clip_ratio": 0.0, + "completion_length": 88.75714492797852, + "epoch": 5.099248120300752, + "grad_norm": 38.75, + "kl": 0.4336851522326469, + "learning_rate": 2.4511278195488726e-06, + "loss": 0.0472, + "num_tokens": 54603325.0, + "reward": -1.2727844998240472, + "reward_std": 6.598020696640015, + "rewards/get_chromagram_reward": 0.6093868017196655, + "rewards/get_chromagram_reward_std": 0.1134963721036911, + "rewards/get_intelligibility_reward": -4.4110959649086, + "rewards/get_intelligibility_reward_std": 10.600707530975342, + "rewards/get_target_len_reward": -0.016644243523478507, + "rewards/get_target_len_reward_std": 0.051578975096344945, + "step": 6780 + }, + { + "advantages": 2.786517171671221e-07, + "advantages_std": 1.5503077149391173, + "clip_ratio": 0.0, + "completion_length": 87.54642944335937, + "epoch": 5.106766917293233, + "grad_norm": 5.5625, + "kl": 0.32320014089345933, + "learning_rate": 2.447368421052632e-06, + "loss": 0.0341, + "num_tokens": 54912339.0, + "reward": -1.5714792966842652, + "reward_std": 6.982712078094482, + "rewards/get_chromagram_reward": 0.6172931432723999, + "rewards/get_chromagram_reward_std": 0.10349898040294647, + "rewards/get_intelligibility_reward": -5.315335440635681, + "rewards/get_intelligibility_reward_std": 11.076660537719727, + "rewards/get_target_len_reward": -0.016395517718046905, + "rewards/get_target_len_reward_std": 0.047106191515922546, + "step": 6790 + }, + { + "advantages": -5.935630156272964e-08, + "advantages_std": 1.6334615707397462, + "clip_ratio": 0.0, + "completion_length": 85.74702606201171, + "epoch": 5.114285714285714, + "grad_norm": 10.0, + "kl": 0.3186270877718925, + "learning_rate": 2.443609022556391e-06, + "loss": 0.0331, + "num_tokens": 55216325.0, + "reward": -1.288988533616066, + "reward_std": 6.493602895736695, + "rewards/get_chromagram_reward": 0.6300028264522552, + "rewards/get_chromagram_reward_std": 0.12195887714624405, + "rewards/get_intelligibility_reward": -4.469488799571991, + "rewards/get_intelligibility_reward_std": 10.4141845703125, + "rewards/get_target_len_reward": -0.027479395363479854, + "rewards/get_target_len_reward_std": 0.07249160967767239, + "step": 6800 + }, + { + "advantages": -7.649263109144045e-07, + "advantages_std": 1.6298179507255555, + "clip_ratio": 0.0, + "completion_length": 87.87619171142578, + "epoch": 5.121804511278196, + "grad_norm": 13.0, + "kl": 0.30407789051532746, + "learning_rate": 2.4398496240601503e-06, + "loss": 0.0375, + "num_tokens": 55525445.0, + "reward": -1.5689442038536072, + "reward_std": 7.007902336120606, + "rewards/get_chromagram_reward": 0.6134120762348175, + "rewards/get_chromagram_reward_std": 0.11479166969656944, + "rewards/get_intelligibility_reward": -5.299472689628601, + "rewards/get_intelligibility_reward_std": 11.139194774627686, + "rewards/get_target_len_reward": -0.02077170191332698, + "rewards/get_target_len_reward_std": 0.06966968681663274, + "step": 6810 + }, + { + "advantages": 4.023312931700218e-08, + "advantages_std": 1.6519222855567932, + "clip_ratio": 0.0, + "completion_length": 86.25595397949219, + "epoch": 5.129323308270677, + "grad_norm": 40.5, + "kl": 0.3467926293611526, + "learning_rate": 2.43609022556391e-06, + "loss": 0.0386, + "num_tokens": 55830230.0, + "reward": -1.8048245370388032, + "reward_std": 6.856232643127441, + "rewards/get_chromagram_reward": 0.6267795324325561, + "rewards/get_chromagram_reward_std": 0.12441687732934952, + "rewards/get_intelligibility_reward": -6.0160400390625, + "rewards/get_intelligibility_reward_std": 10.567765140533448, + "rewards/get_target_len_reward": -0.02521284818649292, + "rewards/get_target_len_reward_std": 0.07437393795698881, + "step": 6820 + }, + { + "advantages": 4.423161307443024e-07, + "advantages_std": 1.5914803862571716, + "clip_ratio": 0.0, + "completion_length": 86.9851203918457, + "epoch": 5.136842105263158, + "grad_norm": 8.0, + "kl": 0.3530740290880203, + "learning_rate": 2.432330827067669e-06, + "loss": 0.0352, + "num_tokens": 56137102.0, + "reward": -1.563096636533737, + "reward_std": 6.7047443866729735, + "rewards/get_chromagram_reward": 0.6265234291553498, + "rewards/get_chromagram_reward_std": 0.12023014053702355, + "rewards/get_intelligibility_reward": -5.292724537849426, + "rewards/get_intelligibility_reward_std": 10.608396053314209, + "rewards/get_target_len_reward": -0.023088517505675553, + "rewards/get_target_len_reward_std": 0.0586923124268651, + "step": 6830 + }, + { + "advantages": -5.491078042041409e-07, + "advantages_std": 1.517658293247223, + "clip_ratio": 0.0, + "completion_length": 85.73631057739257, + "epoch": 5.144360902255639, + "grad_norm": 59.0, + "kl": 0.3680226504802704, + "learning_rate": 2.428571428571429e-06, + "loss": 0.0385, + "num_tokens": 56441444.0, + "reward": -1.2847602039575576, + "reward_std": 6.313221168518067, + "rewards/get_chromagram_reward": 0.6244631409645081, + "rewards/get_chromagram_reward_std": 0.12040503397583961, + "rewards/get_intelligibility_reward": -4.454781115055084, + "rewards/get_intelligibility_reward_std": 10.152585697174072, + "rewards/get_target_len_reward": -0.023962332773953675, + "rewards/get_target_len_reward_std": 0.05895144417881966, + "step": 6840 + }, + { + "advantages": 5.302330219336682e-08, + "advantages_std": 1.52640398144722, + "clip_ratio": 0.0, + "completion_length": 88.34166870117187, + "epoch": 5.15187969924812, + "grad_norm": 5.15625, + "kl": 0.32784585654735565, + "learning_rate": 2.424812030075188e-06, + "loss": 0.0328, + "num_tokens": 56752125.0, + "reward": -1.2381734997034073, + "reward_std": 6.723718500137329, + "rewards/get_chromagram_reward": 0.6423853456974029, + "rewards/get_chromagram_reward_std": 0.10823142379522324, + "rewards/get_intelligibility_reward": -4.337746638059616, + "rewards/get_intelligibility_reward_std": 10.807358932495116, + "rewards/get_target_len_reward": -0.01915889075025916, + "rewards/get_target_len_reward_std": 0.05332515276968479, + "step": 6850 + }, + { + "advantages": 1.184642421492299e-07, + "advantages_std": 1.5295220255851745, + "clip_ratio": 0.0, + "completion_length": 86.89345474243164, + "epoch": 5.159398496240602, + "grad_norm": 19.25, + "kl": 5.167052660882473, + "learning_rate": 2.4210526315789477e-06, + "loss": 0.5225, + "num_tokens": 57058305.0, + "reward": -1.584045022726059, + "reward_std": 6.4145008563995365, + "rewards/get_chromagram_reward": 0.6162258267402649, + "rewards/get_chromagram_reward_std": 0.1133840948343277, + "rewards/get_intelligibility_reward": -5.345023941993714, + "rewards/get_intelligibility_reward_std": 9.983595991134644, + "rewards/get_target_len_reward": -0.02333657452836633, + "rewards/get_target_len_reward_std": 0.0719031471759081, + "step": 6860 + }, + { + "advantages": -4.3710078045933185e-08, + "advantages_std": 1.5951733827590941, + "clip_ratio": 0.0, + "completion_length": 86.81666946411133, + "epoch": 5.166917293233083, + "grad_norm": 7.90625, + "kl": 0.35654806196689603, + "learning_rate": 2.417293233082707e-06, + "loss": 0.0423, + "num_tokens": 57364259.0, + "reward": -1.0671080329455436, + "reward_std": 6.709721088409424, + "rewards/get_chromagram_reward": 0.6349423170089722, + "rewards/get_chromagram_reward_std": 0.11176861301064492, + "rewards/get_intelligibility_reward": -3.807164826989174, + "rewards/get_intelligibility_reward_std": 10.922046852111816, + "rewards/get_target_len_reward": -0.029101449809968472, + "rewards/get_target_len_reward_std": 0.08822835758328437, + "step": 6870 + }, + { + "advantages": -5.061427756913872e-07, + "advantages_std": 1.558658480644226, + "clip_ratio": 0.0, + "completion_length": 84.79107284545898, + "epoch": 5.174436090225564, + "grad_norm": 7.84375, + "kl": 24.10377275198698, + "learning_rate": 2.4135338345864665e-06, + "loss": 2.4124, + "num_tokens": 57665170.0, + "reward": -1.6351744055747985, + "reward_std": 6.380785751342773, + "rewards/get_chromagram_reward": 0.6248750269412995, + "rewards/get_chromagram_reward_std": 0.10694740414619446, + "rewards/get_intelligibility_reward": -5.510381889343262, + "rewards/get_intelligibility_reward_std": 9.946515560150146, + "rewards/get_target_len_reward": -0.020016012340784074, + "rewards/get_target_len_reward_std": 0.05424492470920086, + "step": 6880 + }, + { + "advantages": -1.986825282074278e-09, + "advantages_std": 1.5632656812667847, + "clip_ratio": 0.0, + "completion_length": 89.39166641235352, + "epoch": 5.181954887218045, + "grad_norm": 12.25, + "kl": 0.3236946240067482, + "learning_rate": 2.4097744360902257e-06, + "loss": 0.0371, + "num_tokens": 57978655.0, + "reward": -1.1818639472126962, + "reward_std": 6.2812474250793455, + "rewards/get_chromagram_reward": 0.6271504878997802, + "rewards/get_chromagram_reward_std": 0.09476440995931626, + "rewards/get_intelligibility_reward": -4.154732119292021, + "rewards/get_intelligibility_reward_std": 10.153310012817382, + "rewards/get_target_len_reward": -0.018009957671165467, + "rewards/get_target_len_reward_std": 0.05352111738175154, + "step": 6890 + }, + { + "advantages": 1.5199186123027176e-07, + "advantages_std": 1.5968881249427795, + "clip_ratio": 0.0, + "completion_length": 89.56309509277344, + "epoch": 5.189473684210526, + "grad_norm": 7.03125, + "kl": 0.31880530416965486, + "learning_rate": 2.406015037593985e-06, + "loss": 0.0362, + "num_tokens": 58292583.0, + "reward": -1.1216454744338988, + "reward_std": 6.318594741821289, + "rewards/get_chromagram_reward": 0.6131006479263306, + "rewards/get_chromagram_reward_std": 0.11267746463418007, + "rewards/get_intelligibility_reward": -3.9571539878845217, + "rewards/get_intelligibility_reward_std": 10.343779563903809, + "rewards/get_target_len_reward": -0.020882988907396795, + "rewards/get_target_len_reward_std": 0.06357498727738857, + "step": 6900 + }, + { + "advantages": -9.909271625474503e-08, + "advantages_std": 1.583446776866913, + "clip_ratio": 0.0, + "completion_length": 86.56964492797852, + "epoch": 5.196992481203008, + "grad_norm": 6.15625, + "kl": 0.38114998638629916, + "learning_rate": 2.4022556390977446e-06, + "loss": 0.039, + "num_tokens": 58598561.0, + "reward": -1.4668249249458314, + "reward_std": 6.721897220611572, + "rewards/get_chromagram_reward": 0.6037787020206451, + "rewards/get_chromagram_reward_std": 0.1092615433037281, + "rewards/get_intelligibility_reward": -4.986512398719787, + "rewards/get_intelligibility_reward_std": 10.674504327774049, + "rewards/get_target_len_reward": -0.017740893363952636, + "rewards/get_target_len_reward_std": 0.0504965964704752, + "step": 6910 + }, + { + "advantages": 3.6979716639962134e-07, + "advantages_std": 1.5312896370887756, + "clip_ratio": 0.0, + "completion_length": 84.78273849487304, + "epoch": 5.204511278195489, + "grad_norm": 9.5, + "kl": 0.5774701595306396, + "learning_rate": 2.398496240601504e-06, + "loss": 0.0619, + "num_tokens": 58898988.0, + "reward": -1.5599512100219726, + "reward_std": 6.487179136276245, + "rewards/get_chromagram_reward": 0.6300956845283509, + "rewards/get_chromagram_reward_std": 0.11254699677228927, + "rewards/get_intelligibility_reward": -5.289097785949707, + "rewards/get_intelligibility_reward_std": 10.138393115997314, + "rewards/get_target_len_reward": -0.02085120417177677, + "rewards/get_target_len_reward_std": 0.054374009184539315, + "step": 6920 + }, + { + "advantages": 9.064873864872425e-08, + "advantages_std": 1.4809496760368348, + "clip_ratio": 0.0, + "completion_length": 86.4476203918457, + "epoch": 5.21203007518797, + "grad_norm": 7.0, + "kl": 0.3302300497889519, + "learning_rate": 2.3947368421052635e-06, + "loss": 0.0375, + "num_tokens": 59204310.0, + "reward": -1.5772881627082824, + "reward_std": 6.619961738586426, + "rewards/get_chromagram_reward": 0.6164861679077148, + "rewards/get_chromagram_reward_std": 0.11783003509044647, + "rewards/get_intelligibility_reward": -5.327880811691284, + "rewards/get_intelligibility_reward_std": 10.359909629821777, + "rewards/get_target_len_reward": -0.020469481870532037, + "rewards/get_target_len_reward_std": 0.05776517633348703, + "step": 6930 + }, + { + "advantages": -5.712101369681477e-09, + "advantages_std": 1.6208165287971497, + "clip_ratio": 0.0, + "completion_length": 90.48214492797851, + "epoch": 5.219548872180451, + "grad_norm": 4.0625, + "kl": 0.28569827526807784, + "learning_rate": 2.3909774436090227e-06, + "loss": 0.0295, + "num_tokens": 59521105.0, + "reward": -1.22709841132164, + "reward_std": 6.416428852081299, + "rewards/get_chromagram_reward": 0.6274036824703216, + "rewards/get_chromagram_reward_std": 0.11061776801943779, + "rewards/get_intelligibility_reward": -4.292467278242111, + "rewards/get_intelligibility_reward_std": 10.17729892730713, + "rewards/get_target_len_reward": -0.016231359355151652, + "rewards/get_target_len_reward_std": 0.041813553869724275, + "step": 6940 + }, + { + "advantages": -3.9935112092770455e-07, + "advantages_std": 1.5138915538787843, + "clip_ratio": 0.0, + "completion_length": 89.7500015258789, + "epoch": 5.227067669172932, + "grad_norm": 10.25, + "kl": 0.3521960288286209, + "learning_rate": 2.3872180451127823e-06, + "loss": 0.0354, + "num_tokens": 59835649.0, + "reward": -1.3172665178775786, + "reward_std": 6.572981929779052, + "rewards/get_chromagram_reward": 0.6248763024806976, + "rewards/get_chromagram_reward_std": 0.10387115105986595, + "rewards/get_intelligibility_reward": -4.556372022628784, + "rewards/get_intelligibility_reward_std": 10.613866996765136, + "rewards/get_target_len_reward": -0.02030346216633916, + "rewards/get_target_len_reward_std": 0.06047505233436823, + "step": 6950 + }, + { + "advantages": 3.9984780286772546e-08, + "advantages_std": 1.3496541380882263, + "clip_ratio": 0.0, + "completion_length": 85.63928680419922, + "epoch": 5.234586466165413, + "grad_norm": 8.3125, + "kl": 0.33269438743591306, + "learning_rate": 2.3834586466165416e-06, + "loss": 0.0387, + "num_tokens": 60138755.0, + "reward": -1.9132965922355651, + "reward_std": 6.963921070098877, + "rewards/get_chromagram_reward": 0.6157672822475433, + "rewards/get_chromagram_reward_std": 0.11723127737641334, + "rewards/get_intelligibility_reward": -6.334445428848267, + "rewards/get_intelligibility_reward_std": 10.675588703155517, + "rewards/get_target_len_reward": -0.0212111490778625, + "rewards/get_target_len_reward_std": 0.06555038467049598, + "step": 6960 + }, + { + "advantages": 1.5447536867441157e-07, + "advantages_std": 1.706715500354767, + "clip_ratio": 0.0, + "completion_length": 89.80952529907226, + "epoch": 5.242105263157895, + "grad_norm": 7.875, + "kl": 0.3068757638335228, + "learning_rate": 2.379699248120301e-06, + "loss": 0.0351, + "num_tokens": 60452409.0, + "reward": -1.1487817078828813, + "reward_std": 6.275458145141601, + "rewards/get_chromagram_reward": 0.6316677033901215, + "rewards/get_chromagram_reward_std": 0.10417709574103355, + "rewards/get_intelligibility_reward": -4.055424535274506, + "rewards/get_intelligibility_reward_std": 10.131557607650757, + "rewards/get_target_len_reward": -0.022588130366057158, + "rewards/get_target_len_reward_std": 0.07189572602510452, + "step": 6970 + }, + { + "advantages": -4.221995197895012e-08, + "advantages_std": 1.666536283493042, + "clip_ratio": 0.0, + "completion_length": 86.27559814453124, + "epoch": 5.249624060150376, + "grad_norm": 6.0625, + "kl": 3.1084075570106506, + "learning_rate": 2.3759398496240604e-06, + "loss": 0.3127, + "num_tokens": 60757343.0, + "reward": -1.7111223936080933, + "reward_std": 6.904574012756347, + "rewards/get_chromagram_reward": 0.6233543515205383, + "rewards/get_chromagram_reward_std": 0.11629278510808945, + "rewards/get_intelligibility_reward": -5.738094091415405, + "rewards/get_intelligibility_reward_std": 10.80052490234375, + "rewards/get_target_len_reward": -0.01862709941342473, + "rewards/get_target_len_reward_std": 0.04503080155700445, + "step": 6980 + }, + { + "advantages": 5.811452865600586e-08, + "advantages_std": 1.6017412543296814, + "clip_ratio": 0.0, + "completion_length": 87.71666793823242, + "epoch": 5.257142857142857, + "grad_norm": 6.40625, + "kl": 0.662769903242588, + "learning_rate": 2.3721804511278197e-06, + "loss": 0.0647, + "num_tokens": 61066441.0, + "reward": -1.690003263950348, + "reward_std": 7.024064779281616, + "rewards/get_chromagram_reward": 0.626186752319336, + "rewards/get_chromagram_reward_std": 0.11964567676186562, + "rewards/get_intelligibility_reward": -5.6770260572433475, + "rewards/get_intelligibility_reward_std": 11.044268608093262, + "rewards/get_target_len_reward": -0.019170239195227624, + "rewards/get_target_len_reward_std": 0.04387279041111469, + "step": 6990 + }, + { + "advantages": -5.098680901483022e-07, + "advantages_std": 1.6851074337959289, + "clip_ratio": 0.0, + "completion_length": 90.42500152587891, + "epoch": 5.264661654135338, + "grad_norm": 5.0625, + "kl": 0.4801910310983658, + "learning_rate": 2.368421052631579e-06, + "loss": 0.0531, + "num_tokens": 61383224.0, + "reward": -1.1600703239440917, + "reward_std": 6.438305616378784, + "rewards/get_chromagram_reward": 0.6169378876686096, + "rewards/get_chromagram_reward_std": 0.11968811750411987, + "rewards/get_intelligibility_reward": -4.075756704807281, + "rewards/get_intelligibility_reward_std": 10.412063598632812, + "rewards/get_target_len_reward": -0.021391940582543612, + "rewards/get_target_len_reward_std": 0.05608705058693886, + "step": 7000 + }, + { + "advantages": 5.019208067835734e-07, + "advantages_std": 1.6616424560546874, + "clip_ratio": 0.0, + "completion_length": 83.74107208251954, + "epoch": 5.272180451127819, + "grad_norm": 7.21875, + "kl": 0.44268949925899503, + "learning_rate": 2.3646616541353385e-06, + "loss": 0.0489, + "num_tokens": 61680912.0, + "reward": -1.8132120728492738, + "reward_std": 6.99270076751709, + "rewards/get_chromagram_reward": 0.6127820551395416, + "rewards/get_chromagram_reward_std": 0.1150453269481659, + "rewards/get_intelligibility_reward": -6.030172061920166, + "rewards/get_intelligibility_reward_std": 10.928667163848877, + "rewards/get_target_len_reward": -0.022245942149311304, + "rewards/get_target_len_reward_std": 0.0636180106550455, + "step": 7010 + }, + { + "advantages": 3.9910278815114e-07, + "advantages_std": 1.6216766953468322, + "clip_ratio": 0.0, + "completion_length": 85.14702377319335, + "epoch": 5.279699248120301, + "grad_norm": 5.71875, + "kl": 0.3775395154953003, + "learning_rate": 2.3609022556390977e-06, + "loss": 0.0443, + "num_tokens": 61982083.0, + "reward": -1.5514660596847534, + "reward_std": 6.568768739700317, + "rewards/get_chromagram_reward": 0.6127265453338623, + "rewards/get_chromagram_reward_std": 0.1180819720029831, + "rewards/get_intelligibility_reward": -5.243985009193421, + "rewards/get_intelligibility_reward_std": 10.370185852050781, + "rewards/get_target_len_reward": -0.023139323480427264, + "rewards/get_target_len_reward_std": 0.07175651714205741, + "step": 7020 + }, + { + "advantages": -2.2128225509732147e-07, + "advantages_std": 1.4390228509902954, + "clip_ratio": 0.0, + "completion_length": 90.20476379394532, + "epoch": 5.287218045112782, + "grad_norm": 7.03125, + "kl": 0.3158534452319145, + "learning_rate": 2.3571428571428574e-06, + "loss": 0.0319, + "num_tokens": 62298384.0, + "reward": -1.0992859616875648, + "reward_std": 6.4978162288665775, + "rewards/get_chromagram_reward": 0.6307229638099671, + "rewards/get_chromagram_reward_std": 0.1130222037434578, + "rewards/get_intelligibility_reward": -3.906294071674347, + "rewards/get_intelligibility_reward_std": 10.669676256179809, + "rewards/get_target_len_reward": -0.022286569233983755, + "rewards/get_target_len_reward_std": 0.053410691767930986, + "step": 7030 + }, + { + "advantages": 3.4521024296907397e-07, + "advantages_std": 1.474492335319519, + "clip_ratio": 0.0, + "completion_length": 85.87381210327149, + "epoch": 5.294736842105263, + "grad_norm": 5.1875, + "kl": 0.3069776311516762, + "learning_rate": 2.3533834586466166e-06, + "loss": 0.0318, + "num_tokens": 62602452.0, + "reward": -1.6596662104129791, + "reward_std": 6.60387659072876, + "rewards/get_chromagram_reward": 0.6276511192321778, + "rewards/get_chromagram_reward_std": 0.11999709233641624, + "rewards/get_intelligibility_reward": -5.584190630912781, + "rewards/get_intelligibility_reward_std": 10.274257373809814, + "rewards/get_target_len_reward": -0.02245878903195262, + "rewards/get_target_len_reward_std": 0.05738620981574059, + "step": 7040 + }, + { + "advantages": -1.3262032894090225e-07, + "advantages_std": 1.645529079437256, + "clip_ratio": 0.0, + "completion_length": 88.70119171142578, + "epoch": 5.302255639097744, + "grad_norm": 7.59375, + "kl": 0.31840053349733355, + "learning_rate": 2.3496240601503762e-06, + "loss": 0.035, + "num_tokens": 62914016.0, + "reward": -1.4617899343371392, + "reward_std": 6.983079671859741, + "rewards/get_chromagram_reward": 0.6129271507263183, + "rewards/get_chromagram_reward_std": 0.11817508563399315, + "rewards/get_intelligibility_reward": -4.9796409726142885, + "rewards/get_intelligibility_reward_std": 11.137241172790528, + "rewards/get_target_len_reward": -0.018655857909470795, + "rewards/get_target_len_reward_std": 0.05034475326538086, + "step": 7050 + }, + { + "advantages": 3.899136231666489e-08, + "advantages_std": 1.5345199227333068, + "clip_ratio": 0.0, + "completion_length": 87.96964416503906, + "epoch": 5.309774436090225, + "grad_norm": 11.25, + "kl": 0.42554541379213334, + "learning_rate": 2.3458646616541355e-06, + "loss": 0.0469, + "num_tokens": 63223477.0, + "reward": -1.5128212684765459, + "reward_std": 6.479464769363403, + "rewards/get_chromagram_reward": 0.6212924182415008, + "rewards/get_chromagram_reward_std": 0.12423446327447892, + "rewards/get_intelligibility_reward": -5.140022248029709, + "rewards/get_intelligibility_reward_std": 10.14908390045166, + "rewards/get_target_len_reward": -0.019733687210828067, + "rewards/get_target_len_reward_std": 0.053223836794495584, + "step": 7060 + }, + { + "advantages": 4.6094261083595713e-07, + "advantages_std": 1.4596411824226379, + "clip_ratio": 0.0, + "completion_length": 87.01131134033203, + "epoch": 5.317293233082707, + "grad_norm": 21.0, + "kl": 0.3311875075101852, + "learning_rate": 2.342105263157895e-06, + "loss": 0.0376, + "num_tokens": 63529504.0, + "reward": -1.3178690791130065, + "reward_std": 6.07402229309082, + "rewards/get_chromagram_reward": 0.6423762559890747, + "rewards/get_chromagram_reward_std": 0.10435187965631484, + "rewards/get_intelligibility_reward": -4.571545362472534, + "rewards/get_intelligibility_reward_std": 9.608693504333496, + "rewards/get_target_len_reward": -0.024437942169606687, + "rewards/get_target_len_reward_std": 0.07988403253257274, + "step": 7070 + }, + { + "advantages": -1.0418395106626121e-07, + "advantages_std": 1.5732410550117493, + "clip_ratio": 0.0, + "completion_length": 87.19583587646484, + "epoch": 5.324812030075188, + "grad_norm": 5.3125, + "kl": 0.439504337310791, + "learning_rate": 2.3383458646616543e-06, + "loss": 0.05, + "num_tokens": 63836515.0, + "reward": -1.24023876786232, + "reward_std": 6.505569648742676, + "rewards/get_chromagram_reward": 0.6219327926635743, + "rewards/get_chromagram_reward_std": 0.10478176176548004, + "rewards/get_intelligibility_reward": -4.321042706817389, + "rewards/get_intelligibility_reward_std": 10.534570789337158, + "rewards/get_target_len_reward": -0.021606145799160002, + "rewards/get_target_len_reward_std": 0.07151275128126144, + "step": 7080 + }, + { + "advantages": -6.737808490697717e-07, + "advantages_std": 1.6721099257469176, + "clip_ratio": 0.0, + "completion_length": 89.05476303100586, + "epoch": 5.332330827067669, + "grad_norm": 6.28125, + "kl": 0.2852652370929718, + "learning_rate": 2.334586466165414e-06, + "loss": 0.0308, + "num_tokens": 64148637.0, + "reward": -1.182309341430664, + "reward_std": 5.901613664627075, + "rewards/get_chromagram_reward": 0.6252353847026825, + "rewards/get_chromagram_reward_std": 0.11671873182058334, + "rewards/get_intelligibility_reward": -4.154728293418884, + "rewards/get_intelligibility_reward_std": 9.46047306060791, + "rewards/get_target_len_reward": -0.017434802697971465, + "rewards/get_target_len_reward_std": 0.05399100258946419, + "step": 7090 + }, + { + "advantages": -4.215787043904129e-07, + "advantages_std": 1.4545109629631043, + "clip_ratio": 0.0, + "completion_length": 86.90357208251953, + "epoch": 5.33984962406015, + "grad_norm": 34.5, + "kl": 0.8135642141103745, + "learning_rate": 2.330827067669173e-06, + "loss": 0.0854, + "num_tokens": 64454798.0, + "reward": -1.7862254559993744, + "reward_std": 6.553706693649292, + "rewards/get_chromagram_reward": 0.6116910398006439, + "rewards/get_chromagram_reward_std": 0.10669722333550453, + "rewards/get_intelligibility_reward": -5.948037195205688, + "rewards/get_intelligibility_reward_std": 10.042701148986817, + "rewards/get_target_len_reward": -0.02233006376773119, + "rewards/get_target_len_reward_std": 0.0645206457003951, + "step": 7100 + }, + { + "advantages": -7.164975386331207e-08, + "advantages_std": 1.5383620381355285, + "clip_ratio": 0.0, + "completion_length": 87.3398811340332, + "epoch": 5.347368421052631, + "grad_norm": 6.15625, + "kl": 0.31754042506217955, + "learning_rate": 2.3270676691729324e-06, + "loss": 0.0366, + "num_tokens": 64762510.0, + "reward": -1.8574148535728454, + "reward_std": 7.235694837570191, + "rewards/get_chromagram_reward": 0.624306446313858, + "rewards/get_chromagram_reward_std": 0.11558253094553947, + "rewards/get_intelligibility_reward": -6.175735664367676, + "rewards/get_intelligibility_reward_std": 11.251740074157714, + "rewards/get_target_len_reward": -0.02081504138186574, + "rewards/get_target_len_reward_std": 0.059531612880527975, + "step": 7110 + }, + { + "advantages": 1.9793709107318592e-07, + "advantages_std": 1.5872637391090394, + "clip_ratio": 0.0, + "completion_length": 85.45297698974609, + "epoch": 5.354887218045112, + "grad_norm": 7.96875, + "kl": 0.3048421382904053, + "learning_rate": 2.3233082706766916e-06, + "loss": 0.0402, + "num_tokens": 65064777.0, + "reward": -1.672779655456543, + "reward_std": 7.417824840545654, + "rewards/get_chromagram_reward": 0.6289528369903564, + "rewards/get_chromagram_reward_std": 0.11590608209371567, + "rewards/get_intelligibility_reward": -5.623922848701477, + "rewards/get_intelligibility_reward_std": 11.883197689056397, + "rewards/get_target_len_reward": -0.023368793446570633, + "rewards/get_target_len_reward_std": 0.08596869017928839, + "step": 7120 + }, + { + "advantages": -5.041558601703855e-08, + "advantages_std": 1.5343198895454406, + "clip_ratio": 0.0, + "completion_length": 87.90774078369141, + "epoch": 5.362406015037594, + "grad_norm": 10.25, + "kl": 0.40592711269855497, + "learning_rate": 2.3195488721804513e-06, + "loss": 0.0454, + "num_tokens": 65373399.0, + "reward": -1.4265998385846614, + "reward_std": 6.847759199142456, + "rewards/get_chromagram_reward": 0.6110140025615692, + "rewards/get_chromagram_reward_std": 0.11806119754910469, + "rewards/get_intelligibility_reward": -4.8684638172388075, + "rewards/get_intelligibility_reward_std": 10.972066974639892, + "rewards/get_target_len_reward": -0.022349441517144443, + "rewards/get_target_len_reward_std": 0.07365586645901204, + "step": 7130 + }, + { + "advantages": -4.6330194436450256e-07, + "advantages_std": 1.4813837170600892, + "clip_ratio": 0.0, + "completion_length": 88.28869400024413, + "epoch": 5.369924812030075, + "grad_norm": 5.125, + "kl": 0.3149084284901619, + "learning_rate": 2.3157894736842105e-06, + "loss": 0.0357, + "num_tokens": 65684331.0, + "reward": -1.166980442777276, + "reward_std": 6.096787214279175, + "rewards/get_chromagram_reward": 0.6261518657207489, + "rewards/get_chromagram_reward_std": 0.10937800630927086, + "rewards/get_intelligibility_reward": -4.104114997386932, + "rewards/get_intelligibility_reward_std": 9.773707628250122, + "rewards/get_target_len_reward": -0.02297802213579416, + "rewards/get_target_len_reward_std": 0.07116317190229893, + "step": 7140 + }, + { + "advantages": -4.321336177781632e-08, + "advantages_std": 1.5572047114372254, + "clip_ratio": 0.0, + "completion_length": 85.3553581237793, + "epoch": 5.377443609022556, + "grad_norm": 5.90625, + "kl": 0.3070035442709923, + "learning_rate": 2.31203007518797e-06, + "loss": 0.0399, + "num_tokens": 65985864.0, + "reward": -1.5421805202960968, + "reward_std": 6.477402973175049, + "rewards/get_chromagram_reward": 0.6324486792087555, + "rewards/get_chromagram_reward_std": 0.11431118622422218, + "rewards/get_intelligibility_reward": -5.233871936798096, + "rewards/get_intelligibility_reward_std": 10.193256998062134, + "rewards/get_target_len_reward": -0.025118078384548426, + "rewards/get_target_len_reward_std": 0.07251648269593716, + "step": 7150 + }, + { + "advantages": 3.0522546214939437e-07, + "advantages_std": 1.5489553928375244, + "clip_ratio": 0.0, + "completion_length": 88.06726303100587, + "epoch": 5.384962406015037, + "grad_norm": 11.8125, + "kl": 0.32657683938741683, + "learning_rate": 2.3082706766917294e-06, + "loss": 0.037, + "num_tokens": 66296128.0, + "reward": -1.4618814080953597, + "reward_std": 6.870611715316772, + "rewards/get_chromagram_reward": 0.6149131417274475, + "rewards/get_chromagram_reward_std": 0.11258464604616165, + "rewards/get_intelligibility_reward": -4.979121434688568, + "rewards/get_intelligibility_reward_std": 10.994513130187988, + "rewards/get_target_len_reward": -0.021435728576034308, + "rewards/get_target_len_reward_std": 0.07056615706533194, + "step": 7160 + }, + { + "advantages": 3.5663447448541776e-07, + "advantages_std": 1.491836416721344, + "clip_ratio": 0.0, + "completion_length": 86.39881134033203, + "epoch": 5.392481203007518, + "grad_norm": 98.0, + "kl": 0.5265922620892525, + "learning_rate": 2.304511278195489e-06, + "loss": 0.0578, + "num_tokens": 66601121.0, + "reward": -1.5936202168464662, + "reward_std": 7.05308780670166, + "rewards/get_chromagram_reward": 0.6205924928188324, + "rewards/get_chromagram_reward_std": 0.11182191222906113, + "rewards/get_intelligibility_reward": -5.383054161071778, + "rewards/get_intelligibility_reward_std": 11.244775199890137, + "rewards/get_target_len_reward": -0.018398610036820175, + "rewards/get_target_len_reward_std": 0.04998060278594494, + "step": 7170 + }, + { + "advantages": -2.58783499385018e-07, + "advantages_std": 1.5922078490257263, + "clip_ratio": 0.0, + "completion_length": 85.5148826599121, + "epoch": 5.4, + "grad_norm": 290.0, + "kl": 0.36700445264577863, + "learning_rate": 2.3007518796992482e-06, + "loss": 0.0381, + "num_tokens": 66903910.0, + "reward": -1.4047640979290008, + "reward_std": 6.410592555999756, + "rewards/get_chromagram_reward": 0.6141754031181336, + "rewards/get_chromagram_reward_std": 0.11500288918614388, + "rewards/get_intelligibility_reward": -4.811613512039185, + "rewards/get_intelligibility_reward_std": 10.22232813835144, + "rewards/get_target_len_reward": -0.01685400800779462, + "rewards/get_target_len_reward_std": 0.051270670257508755, + "step": 7180 + }, + { + "advantages": -3.978610166655017e-07, + "advantages_std": 1.548390531539917, + "clip_ratio": 0.0, + "completion_length": 91.05595321655274, + "epoch": 5.407518796992481, + "grad_norm": 5.3125, + "kl": 0.34714345484972, + "learning_rate": 2.296992481203008e-06, + "loss": 0.0371, + "num_tokens": 67222290.0, + "reward": -1.0444933593273162, + "reward_std": 6.631856918334961, + "rewards/get_chromagram_reward": 0.6159287035465241, + "rewards/get_chromagram_reward_std": 0.10580892786383629, + "rewards/get_intelligibility_reward": -3.72859765291214, + "rewards/get_intelligibility_reward_std": 10.923980712890625, + "rewards/get_target_len_reward": -0.02081095390021801, + "rewards/get_target_len_reward_std": 0.06703416649252177, + "step": 7190 + }, + { + "advantages": 5.582968611861361e-07, + "advantages_std": 1.5962399005889893, + "clip_ratio": 0.0, + "completion_length": 86.19881057739258, + "epoch": 5.415037593984962, + "grad_norm": 8.8125, + "kl": 0.344843378663063, + "learning_rate": 2.293233082706767e-06, + "loss": 0.0378, + "num_tokens": 67526984.0, + "reward": -1.3031162723898888, + "reward_std": 6.6909068584442135, + "rewards/get_chromagram_reward": 0.6307554066181182, + "rewards/get_chromagram_reward_std": 0.1207100123167038, + "rewards/get_intelligibility_reward": -4.517148065567016, + "rewards/get_intelligibility_reward_std": 10.817517948150634, + "rewards/get_target_len_reward": -0.022955980710685255, + "rewards/get_target_len_reward_std": 0.05329591147601605, + "step": 7200 + }, + { + "advantages": 6.263454807253765e-07, + "advantages_std": 1.6104538202285767, + "clip_ratio": 0.0, + "completion_length": 85.74107284545899, + "epoch": 5.4225563909774435, + "grad_norm": 36.25, + "kl": 0.36100142300128935, + "learning_rate": 2.2894736842105263e-06, + "loss": 0.0402, + "num_tokens": 67829982.0, + "reward": -1.5839881598949432, + "reward_std": 6.86516432762146, + "rewards/get_chromagram_reward": 0.6199360251426697, + "rewards/get_chromagram_reward_std": 0.1123465433716774, + "rewards/get_intelligibility_reward": -5.348700094223022, + "rewards/get_intelligibility_reward_std": 10.803610897064209, + "rewards/get_target_len_reward": -0.023200150951743125, + "rewards/get_target_len_reward_std": 0.07127108946442604, + "step": 7210 + }, + { + "advantages": -1.9644698063814302e-07, + "advantages_std": 1.6620344519615173, + "clip_ratio": 0.0, + "completion_length": 84.13869171142578, + "epoch": 5.4300751879699245, + "grad_norm": 7.0, + "kl": 0.33577401787042616, + "learning_rate": 2.285714285714286e-06, + "loss": 0.037, + "num_tokens": 68129217.0, + "reward": -1.6259113669395446, + "reward_std": 6.991596651077271, + "rewards/get_chromagram_reward": 0.6219640374183655, + "rewards/get_chromagram_reward_std": 0.11431905999779701, + "rewards/get_intelligibility_reward": -5.476862597465515, + "rewards/get_intelligibility_reward_std": 11.013343715667725, + "rewards/get_target_len_reward": -0.0228353314101696, + "rewards/get_target_len_reward_std": 0.062448183074593544, + "step": 7220 + }, + { + "advantages": 5.846222336458595e-07, + "advantages_std": 1.5011724472045898, + "clip_ratio": 0.0, + "completion_length": 88.38809585571289, + "epoch": 5.437593984962406, + "grad_norm": 7.34375, + "kl": 0.2984209552407265, + "learning_rate": 2.281954887218045e-06, + "loss": 0.0347, + "num_tokens": 68439441.0, + "reward": -1.5354242980480195, + "reward_std": 7.008116817474365, + "rewards/get_chromagram_reward": 0.602736109495163, + "rewards/get_chromagram_reward_std": 0.10836800113320351, + "rewards/get_intelligibility_reward": -5.19312492609024, + "rewards/get_intelligibility_reward_std": 11.163818836212158, + "rewards/get_target_len_reward": -0.015883707161992788, + "rewards/get_target_len_reward_std": 0.056575870141386986, + "step": 7230 + }, + { + "advantages": -1.2690824462424645e-07, + "advantages_std": 1.5233567714691163, + "clip_ratio": 0.0, + "completion_length": 86.97321548461915, + "epoch": 5.4451127819548875, + "grad_norm": 16.75, + "kl": 1.1598840221762656, + "learning_rate": 2.278195488721805e-06, + "loss": 0.1181, + "num_tokens": 68746932.0, + "reward": -1.5316576719284059, + "reward_std": 7.266022396087647, + "rewards/get_chromagram_reward": 0.6144776403903961, + "rewards/get_chromagram_reward_std": 0.1082504540681839, + "rewards/get_intelligibility_reward": -5.187613144516945, + "rewards/get_intelligibility_reward_std": 11.530435466766358, + "rewards/get_target_len_reward": -0.021837185509502886, + "rewards/get_target_len_reward_std": 0.06608111709356308, + "step": 7240 + }, + { + "advantages": -7.033348182972077e-07, + "advantages_std": 1.6308774948120117, + "clip_ratio": 0.0, + "completion_length": 85.37381057739258, + "epoch": 5.4526315789473685, + "grad_norm": 7.03125, + "kl": 0.33484105467796327, + "learning_rate": 2.274436090225564e-06, + "loss": 0.0373, + "num_tokens": 69049029.0, + "reward": -1.5764613687992095, + "reward_std": 6.747067260742187, + "rewards/get_chromagram_reward": 0.6007543444633484, + "rewards/get_chromagram_reward_std": 0.11851666420698166, + "rewards/get_intelligibility_reward": -5.304040777683258, + "rewards/get_intelligibility_reward_std": 10.560655975341797, + "rewards/get_target_len_reward": -0.026097355782985686, + "rewards/get_target_len_reward_std": 0.08660393953323364, + "step": 7250 + }, + { + "advantages": -4.818043635168578e-08, + "advantages_std": 1.5586305379867553, + "clip_ratio": 0.0, + "completion_length": 86.24881286621094, + "epoch": 5.4601503759398495, + "grad_norm": 7.125, + "kl": 0.3909894391894341, + "learning_rate": 2.2706766917293237e-06, + "loss": 0.0478, + "num_tokens": 69354051.0, + "reward": -1.437357211112976, + "reward_std": 6.97461724281311, + "rewards/get_chromagram_reward": 0.6212467789649964, + "rewards/get_chromagram_reward_std": 0.10954332649707794, + "rewards/get_intelligibility_reward": -4.909408502280712, + "rewards/get_intelligibility_reward_std": 11.192184829711914, + "rewards/get_target_len_reward": -0.02390976846218109, + "rewards/get_target_len_reward_std": 0.07783832289278507, + "step": 7260 + }, + { + "advantages": -1.3927619193054852e-06, + "advantages_std": 1.5793645858764649, + "clip_ratio": 0.0, + "completion_length": 86.78274078369141, + "epoch": 5.467669172932331, + "grad_norm": 11.625, + "kl": 0.5376500964164734, + "learning_rate": 2.266917293233083e-06, + "loss": 0.0585, + "num_tokens": 69659701.0, + "reward": -1.7209204077720641, + "reward_std": 6.849113702774048, + "rewards/get_chromagram_reward": 0.6152911186218262, + "rewards/get_chromagram_reward_std": 0.11797176897525788, + "rewards/get_intelligibility_reward": -5.754384231567383, + "rewards/get_intelligibility_reward_std": 10.6309889793396, + "rewards/get_target_len_reward": -0.02366781998425722, + "rewards/get_target_len_reward_std": 0.06547661423683167, + "step": 7270 + }, + { + "advantages": -4.146248176795098e-07, + "advantages_std": 1.5919759631156922, + "clip_ratio": 0.0, + "completion_length": 89.48750228881836, + "epoch": 5.4751879699248125, + "grad_norm": 8.0625, + "kl": 0.3412448182702065, + "learning_rate": 2.2631578947368426e-06, + "loss": 0.0408, + "num_tokens": 69973447.0, + "reward": -1.612433785200119, + "reward_std": 7.330462169647217, + "rewards/get_chromagram_reward": 0.6047484815120697, + "rewards/get_chromagram_reward_std": 0.11287015751004219, + "rewards/get_intelligibility_reward": -5.421263241767884, + "rewards/get_intelligibility_reward_std": 11.748994159698487, + "rewards/get_target_len_reward": -0.020786524470895528, + "rewards/get_target_len_reward_std": 0.07320697046816349, + "step": 7280 + }, + { + "advantages": 1.225620582800957e-07, + "advantages_std": 1.5420305848121643, + "clip_ratio": 0.0, + "completion_length": 84.21488189697266, + "epoch": 5.4827067669172935, + "grad_norm": 5.875, + "kl": 11.998553581535816, + "learning_rate": 2.259398496240602e-06, + "loss": 1.2037, + "num_tokens": 70273466.0, + "reward": -1.5845581710338592, + "reward_std": 6.450918436050415, + "rewards/get_chromagram_reward": 0.6186227262020111, + "rewards/get_chromagram_reward_std": 0.1141671285033226, + "rewards/get_intelligibility_reward": -5.353073540329933, + "rewards/get_intelligibility_reward_std": 10.01052770614624, + "rewards/get_target_len_reward": -0.019223571103066207, + "rewards/get_target_len_reward_std": 0.054246239550411704, + "step": 7290 + }, + { + "advantages": -2.4288893314405867e-07, + "advantages_std": 1.6326185584068298, + "clip_ratio": 0.0, + "completion_length": 89.59404983520508, + "epoch": 5.490225563909775, + "grad_norm": 6.53125, + "kl": 0.8095985978841782, + "learning_rate": 2.255639097744361e-06, + "loss": 0.0826, + "num_tokens": 70588453.0, + "reward": -1.3116859912872314, + "reward_std": 6.901690149307251, + "rewards/get_chromagram_reward": 0.6312417924404145, + "rewards/get_chromagram_reward_std": 0.11385724022984504, + "rewards/get_intelligibility_reward": -4.543382370471955, + "rewards/get_intelligibility_reward_std": 11.238386249542236, + "rewards/get_target_len_reward": -0.022917152382433414, + "rewards/get_target_len_reward_std": 0.05599991828203201, + "step": 7300 + }, + { + "advantages": -1.1771916632596913e-07, + "advantages_std": 1.5935291290283202, + "clip_ratio": 0.0, + "completion_length": 84.82916870117188, + "epoch": 5.497744360902256, + "grad_norm": 8.5, + "kl": 0.3385091483592987, + "learning_rate": 2.2518796992481202e-06, + "loss": 0.0437, + "num_tokens": 70888841.0, + "reward": -1.810219794511795, + "reward_std": 6.882726764678955, + "rewards/get_chromagram_reward": 0.6040708005428315, + "rewards/get_chromagram_reward_std": 0.11017877012491226, + "rewards/get_intelligibility_reward": -6.00965039730072, + "rewards/get_intelligibility_reward_std": 10.694256210327149, + "rewards/get_target_len_reward": -0.02507947999984026, + "rewards/get_target_len_reward_std": 0.07585760038346052, + "step": 7310 + }, + { + "advantages": -2.9454629952851973e-07, + "advantages_std": 1.7237526535987855, + "clip_ratio": 0.0, + "completion_length": 87.3398826599121, + "epoch": 5.505263157894737, + "grad_norm": 6.875, + "kl": 0.2888296276330948, + "learning_rate": 2.24812030075188e-06, + "loss": 0.0298, + "num_tokens": 71197368.0, + "reward": -1.3548449575901031, + "reward_std": 7.186491394042969, + "rewards/get_chromagram_reward": 0.6183264970779419, + "rewards/get_chromagram_reward_std": 0.11901157423853874, + "rewards/get_intelligibility_reward": -4.6635973930358885, + "rewards/get_intelligibility_reward_std": 11.709501457214355, + "rewards/get_target_len_reward": -0.01926371455192566, + "rewards/get_target_len_reward_std": 0.04918394237756729, + "step": 7320 + }, + { + "advantages": -2.2997459652174258e-07, + "advantages_std": 1.6268801808357238, + "clip_ratio": 0.0, + "completion_length": 91.177978515625, + "epoch": 5.512781954887218, + "grad_norm": 9.0, + "kl": 0.3528441786766052, + "learning_rate": 2.244360902255639e-06, + "loss": 0.0388, + "num_tokens": 71516401.0, + "reward": -1.370145285129547, + "reward_std": 6.830450391769409, + "rewards/get_chromagram_reward": 0.6379683613777161, + "rewards/get_chromagram_reward_std": 0.11868164539337159, + "rewards/get_intelligibility_reward": -4.725983762741089, + "rewards/get_intelligibility_reward_std": 11.004422569274903, + "rewards/get_target_len_reward": -0.022420282661914825, + "rewards/get_target_len_reward_std": 0.0538643242791295, + "step": 7330 + }, + { + "advantages": -5.488596244163091e-08, + "advantages_std": 1.6177343845367431, + "clip_ratio": 0.0, + "completion_length": 83.06428680419921, + "epoch": 5.5203007518797, + "grad_norm": 5.25, + "kl": 0.3498178094625473, + "learning_rate": 2.2406015037593987e-06, + "loss": 0.0417, + "num_tokens": 71812380.0, + "reward": -1.207906161621213, + "reward_std": 6.610358667373657, + "rewards/get_chromagram_reward": 0.6290356993675232, + "rewards/get_chromagram_reward_std": 0.10410335063934326, + "rewards/get_intelligibility_reward": -4.230634343624115, + "rewards/get_intelligibility_reward_std": 10.764556217193604, + "rewards/get_target_len_reward": -0.02211959520354867, + "rewards/get_target_len_reward_std": 0.06520765721797943, + "step": 7340 + }, + { + "advantages": -8.321057112681273e-07, + "advantages_std": 1.5614672541618346, + "clip_ratio": 0.0, + "completion_length": 86.23631057739257, + "epoch": 5.527819548872181, + "grad_norm": 6.5625, + "kl": 0.2928524002432823, + "learning_rate": 2.236842105263158e-06, + "loss": 0.0386, + "num_tokens": 72117651.0, + "reward": -1.6458905786275864, + "reward_std": 7.037843132019043, + "rewards/get_chromagram_reward": 0.6171200931072235, + "rewards/get_chromagram_reward_std": 0.10211833268404007, + "rewards/get_intelligibility_reward": -5.5341644287109375, + "rewards/get_intelligibility_reward_std": 11.144237804412843, + "rewards/get_target_len_reward": -0.020627187751233577, + "rewards/get_target_len_reward_std": 0.06789864487946033, + "step": 7350 + }, + { + "advantages": -6.174047882723244e-07, + "advantages_std": 1.5940613746643066, + "clip_ratio": 0.0, + "completion_length": 87.6255973815918, + "epoch": 5.535338345864662, + "grad_norm": 18.75, + "kl": 0.34271004796028137, + "learning_rate": 2.2330827067669176e-06, + "loss": 0.037, + "num_tokens": 72425365.0, + "reward": -1.725543212890625, + "reward_std": 6.623648262023925, + "rewards/get_chromagram_reward": 0.6141940057277679, + "rewards/get_chromagram_reward_std": 0.11702094674110412, + "rewards/get_intelligibility_reward": -5.767955183982849, + "rewards/get_intelligibility_reward_std": 10.295107078552245, + "rewards/get_target_len_reward": -0.022868365794420243, + "rewards/get_target_len_reward_std": 0.06632985081523657, + "step": 7360 + }, + { + "advantages": 1.5397867496602658e-07, + "advantages_std": 1.4260494589805603, + "clip_ratio": 0.0, + "completion_length": 86.57143096923828, + "epoch": 5.542857142857143, + "grad_norm": 4.75, + "kl": 0.3642714560031891, + "learning_rate": 2.229323308270677e-06, + "loss": 0.0418, + "num_tokens": 72730793.0, + "reward": -1.752213227748871, + "reward_std": 6.929638338088989, + "rewards/get_chromagram_reward": 0.6075048983097077, + "rewards/get_chromagram_reward_std": 0.11558273807168007, + "rewards/get_intelligibility_reward": -5.8434700012207035, + "rewards/get_intelligibility_reward_std": 10.849376010894776, + "rewards/get_target_len_reward": -0.020674252323806284, + "rewards/get_target_len_reward_std": 0.06688700504601001, + "step": 7370 + }, + { + "advantages": -1.380840899400937e-07, + "advantages_std": 1.7065526127815247, + "clip_ratio": 0.0, + "completion_length": 82.55714416503906, + "epoch": 5.550375939849624, + "grad_norm": 77.5, + "kl": 0.3779150277376175, + "learning_rate": 2.2255639097744365e-06, + "loss": 0.0513, + "num_tokens": 73025159.0, + "reward": -1.7196123540401458, + "reward_std": 7.126970195770264, + "rewards/get_chromagram_reward": 0.623649537563324, + "rewards/get_chromagram_reward_std": 0.11368174850940704, + "rewards/get_intelligibility_reward": -5.756095004081726, + "rewards/get_intelligibility_reward_std": 11.227530765533448, + "rewards/get_target_len_reward": -0.026391300559043884, + "rewards/get_target_len_reward_std": 0.08802502788603306, + "step": 7380 + }, + { + "advantages": 5.04155968883424e-07, + "advantages_std": 1.6110849261283875, + "clip_ratio": 0.0, + "completion_length": 89.42797775268555, + "epoch": 5.557894736842105, + "grad_norm": 5.78125, + "kl": 0.3004383772611618, + "learning_rate": 2.2218045112781957e-06, + "loss": 0.0302, + "num_tokens": 73339524.0, + "reward": -1.140392405539751, + "reward_std": 6.8683192253112795, + "rewards/get_chromagram_reward": 0.6287994384765625, + "rewards/get_chromagram_reward_std": 0.10455321967601776, + "rewards/get_intelligibility_reward": -4.034494996070862, + "rewards/get_intelligibility_reward_std": 11.122114038467407, + "rewards/get_target_len_reward": -0.015481433924287557, + "rewards/get_target_len_reward_std": 0.04037857819348574, + "step": 7390 + }, + { + "advantages": 3.5812459326933775e-07, + "advantages_std": 1.6297937989234925, + "clip_ratio": 0.0, + "completion_length": 88.87916793823243, + "epoch": 5.565413533834587, + "grad_norm": 13.25, + "kl": 0.30461192429065703, + "learning_rate": 2.218045112781955e-06, + "loss": 0.0372, + "num_tokens": 73652091.0, + "reward": -1.1990951776504517, + "reward_std": 6.741983842849732, + "rewards/get_chromagram_reward": 0.6169572472572327, + "rewards/get_chromagram_reward_std": 0.1067502036690712, + "rewards/get_intelligibility_reward": -4.190165710449219, + "rewards/get_intelligibility_reward_std": 10.995736217498779, + "rewards/get_target_len_reward": -0.02407686710357666, + "rewards/get_target_len_reward_std": 0.07429735269397497, + "step": 7400 + }, + { + "advantages": 1.0232131586462856e-07, + "advantages_std": 1.5955123424530029, + "clip_ratio": 0.0, + "completion_length": 87.42024078369141, + "epoch": 5.572932330827068, + "grad_norm": 8.9375, + "kl": 77.94752575904131, + "learning_rate": 2.2142857142857146e-06, + "loss": 7.7973, + "num_tokens": 73959565.0, + "reward": -1.600351732969284, + "reward_std": 6.964174509048462, + "rewards/get_chromagram_reward": 0.6232900559902191, + "rewards/get_chromagram_reward_std": 0.11058256328105927, + "rewards/get_intelligibility_reward": -5.403542852401733, + "rewards/get_intelligibility_reward_std": 11.038510799407959, + "rewards/get_target_len_reward": -0.020802028104662897, + "rewards/get_target_len_reward_std": 0.05858626961708069, + "step": 7410 + }, + { + "advantages": 1.9123157102285405e-08, + "advantages_std": 1.5321441888809204, + "clip_ratio": 0.0, + "completion_length": 91.97440795898437, + "epoch": 5.580451127819549, + "grad_norm": 4.78125, + "kl": 0.3133543863892555, + "learning_rate": 2.2105263157894738e-06, + "loss": 0.0342, + "num_tokens": 74279659.0, + "reward": -1.511323982477188, + "reward_std": 6.717669296264648, + "rewards/get_chromagram_reward": 0.6166324973106384, + "rewards/get_chromagram_reward_std": 0.10233291685581207, + "rewards/get_intelligibility_reward": -5.131410145759583, + "rewards/get_intelligibility_reward_std": 10.667114448547363, + "rewards/get_target_len_reward": -0.019194147270172834, + "rewards/get_target_len_reward_std": 0.055062121339142324, + "step": 7420 + }, + { + "advantages": -6.141762213474067e-07, + "advantages_std": 1.5866358041763307, + "clip_ratio": 0.0, + "completion_length": 85.48631057739257, + "epoch": 5.58796992481203, + "grad_norm": 9.125, + "kl": 0.5132203131914139, + "learning_rate": 2.2067669172932334e-06, + "loss": 0.0522, + "num_tokens": 74582539.0, + "reward": -1.4089649975299836, + "reward_std": 6.644739103317261, + "rewards/get_chromagram_reward": 0.6317284643650055, + "rewards/get_chromagram_reward_std": 0.10995041355490684, + "rewards/get_intelligibility_reward": -4.83715957403183, + "rewards/get_intelligibility_reward_std": 10.636877918243409, + "rewards/get_target_len_reward": -0.0214636730030179, + "rewards/get_target_len_reward_std": 0.06028429102152586, + "step": 7430 + }, + { + "advantages": 1.8378098758375928e-07, + "advantages_std": 1.5107809066772462, + "clip_ratio": 0.0, + "completion_length": 84.70178756713867, + "epoch": 5.595488721804511, + "grad_norm": 16.75, + "kl": 0.9980653643608093, + "learning_rate": 2.2030075187969927e-06, + "loss": 0.1048, + "num_tokens": 74882065.0, + "reward": -1.8240859806537628, + "reward_std": 6.827915096282959, + "rewards/get_chromagram_reward": 0.6137704908847809, + "rewards/get_chromagram_reward_std": 0.11075319945812226, + "rewards/get_intelligibility_reward": -6.064216899871826, + "rewards/get_intelligibility_reward_std": 10.448147773742676, + "rewards/get_target_len_reward": -0.021811048593372108, + "rewards/get_target_len_reward_std": 0.06830717157572508, + "step": 7440 + }, + { + "advantages": 3.5017728805541994e-08, + "advantages_std": 1.6366748571395875, + "clip_ratio": 0.0, + "completion_length": 84.80952453613281, + "epoch": 5.603007518796993, + "grad_norm": 6.46875, + "kl": 0.3172616258263588, + "learning_rate": 2.199248120300752e-06, + "loss": 0.0363, + "num_tokens": 75183463.0, + "reward": -1.5454985558986665, + "reward_std": 6.658914279937744, + "rewards/get_chromagram_reward": 0.6241901516914368, + "rewards/get_chromagram_reward_std": 0.11918274387717247, + "rewards/get_intelligibility_reward": -5.237881135940552, + "rewards/get_intelligibility_reward_std": 10.563720417022704, + "rewards/get_target_len_reward": -0.02280454756692052, + "rewards/get_target_len_reward_std": 0.06878137122839689, + "step": 7450 + }, + { + "advantages": 3.047287538748833e-07, + "advantages_std": 1.5836209535598755, + "clip_ratio": 0.0, + "completion_length": 85.54285736083985, + "epoch": 5.610526315789474, + "grad_norm": 7.1875, + "kl": 0.3748527690768242, + "learning_rate": 2.1954887218045115e-06, + "loss": 0.0372, + "num_tokens": 75486376.0, + "reward": -1.5894612610340118, + "reward_std": 7.040074014663697, + "rewards/get_chromagram_reward": 0.6304149627685547, + "rewards/get_chromagram_reward_std": 0.11504409015178681, + "rewards/get_intelligibility_reward": -5.378788644075394, + "rewards/get_intelligibility_reward_std": 11.16924238204956, + "rewards/get_target_len_reward": -0.020009812247008086, + "rewards/get_target_len_reward_std": 0.047809756547212603, + "step": 7460 + }, + { + "advantages": -1.231829600101264e-07, + "advantages_std": 1.5577387928962707, + "clip_ratio": 0.0, + "completion_length": 87.06845474243164, + "epoch": 5.618045112781955, + "grad_norm": 9.25, + "kl": 0.3719530820846558, + "learning_rate": 2.1917293233082707e-06, + "loss": 0.036, + "num_tokens": 75793227.0, + "reward": -1.6739049434661866, + "reward_std": 7.10861234664917, + "rewards/get_chromagram_reward": 0.6131251990795136, + "rewards/get_chromagram_reward_std": 0.1312383234500885, + "rewards/get_intelligibility_reward": -5.615152812004089, + "rewards/get_intelligibility_reward_std": 11.296269989013672, + "rewards/get_target_len_reward": -0.01968686729669571, + "rewards/get_target_len_reward_std": 0.0534376522526145, + "step": 7470 + }, + { + "advantages": 3.082056977632419e-07, + "advantages_std": 1.5980460882186889, + "clip_ratio": 0.0, + "completion_length": 88.05476379394531, + "epoch": 5.625563909774436, + "grad_norm": 22.375, + "kl": 0.6268338203430176, + "learning_rate": 2.1879699248120304e-06, + "loss": 0.0698, + "num_tokens": 76102876.0, + "reward": -1.3034660577774049, + "reward_std": 6.318447589874268, + "rewards/get_chromagram_reward": 0.6268084466457366, + "rewards/get_chromagram_reward_std": 0.10592088475823402, + "rewards/get_intelligibility_reward": -4.514228749275207, + "rewards/get_intelligibility_reward_std": 10.15007405281067, + "rewards/get_target_len_reward": -0.022977558616548776, + "rewards/get_target_len_reward_std": 0.06933909989893436, + "step": 7480 + }, + { + "advantages": -5.220373708425541e-07, + "advantages_std": 1.5136359333992004, + "clip_ratio": 0.0, + "completion_length": 87.81428680419921, + "epoch": 5.633082706766917, + "grad_norm": 64.0, + "kl": 0.34380061328411105, + "learning_rate": 2.1842105263157896e-06, + "loss": 0.0433, + "num_tokens": 76412451.0, + "reward": -1.3146987706422806, + "reward_std": 6.69021692276001, + "rewards/get_chromagram_reward": 0.6146059095859527, + "rewards/get_chromagram_reward_std": 0.11763279736042023, + "rewards/get_intelligibility_reward": -4.534822654724121, + "rewards/get_intelligibility_reward_std": 10.621045303344726, + "rewards/get_target_len_reward": -0.02387933572754264, + "rewards/get_target_len_reward_std": 0.06662974283099174, + "step": 7490 + }, + { + "advantages": -2.632538624425251e-07, + "advantages_std": 1.453635001182556, + "clip_ratio": 0.0, + "completion_length": 85.31369323730469, + "epoch": 5.640601503759399, + "grad_norm": 6.65625, + "kl": 0.3900526463985443, + "learning_rate": 2.180451127819549e-06, + "loss": 0.0464, + "num_tokens": 76714207.0, + "reward": -1.7231360912322997, + "reward_std": 6.901036357879638, + "rewards/get_chromagram_reward": 0.6194710373878479, + "rewards/get_chromagram_reward_std": 0.11405050083994865, + "rewards/get_intelligibility_reward": -5.76176085472107, + "rewards/get_intelligibility_reward_std": 10.817874336242676, + "rewards/get_target_len_reward": -0.02711833519861102, + "rewards/get_target_len_reward_std": 0.0795399196445942, + "step": 7500 + }, + { + "advantages": -2.1532178209326958e-07, + "advantages_std": 1.5806753516197205, + "clip_ratio": 0.0, + "completion_length": 88.37857284545899, + "epoch": 5.64812030075188, + "grad_norm": 58.0, + "kl": 0.3461156725883484, + "learning_rate": 2.1766917293233085e-06, + "loss": 0.0375, + "num_tokens": 77024921.0, + "reward": -1.7006117105484009, + "reward_std": 7.299945545196533, + "rewards/get_chromagram_reward": 0.6067075133323669, + "rewards/get_chromagram_reward_std": 0.116612958163023, + "rewards/get_intelligibility_reward": -5.686506867408752, + "rewards/get_intelligibility_reward_std": 11.537493228912354, + "rewards/get_target_len_reward": -0.02203559260815382, + "rewards/get_target_len_reward_std": 0.06246333085000515, + "step": 7510 + }, + { + "advantages": -1.5459956159702415e-07, + "advantages_std": 1.616200816631317, + "clip_ratio": 0.0, + "completion_length": 90.30774002075195, + "epoch": 5.655639097744361, + "grad_norm": 5.84375, + "kl": 0.3066115379333496, + "learning_rate": 2.1729323308270677e-06, + "loss": 0.0327, + "num_tokens": 77340767.0, + "reward": -1.258822149783373, + "reward_std": 6.4773036479949955, + "rewards/get_chromagram_reward": 0.6259684383869171, + "rewards/get_chromagram_reward_std": 0.12014298290014266, + "rewards/get_intelligibility_reward": -4.385247963666916, + "rewards/get_intelligibility_reward_std": 10.4630756855011, + "rewards/get_target_len_reward": -0.01718673426657915, + "rewards/get_target_len_reward_std": 0.048602374456822875, + "step": 7520 + }, + { + "advantages": -1.3783579788650967e-08, + "advantages_std": 1.5101877331733704, + "clip_ratio": 0.0, + "completion_length": 85.07559814453126, + "epoch": 5.663157894736842, + "grad_norm": 153.0, + "kl": 0.3391159653663635, + "learning_rate": 2.1691729323308273e-06, + "loss": 0.0357, + "num_tokens": 77642099.0, + "reward": -1.0761063262820243, + "reward_std": 6.302792119979858, + "rewards/get_chromagram_reward": 0.6141286730766297, + "rewards/get_chromagram_reward_std": 0.12359654754400254, + "rewards/get_intelligibility_reward": -3.8235466867685317, + "rewards/get_intelligibility_reward_std": 10.235271453857422, + "rewards/get_target_len_reward": -0.018900788482278587, + "rewards/get_target_len_reward_std": 0.05066053103655577, + "step": 7530 + }, + { + "advantages": -2.648681522998686e-07, + "advantages_std": 1.566270637512207, + "clip_ratio": 0.0, + "completion_length": 89.32321624755859, + "epoch": 5.670676691729323, + "grad_norm": 6.46875, + "kl": 0.3445367142558098, + "learning_rate": 2.1654135338345866e-06, + "loss": 0.039, + "num_tokens": 77955382.0, + "reward": -1.1872239589691163, + "reward_std": 6.858507299423218, + "rewards/get_chromagram_reward": 0.6298686146736145, + "rewards/get_chromagram_reward_std": 0.11859809085726739, + "rewards/get_intelligibility_reward": -4.16928243637085, + "rewards/get_intelligibility_reward_std": 11.18128228187561, + "rewards/get_target_len_reward": -0.022257814556360243, + "rewards/get_target_len_reward_std": 0.059147943183779715, + "step": 7540 + }, + { + "advantages": -2.829978953400314e-07, + "advantages_std": 1.4890665531158447, + "clip_ratio": 0.0, + "completion_length": 86.4071434020996, + "epoch": 5.678195488721805, + "grad_norm": 7.3125, + "kl": 0.3267993301153183, + "learning_rate": 2.161654135338346e-06, + "loss": 0.0369, + "num_tokens": 78260685.0, + "reward": -1.3021728478372097, + "reward_std": 6.207171201705933, + "rewards/get_chromagram_reward": 0.6221062183380127, + "rewards/get_chromagram_reward_std": 0.12322953790426254, + "rewards/get_intelligibility_reward": -4.508913117647171, + "rewards/get_intelligibility_reward_std": 9.848860836029052, + "rewards/get_target_len_reward": -0.01971148233860731, + "rewards/get_target_len_reward_std": 0.05028697308152914, + "step": 7550 + }, + { + "advantages": 2.8337041442227927e-07, + "advantages_std": 1.5456938207149507, + "clip_ratio": 0.0, + "completion_length": 89.8357162475586, + "epoch": 5.685714285714286, + "grad_norm": 7.71875, + "kl": 0.37217641323804856, + "learning_rate": 2.1578947368421054e-06, + "loss": 0.0419, + "num_tokens": 78575011.0, + "reward": -1.1617390155792235, + "reward_std": 6.960851192474365, + "rewards/get_chromagram_reward": 0.6310091912746429, + "rewards/get_chromagram_reward_std": 0.11414845660328865, + "rewards/get_intelligibility_reward": -4.094545310735702, + "rewards/get_intelligibility_reward_std": 11.278204441070557, + "rewards/get_target_len_reward": -0.021680734027177094, + "rewards/get_target_len_reward_std": 0.06393090002238751, + "step": 7560 + }, + { + "advantages": -1.9644698765475254e-07, + "advantages_std": 1.4756534457206727, + "clip_ratio": 0.0, + "completion_length": 86.86726379394531, + "epoch": 5.693233082706767, + "grad_norm": 7.1875, + "kl": 0.420529405772686, + "learning_rate": 2.154135338345865e-06, + "loss": 0.0466, + "num_tokens": 78882044.0, + "reward": -1.2242244243621827, + "reward_std": 6.362412405014038, + "rewards/get_chromagram_reward": 0.6175295114517212, + "rewards/get_chromagram_reward_std": 0.11478937491774559, + "rewards/get_intelligibility_reward": -4.266911506652832, + "rewards/get_intelligibility_reward_std": 10.337289953231812, + "rewards/get_target_len_reward": -0.02329086307436228, + "rewards/get_target_len_reward_std": 0.07511311620473862, + "step": 7570 + }, + { + "advantages": -4.892548970403254e-08, + "advantages_std": 1.5039716720581056, + "clip_ratio": 0.0, + "completion_length": 86.2928596496582, + "epoch": 5.700751879699248, + "grad_norm": 5.6875, + "kl": 0.421612012386322, + "learning_rate": 2.1503759398496243e-06, + "loss": 0.044, + "num_tokens": 79186669.0, + "reward": -1.5858042895793916, + "reward_std": 7.348571300506592, + "rewards/get_chromagram_reward": 0.6164960920810699, + "rewards/get_chromagram_reward_std": 0.11954497992992401, + "rewards/get_intelligibility_reward": -5.3503889560699465, + "rewards/get_intelligibility_reward_std": 11.802607250213622, + "rewards/get_target_len_reward": -0.02351978179067373, + "rewards/get_target_len_reward_std": 0.07501283418387175, + "step": 7580 + }, + { + "advantages": 4.142522826100503e-07, + "advantages_std": 1.5708836436271667, + "clip_ratio": 0.0, + "completion_length": 85.72381057739258, + "epoch": 5.708270676691729, + "grad_norm": 5.21875, + "kl": 0.8034768372774124, + "learning_rate": 2.146616541353384e-06, + "loss": 0.087, + "num_tokens": 79490048.0, + "reward": -1.3070945113897323, + "reward_std": 6.501582670211792, + "rewards/get_chromagram_reward": 0.6361547887325287, + "rewards/get_chromagram_reward_std": 0.11452390253543854, + "rewards/get_intelligibility_reward": -4.532324576377869, + "rewards/get_intelligibility_reward_std": 10.39711618423462, + "rewards/get_target_len_reward": -0.025113471318036318, + "rewards/get_target_len_reward_std": 0.06926766522228718, + "step": 7590 + }, + { + "advantages": 5.488594325697704e-08, + "advantages_std": 1.631120765209198, + "clip_ratio": 0.0, + "completion_length": 87.38928756713867, + "epoch": 5.715789473684211, + "grad_norm": 6.28125, + "kl": 0.372196751832962, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.0431, + "num_tokens": 79798145.0, + "reward": -1.4786714985966682, + "reward_std": 6.542671346664429, + "rewards/get_chromagram_reward": 0.6265040755271911, + "rewards/get_chromagram_reward_std": 0.11047741249203683, + "rewards/get_intelligibility_reward": -5.038548780605197, + "rewards/get_intelligibility_reward_std": 10.313930130004882, + "rewards/get_target_len_reward": -0.023969475366175174, + "rewards/get_target_len_reward_std": 0.06568300873041152, + "step": 7600 + }, + { + "advantages": 5.471209846064085e-07, + "advantages_std": 1.6120773196220397, + "clip_ratio": 0.0, + "completion_length": 88.9708351135254, + "epoch": 5.723308270676692, + "grad_norm": 33.5, + "kl": 0.3500454694032669, + "learning_rate": 2.1390977443609024e-06, + "loss": 0.0343, + "num_tokens": 80110077.0, + "reward": -1.3915309190750123, + "reward_std": 6.181269693374634, + "rewards/get_chromagram_reward": 0.6148362159729004, + "rewards/get_chromagram_reward_std": 0.11888119354844093, + "rewards/get_intelligibility_reward": -4.770346236228943, + "rewards/get_intelligibility_reward_std": 9.799637031555175, + "rewards/get_target_len_reward": -0.019082391913980246, + "rewards/get_target_len_reward_std": 0.0465511741116643, + "step": 7610 + }, + { + "advantages": -2.140800191341441e-07, + "advantages_std": 1.5855832815170288, + "clip_ratio": 0.0, + "completion_length": 89.29226455688476, + "epoch": 5.730827067669173, + "grad_norm": 6.4375, + "kl": 0.3423966646194458, + "learning_rate": 2.1353383458646616e-06, + "loss": 0.0341, + "num_tokens": 80423645.0, + "reward": -1.2833735831081867, + "reward_std": 6.893964433670044, + "rewards/get_chromagram_reward": 0.6285513877868653, + "rewards/get_chromagram_reward_std": 0.11171592697501183, + "rewards/get_intelligibility_reward": -4.462010219693184, + "rewards/get_intelligibility_reward_std": 11.191399383544923, + "rewards/get_target_len_reward": -0.016661503352224827, + "rewards/get_target_len_reward_std": 0.042914232984185216, + "step": 7620 + }, + { + "advantages": 1.0579824163414742e-07, + "advantages_std": 1.7247357010841369, + "clip_ratio": 0.0, + "completion_length": 86.08928680419922, + "epoch": 5.738345864661654, + "grad_norm": 17.75, + "kl": 0.41501193344593046, + "learning_rate": 2.1315789473684212e-06, + "loss": 0.0444, + "num_tokens": 80727782.0, + "reward": -1.6765829205513, + "reward_std": 6.616048383712768, + "rewards/get_chromagram_reward": 0.6197386085987091, + "rewards/get_chromagram_reward_std": 0.1038549706339836, + "rewards/get_intelligibility_reward": -5.629750919342041, + "rewards/get_intelligibility_reward_std": 10.326595687866211, + "rewards/get_target_len_reward": -0.01973616676405072, + "rewards/get_target_len_reward_std": 0.05329372007399798, + "step": 7630 + }, + { + "advantages": 2.5952857072297777e-07, + "advantages_std": 1.517828369140625, + "clip_ratio": 0.0, + "completion_length": 89.41131210327148, + "epoch": 5.745864661654135, + "grad_norm": 7.6875, + "kl": 0.3498734712600708, + "learning_rate": 2.1278195488721805e-06, + "loss": 0.0336, + "num_tokens": 81041426.0, + "reward": -1.0377632051706314, + "reward_std": 6.863356018066407, + "rewards/get_chromagram_reward": 0.6367423355579376, + "rewards/get_chromagram_reward_std": 0.10499195754528046, + "rewards/get_intelligibility_reward": -3.7316944122314455, + "rewards/get_intelligibility_reward_std": 11.360644626617432, + "rewards/get_target_len_reward": -0.01833730023354292, + "rewards/get_target_len_reward_std": 0.04112956747412681, + "step": 7640 + }, + { + "advantages": 3.581245948680589e-07, + "advantages_std": 1.5799175381660462, + "clip_ratio": 0.0, + "completion_length": 88.0773826599121, + "epoch": 5.753383458646616, + "grad_norm": 8.1875, + "kl": 0.32614710479974746, + "learning_rate": 2.12406015037594e-06, + "loss": 0.0367, + "num_tokens": 81351297.0, + "reward": -1.2343915634031872, + "reward_std": 6.686226320266724, + "rewards/get_chromagram_reward": 0.6211168110370636, + "rewards/get_chromagram_reward_std": 0.10851754248142242, + "rewards/get_intelligibility_reward": -4.305308359861374, + "rewards/get_intelligibility_reward_std": 10.819230556488037, + "rewards/get_target_len_reward": -0.01898303721100092, + "rewards/get_target_len_reward_std": 0.05286200325936079, + "step": 7650 + }, + { + "advantages": 6.02876174582434e-07, + "advantages_std": 1.5238336324691772, + "clip_ratio": 0.0, + "completion_length": 82.91726379394531, + "epoch": 5.760902255639098, + "grad_norm": 6.6875, + "kl": 0.41712719202041626, + "learning_rate": 2.1203007518796993e-06, + "loss": 0.0433, + "num_tokens": 81647033.0, + "reward": -1.8270630359649658, + "reward_std": 7.040747261047363, + "rewards/get_chromagram_reward": 0.6239575922489167, + "rewards/get_chromagram_reward_std": 0.11793971508741379, + "rewards/get_intelligibility_reward": -6.084248375892639, + "rewards/get_intelligibility_reward_std": 10.95757598876953, + "rewards/get_target_len_reward": -0.020897910837084054, + "rewards/get_target_len_reward_std": 0.05306037589907646, + "step": 7660 + }, + { + "advantages": 1.9992392807921533e-07, + "advantages_std": 1.5137548208236695, + "clip_ratio": 0.0, + "completion_length": 86.46190567016602, + "epoch": 5.768421052631579, + "grad_norm": 4.8125, + "kl": 0.315133111178875, + "learning_rate": 2.116541353383459e-06, + "loss": 0.0352, + "num_tokens": 81953414.0, + "reward": -1.4287876427173614, + "reward_std": 6.695111894607544, + "rewards/get_chromagram_reward": 0.6318199157714843, + "rewards/get_chromagram_reward_std": 0.11796366795897484, + "rewards/get_intelligibility_reward": -4.898377990722656, + "rewards/get_intelligibility_reward_std": 10.68131456375122, + "rewards/get_target_len_reward": -0.019804536644369364, + "rewards/get_target_len_reward_std": 0.0511545468121767, + "step": 7670 + }, + { + "advantages": 4.3337545463373317e-07, + "advantages_std": 1.576704490184784, + "clip_ratio": 0.0, + "completion_length": 86.61726226806641, + "epoch": 5.77593984962406, + "grad_norm": 7.53125, + "kl": 0.29568569660186766, + "learning_rate": 2.112781954887218e-06, + "loss": 0.033, + "num_tokens": 82259441.0, + "reward": -0.9885002732276916, + "reward_std": 6.236829566955566, + "rewards/get_chromagram_reward": 0.6253950476646424, + "rewards/get_chromagram_reward_std": 0.10131782740354538, + "rewards/get_intelligibility_reward": -3.5707726955413817, + "rewards/get_intelligibility_reward_std": 10.291595935821533, + "rewards/get_target_len_reward": -0.020123045518994333, + "rewards/get_target_len_reward_std": 0.05881231594830751, + "step": 7680 + }, + { + "advantages": -6.640950957148561e-07, + "advantages_std": 1.5600481033325195, + "clip_ratio": 0.0, + "completion_length": 86.81904983520508, + "epoch": 5.783458646616541, + "grad_norm": 13.8125, + "kl": 0.33634247779846194, + "learning_rate": 2.109022556390978e-06, + "loss": 0.0394, + "num_tokens": 82566035.0, + "reward": -1.7160348892211914, + "reward_std": 6.599533224105835, + "rewards/get_chromagram_reward": 0.6210841238498688, + "rewards/get_chromagram_reward_std": 0.1084224171936512, + "rewards/get_intelligibility_reward": -5.749317216873169, + "rewards/get_intelligibility_reward_std": 10.231350469589234, + "rewards/get_target_len_reward": -0.019871008209884168, + "rewards/get_target_len_reward_std": 0.05601765606552363, + "step": 7690 + }, + { + "advantages": 2.1544597696987467e-07, + "advantages_std": 1.4199920415878295, + "clip_ratio": 0.0, + "completion_length": 85.48809661865235, + "epoch": 5.790977443609022, + "grad_norm": 9.0, + "kl": 0.3188432216644287, + "learning_rate": 2.105263157894737e-06, + "loss": 0.0401, + "num_tokens": 82869216.0, + "reward": -1.414464271068573, + "reward_std": 6.685119390487671, + "rewards/get_chromagram_reward": 0.6314283013343811, + "rewards/get_chromagram_reward_std": 0.11570866852998733, + "rewards/get_intelligibility_reward": -4.846628820896148, + "rewards/get_intelligibility_reward_std": 10.673337078094482, + "rewards/get_target_len_reward": -0.028192108776420356, + "rewards/get_target_len_reward_std": 0.08352104537189006, + "step": 7700 + }, + { + "advantages": 3.07957300549333e-08, + "advantages_std": 1.5753357529640197, + "clip_ratio": 0.0, + "completion_length": 89.41071548461915, + "epoch": 5.798496240601503, + "grad_norm": 8.1875, + "kl": 0.3553292080760002, + "learning_rate": 2.1015037593984963e-06, + "loss": 0.0424, + "num_tokens": 83183117.0, + "reward": -1.2461704462766647, + "reward_std": 6.781559991836548, + "rewards/get_chromagram_reward": 0.6093644201755524, + "rewards/get_chromagram_reward_std": 0.11293570399284363, + "rewards/get_intelligibility_reward": -4.326484024524689, + "rewards/get_intelligibility_reward_std": 11.03603982925415, + "rewards/get_target_len_reward": -0.0213914823718369, + "rewards/get_target_len_reward_std": 0.06803354378789664, + "step": 7710 + }, + { + "advantages": 4.512568470893541e-07, + "advantages_std": 1.483815038204193, + "clip_ratio": 0.0, + "completion_length": 87.26666870117188, + "epoch": 5.806015037593985, + "grad_norm": 7.15625, + "kl": 0.3535382956266403, + "learning_rate": 2.097744360902256e-06, + "loss": 0.0357, + "num_tokens": 83491814.0, + "reward": -1.2586440563201904, + "reward_std": 6.434259748458862, + "rewards/get_chromagram_reward": 0.6283224105834961, + "rewards/get_chromagram_reward_std": 0.12016476839780807, + "rewards/get_intelligibility_reward": -4.383782145380974, + "rewards/get_intelligibility_reward_std": 10.280045509338379, + "rewards/get_target_len_reward": -0.02047220030799508, + "rewards/get_target_len_reward_std": 0.047592471912503244, + "step": 7720 + }, + { + "advantages": 2.07871214286115e-07, + "advantages_std": 1.5112568855285644, + "clip_ratio": 0.0, + "completion_length": 85.03095397949218, + "epoch": 5.813533834586466, + "grad_norm": 6.84375, + "kl": 0.27754891514778135, + "learning_rate": 2.093984962406015e-06, + "loss": 0.0304, + "num_tokens": 83793626.0, + "reward": -1.3859003722667693, + "reward_std": 6.656201267242432, + "rewards/get_chromagram_reward": 0.6436172723770142, + "rewards/get_chromagram_reward_std": 0.1101572260260582, + "rewards/get_intelligibility_reward": -4.780960714817047, + "rewards/get_intelligibility_reward_std": 10.576345729827882, + "rewards/get_target_len_reward": -0.020357548724859953, + "rewards/get_target_len_reward_std": 0.05655680745840073, + "step": 7730 + }, + { + "advantages": 3.1640133970611826e-07, + "advantages_std": 1.6128756046295165, + "clip_ratio": 0.0, + "completion_length": 84.45476379394532, + "epoch": 5.821052631578947, + "grad_norm": 6.9375, + "kl": 0.36553671211004257, + "learning_rate": 2.090225563909775e-06, + "loss": 0.0402, + "num_tokens": 84093787.0, + "reward": -1.6369949102401733, + "reward_std": 6.2919535636901855, + "rewards/get_chromagram_reward": 0.6147024154663085, + "rewards/get_chromagram_reward_std": 0.12221779748797416, + "rewards/get_intelligibility_reward": -5.501388788223267, + "rewards/get_intelligibility_reward_std": 9.713677501678466, + "rewards/get_target_len_reward": -0.02429804615676403, + "rewards/get_target_len_reward_std": 0.07021530121564865, + "step": 7740 + }, + { + "advantages": 3.630916513230886e-07, + "advantages_std": 1.59818115234375, + "clip_ratio": 0.0, + "completion_length": 88.15774078369141, + "epoch": 5.828571428571428, + "grad_norm": 6.34375, + "kl": 0.3614527150988579, + "learning_rate": 2.086466165413534e-06, + "loss": 0.0421, + "num_tokens": 84403760.0, + "reward": -1.3270988881587982, + "reward_std": 6.766696786880493, + "rewards/get_chromagram_reward": 0.6228858053684234, + "rewards/get_chromagram_reward_std": 0.10732598975300789, + "rewards/get_intelligibility_reward": -4.585289144515992, + "rewards/get_intelligibility_reward_std": 10.932073068618774, + "rewards/get_target_len_reward": -0.01889320518821478, + "rewards/get_target_len_reward_std": 0.05684518478810787, + "step": 7750 + }, + { + "advantages": 1.343588138524865e-07, + "advantages_std": 1.5361783146858214, + "clip_ratio": 0.0, + "completion_length": 88.17143020629882, + "epoch": 5.836090225563909, + "grad_norm": 6.1875, + "kl": 0.341474187374115, + "learning_rate": 2.0827067669172937e-06, + "loss": 0.0388, + "num_tokens": 84713912.0, + "reward": -1.3829630866646767, + "reward_std": 6.212386417388916, + "rewards/get_chromagram_reward": 0.6140726923942565, + "rewards/get_chromagram_reward_std": 0.12224277853965759, + "rewards/get_intelligibility_reward": -4.742500221729278, + "rewards/get_intelligibility_reward_std": 9.82179946899414, + "rewards/get_target_len_reward": -0.020461480133235455, + "rewards/get_target_len_reward_std": 0.05814525857567787, + "step": 7760 + }, + { + "advantages": -2.9901662994546997e-07, + "advantages_std": 1.645896029472351, + "clip_ratio": 0.0, + "completion_length": 87.46190643310547, + "epoch": 5.843609022556391, + "grad_norm": 6.28125, + "kl": 0.3289607897400856, + "learning_rate": 2.078947368421053e-06, + "loss": 0.0401, + "num_tokens": 85020860.0, + "reward": -1.580271178483963, + "reward_std": 6.879051733016968, + "rewards/get_chromagram_reward": 0.6160095632076263, + "rewards/get_chromagram_reward_std": 0.10809171348810195, + "rewards/get_intelligibility_reward": -5.33557288646698, + "rewards/get_intelligibility_reward_std": 10.888761234283447, + "rewards/get_target_len_reward": -0.021249937638640404, + "rewards/get_target_len_reward_std": 0.0723442368209362, + "step": 7770 + }, + { + "advantages": -2.1358331139254006e-07, + "advantages_std": 1.487088394165039, + "clip_ratio": 0.0, + "completion_length": 87.21190643310547, + "epoch": 5.851127819548872, + "grad_norm": 12.9375, + "kl": 0.45677812695503234, + "learning_rate": 2.075187969924812e-06, + "loss": 0.0516, + "num_tokens": 85327139.0, + "reward": -1.6519016563892364, + "reward_std": 7.351545715332032, + "rewards/get_chromagram_reward": 0.6114323198795318, + "rewards/get_chromagram_reward_std": 0.1092762902379036, + "rewards/get_intelligibility_reward": -5.544794082641602, + "rewards/get_intelligibility_reward_std": 11.731245994567871, + "rewards/get_target_len_reward": -0.022342839650809763, + "rewards/get_target_len_reward_std": 0.0750244103372097, + "step": 7780 + }, + { + "advantages": -6.4571814561986685e-09, + "advantages_std": 1.5465797185897827, + "clip_ratio": 0.0, + "completion_length": 88.10119247436523, + "epoch": 5.858646616541353, + "grad_norm": 4.75, + "kl": 0.33119735270738604, + "learning_rate": 2.0714285714285717e-06, + "loss": 0.0384, + "num_tokens": 85636475.0, + "reward": -2.0101249754428863, + "reward_std": 7.168440437316894, + "rewards/get_chromagram_reward": 0.6261911392211914, + "rewards/get_chromagram_reward_std": 0.11736593246459961, + "rewards/get_intelligibility_reward": -6.6336499691009525, + "rewards/get_intelligibility_reward_std": 10.861632442474365, + "rewards/get_target_len_reward": -0.022915830463171007, + "rewards/get_target_len_reward_std": 0.07768816240131855, + "step": 7790 + }, + { + "advantages": -1.8725794106444483e-07, + "advantages_std": 1.5579930305480958, + "clip_ratio": 0.0, + "completion_length": 88.16666793823242, + "epoch": 5.866165413533834, + "grad_norm": 6.375, + "kl": 0.3023149937391281, + "learning_rate": 2.067669172932331e-06, + "loss": 0.0302, + "num_tokens": 85946568.0, + "reward": -1.6431579798460008, + "reward_std": 6.812919569015503, + "rewards/get_chromagram_reward": 0.6214454472064972, + "rewards/get_chromagram_reward_std": 0.11257565468549728, + "rewards/get_intelligibility_reward": -5.531436330080032, + "rewards/get_intelligibility_reward_std": 10.538073015213012, + "rewards/get_target_len_reward": -0.019482666440308095, + "rewards/get_target_len_reward_std": 0.044953730516135694, + "step": 7800 + }, + { + "advantages": -6.618599286412064e-08, + "advantages_std": 1.5409797191619874, + "clip_ratio": 0.0, + "completion_length": 86.61428604125976, + "epoch": 5.873684210526315, + "grad_norm": 6.9375, + "kl": 0.2656691923737526, + "learning_rate": 2.06390977443609e-06, + "loss": 0.0269, + "num_tokens": 86252028.0, + "reward": -1.5512358218431472, + "reward_std": 7.006728744506836, + "rewards/get_chromagram_reward": 0.6187478005886078, + "rewards/get_chromagram_reward_std": 0.10456798076629639, + "rewards/get_intelligibility_reward": -5.2547792315483095, + "rewards/get_intelligibility_reward_std": 11.130170154571534, + "rewards/get_target_len_reward": -0.01767557030543685, + "rewards/get_target_len_reward_std": 0.053920988366007803, + "step": 7810 + }, + { + "advantages": -3.606081140361539e-07, + "advantages_std": 1.4223445296287536, + "clip_ratio": 0.0, + "completion_length": 88.24464416503906, + "epoch": 5.881203007518797, + "grad_norm": 10.75, + "kl": 0.373692986369133, + "learning_rate": 2.06015037593985e-06, + "loss": 0.037, + "num_tokens": 86563084.0, + "reward": -1.4428786307573318, + "reward_std": 6.848339080810547, + "rewards/get_chromagram_reward": 0.6280100584030152, + "rewards/get_chromagram_reward_std": 0.11810790672898293, + "rewards/get_intelligibility_reward": -4.936243617534638, + "rewards/get_intelligibility_reward_std": 11.012694549560546, + "rewards/get_target_len_reward": -0.020402026176452637, + "rewards/get_target_len_reward_std": 0.0451013945043087, + "step": 7820 + }, + { + "advantages": 4.321336435353373e-08, + "advantages_std": 1.5101688921451568, + "clip_ratio": 0.0, + "completion_length": 88.09285736083984, + "epoch": 5.888721804511278, + "grad_norm": 6.5, + "kl": 0.39618532359600067, + "learning_rate": 2.056390977443609e-06, + "loss": 0.041, + "num_tokens": 86872944.0, + "reward": -1.3075867846608162, + "reward_std": 6.386407995223999, + "rewards/get_chromagram_reward": 0.6417074501514435, + "rewards/get_chromagram_reward_std": 0.10772898942232131, + "rewards/get_intelligibility_reward": -4.542570279538632, + "rewards/get_intelligibility_reward_std": 10.191238403320312, + "rewards/get_target_len_reward": -0.021897280309349298, + "rewards/get_target_len_reward_std": 0.06281909570097924, + "step": 7830 + }, + { + "advantages": -1.1250376275029339e-07, + "advantages_std": 1.5223307609558105, + "clip_ratio": 0.0, + "completion_length": 90.91964416503906, + "epoch": 5.896240601503759, + "grad_norm": 9.3125, + "kl": 0.34072367250919344, + "learning_rate": 2.0526315789473687e-06, + "loss": 0.0372, + "num_tokens": 87190111.0, + "reward": -1.261566150188446, + "reward_std": 6.712820816040039, + "rewards/get_chromagram_reward": 0.6167412042617798, + "rewards/get_chromagram_reward_std": 0.11125565245747567, + "rewards/get_intelligibility_reward": -4.383474278450012, + "rewards/get_intelligibility_reward_std": 10.852442836761474, + "rewards/get_target_len_reward": -0.01796508561819792, + "rewards/get_target_len_reward_std": 0.05169492810964584, + "step": 7840 + }, + { + "advantages": 1.0977189646155239e-07, + "advantages_std": 1.5909324765205384, + "clip_ratio": 0.0, + "completion_length": 84.8101203918457, + "epoch": 5.90375939849624, + "grad_norm": 22.0, + "kl": 0.3534141376614571, + "learning_rate": 2.048872180451128e-06, + "loss": 0.0369, + "num_tokens": 87490948.0, + "reward": -1.4347603440284729, + "reward_std": 5.89638843536377, + "rewards/get_chromagram_reward": 0.622400826215744, + "rewards/get_chromagram_reward_std": 0.11123741194605827, + "rewards/get_intelligibility_reward": -4.9074736595153805, + "rewards/get_intelligibility_reward_std": 9.186559772491455, + "rewards/get_target_len_reward": -0.019207833986729383, + "rewards/get_target_len_reward_std": 0.05386210381984711, + "step": 7850 + }, + { + "advantages": -2.622604597490863e-07, + "advantages_std": 1.5522445559501648, + "clip_ratio": 0.0, + "completion_length": 83.62976303100587, + "epoch": 5.9112781954887215, + "grad_norm": 7.71875, + "kl": 0.3019698172807693, + "learning_rate": 2.0451127819548876e-06, + "loss": 0.032, + "num_tokens": 87788686.0, + "reward": -1.4791161119937897, + "reward_std": 6.530420875549316, + "rewards/get_chromagram_reward": 0.6263103306293487, + "rewards/get_chromagram_reward_std": 0.11584180518984795, + "rewards/get_intelligibility_reward": -5.04234025478363, + "rewards/get_intelligibility_reward_std": 10.386770009994507, + "rewards/get_target_len_reward": -0.021318211499601603, + "rewards/get_target_len_reward_std": 0.05836506653577089, + "step": 7860 + }, + { + "advantages": -1.4901215372731258e-09, + "advantages_std": 1.7043312788009644, + "clip_ratio": 0.0, + "completion_length": 84.61190643310547, + "epoch": 5.918796992481203, + "grad_norm": 7.3125, + "kl": 0.330104099214077, + "learning_rate": 2.0413533834586468e-06, + "loss": 0.0425, + "num_tokens": 88088552.0, + "reward": -1.6420798242092132, + "reward_std": 6.732420825958252, + "rewards/get_chromagram_reward": 0.6319904744625091, + "rewards/get_chromagram_reward_std": 0.1173894077539444, + "rewards/get_intelligibility_reward": -5.5287513256073, + "rewards/get_intelligibility_reward_std": 10.562070035934449, + "rewards/get_target_len_reward": -0.029478291515260935, + "rewards/get_target_len_reward_std": 0.08697659857571124, + "step": 7870 + }, + { + "advantages": -1.7657876867360755e-07, + "advantages_std": 1.491753101348877, + "clip_ratio": 0.0, + "completion_length": 83.62678680419921, + "epoch": 5.926315789473684, + "grad_norm": 10.25, + "kl": 0.30500824749469757, + "learning_rate": 2.0375939849624064e-06, + "loss": 0.0323, + "num_tokens": 88386276.0, + "reward": -1.5912244275212288, + "reward_std": 6.555933570861816, + "rewards/get_chromagram_reward": 0.6297276735305786, + "rewards/get_chromagram_reward_std": 0.10537393242120743, + "rewards/get_intelligibility_reward": -5.385035419464112, + "rewards/get_intelligibility_reward_std": 10.15295705795288, + "rewards/get_target_len_reward": -0.018365173134952784, + "rewards/get_target_len_reward_std": 0.04643288180232048, + "step": 7880 + }, + { + "advantages": -2.3643176874088568e-07, + "advantages_std": 1.5616509437561035, + "clip_ratio": 0.0, + "completion_length": 84.15119171142578, + "epoch": 5.9338345864661655, + "grad_norm": 474.0, + "kl": 0.3755561888217926, + "learning_rate": 2.0338345864661656e-06, + "loss": 0.0409, + "num_tokens": 88685672.0, + "reward": -1.517099180072546, + "reward_std": 6.650594806671142, + "rewards/get_chromagram_reward": 0.6291989326477051, + "rewards/get_chromagram_reward_std": 0.10843588039278984, + "rewards/get_intelligibility_reward": -5.156713980436325, + "rewards/get_intelligibility_reward_std": 10.467614555358887, + "rewards/get_target_len_reward": -0.02378233168274164, + "rewards/get_target_len_reward_std": 0.06915392931550741, + "step": 7890 + }, + { + "advantages": 1.7955899256349994e-07, + "advantages_std": 1.5938949942588807, + "clip_ratio": 0.0, + "completion_length": 83.46071548461914, + "epoch": 5.9413533834586465, + "grad_norm": 9.5, + "kl": 0.4998282104730606, + "learning_rate": 2.030075187969925e-06, + "loss": 0.0578, + "num_tokens": 88983334.0, + "reward": -1.556469202041626, + "reward_std": 6.985205411911011, + "rewards/get_chromagram_reward": 0.6302753865718842, + "rewards/get_chromagram_reward_std": 0.12176822200417518, + "rewards/get_intelligibility_reward": -5.267308855056763, + "rewards/get_intelligibility_reward_std": 11.105116081237792, + "rewards/get_target_len_reward": -0.032373837940394876, + "rewards/get_target_len_reward_std": 0.09475215002894402, + "step": 7900 + }, + { + "advantages": 6.258487275090375e-08, + "advantages_std": 1.605884838104248, + "clip_ratio": 0.0, + "completion_length": 82.43928756713868, + "epoch": 5.9488721804511275, + "grad_norm": 37.25, + "kl": 0.36509293913841245, + "learning_rate": 2.026315789473684e-06, + "loss": 0.0391, + "num_tokens": 89277739.0, + "reward": -1.4767971098423005, + "reward_std": 6.863310861587524, + "rewards/get_chromagram_reward": 0.6345268189907074, + "rewards/get_chromagram_reward_std": 0.12481983080506324, + "rewards/get_intelligibility_reward": -5.037338101863861, + "rewards/get_intelligibility_reward_std": 10.954428386688232, + "rewards/get_target_len_reward": -0.02757984409108758, + "rewards/get_target_len_reward_std": 0.07784405499696731, + "step": 7910 + }, + { + "advantages": -8.953115724352755e-08, + "advantages_std": 1.486455249786377, + "clip_ratio": 0.0, + "completion_length": 88.00476379394532, + "epoch": 5.9563909774436095, + "grad_norm": 6.125, + "kl": 0.36002791225910186, + "learning_rate": 2.0225563909774437e-06, + "loss": 0.0416, + "num_tokens": 89587840.0, + "reward": -1.0802738130092622, + "reward_std": 6.528935480117798, + "rewards/get_chromagram_reward": 0.6253605365753174, + "rewards/get_chromagram_reward_std": 0.12136272937059403, + "rewards/get_intelligibility_reward": -3.8383922219276427, + "rewards/get_intelligibility_reward_std": 10.701953887939453, + "rewards/get_target_len_reward": -0.027789629716426133, + "rewards/get_target_len_reward_std": 0.07571598924696446, + "step": 7920 + }, + { + "advantages": -7.450581023249469e-08, + "advantages_std": 1.435997450351715, + "clip_ratio": 0.0, + "completion_length": 88.46964492797852, + "epoch": 5.9639097744360905, + "grad_norm": 18.75, + "kl": 0.3710513383150101, + "learning_rate": 2.018796992481203e-06, + "loss": 0.043, + "num_tokens": 89898232.0, + "reward": -1.3577034890651702, + "reward_std": 7.212418031692505, + "rewards/get_chromagram_reward": 0.6273108780384063, + "rewards/get_chromagram_reward_std": 0.11795835718512535, + "rewards/get_intelligibility_reward": -4.674716591835022, + "rewards/get_intelligibility_reward_std": 11.762153434753419, + "rewards/get_target_len_reward": -0.025704485923051835, + "rewards/get_target_len_reward_std": 0.07749940752983094, + "step": 7930 + }, + { + "advantages": 3.890445057663783e-07, + "advantages_std": 1.536833357810974, + "clip_ratio": 0.0, + "completion_length": 90.43154830932617, + "epoch": 5.9714285714285715, + "grad_norm": 6.21875, + "kl": 0.33514691740274427, + "learning_rate": 2.0150375939849626e-06, + "loss": 0.0336, + "num_tokens": 90215052.0, + "reward": -1.166587858274579, + "reward_std": 6.312453126907348, + "rewards/get_chromagram_reward": 0.6052829146385192, + "rewards/get_chromagram_reward_std": 0.10888011902570724, + "rewards/get_intelligibility_reward": -4.090318483114243, + "rewards/get_intelligibility_reward_std": 10.21594796180725, + "rewards/get_target_len_reward": -0.014727739710360765, + "rewards/get_target_len_reward_std": 0.04119625072926283, + "step": 7940 + }, + { + "advantages": -1.8924474396442292e-07, + "advantages_std": 1.5644995093345642, + "clip_ratio": 0.0, + "completion_length": 86.95238189697265, + "epoch": 5.978947368421053, + "grad_norm": 7.25, + "kl": 0.3163200944662094, + "learning_rate": 2.011278195488722e-06, + "loss": 0.0341, + "num_tokens": 90521684.0, + "reward": -1.7043726980686187, + "reward_std": 6.817698001861572, + "rewards/get_chromagram_reward": 0.6099827468395234, + "rewards/get_chromagram_reward_std": 0.12223256900906562, + "rewards/get_intelligibility_reward": -5.7000898838043215, + "rewards/get_intelligibility_reward_std": 10.658313989639282, + "rewards/get_target_len_reward": -0.023010530322790147, + "rewards/get_target_len_reward_std": 0.056262052804231646, + "step": 7950 + }, + { + "advantages": 8.924554137479391e-07, + "advantages_std": 1.5793184757232666, + "clip_ratio": 0.0, + "completion_length": 83.6452392578125, + "epoch": 5.986466165413534, + "grad_norm": 5.46875, + "kl": 0.3809826672077179, + "learning_rate": 2.0075187969924815e-06, + "loss": 0.0398, + "num_tokens": 90820167.0, + "reward": -1.5370681881904602, + "reward_std": 6.8732414722442625, + "rewards/get_chromagram_reward": 0.6148294448852539, + "rewards/get_chromagram_reward_std": 0.12173845618963242, + "rewards/get_intelligibility_reward": -5.202451133728028, + "rewards/get_intelligibility_reward_std": 10.899156856536866, + "rewards/get_target_len_reward": -0.023582598939538003, + "rewards/get_target_len_reward_std": 0.06225805208086967, + "step": 7960 + }, + { + "advantages": 2.734363164336173e-07, + "advantages_std": 1.6118045687675475, + "clip_ratio": 0.0, + "completion_length": 87.17857284545899, + "epoch": 5.9939849624060155, + "grad_norm": 6.96875, + "kl": 0.3206441327929497, + "learning_rate": 2.0037593984962407e-06, + "loss": 0.0338, + "num_tokens": 91127400.0, + "reward": -1.338904321193695, + "reward_std": 6.592849445343018, + "rewards/get_chromagram_reward": 0.6174337565898895, + "rewards/get_chromagram_reward_std": 0.1077630490064621, + "rewards/get_intelligibility_reward": -4.616280210018158, + "rewards/get_intelligibility_reward_std": 10.516047191619872, + "rewards/get_target_len_reward": -0.017866184283047915, + "rewards/get_target_len_reward_std": 0.050554357655346396, + "step": 7970 + }, + { + "advantages": 3.6954881466044753e-07, + "advantages_std": 1.5720568537712096, + "clip_ratio": 0.0, + "completion_length": 86.29333572387695, + "epoch": 6.002255639097744, + "grad_norm": 7264.0, + "kl": 1.0428142532706262, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1081, + "num_tokens": 91433355.0, + "reward": -1.3834814786911012, + "reward_std": 6.725188589096069, + "rewards/get_chromagram_reward": 0.6308774054050446, + "rewards/get_chromagram_reward_std": 0.11067381277680396, + "rewards/get_intelligibility_reward": -4.760740184783936, + "rewards/get_intelligibility_reward_std": 10.797851181030273, + "rewards/get_target_len_reward": -0.02058134414255619, + "rewards/get_target_len_reward_std": 0.05509445741772652, + "step": 7980 + }, + { + "advantages": 6.395081754817511e-07, + "advantages_std": 1.533771240711212, + "clip_ratio": 0.0, + "completion_length": 85.02262115478516, + "epoch": 6.009774436090225, + "grad_norm": 7.875, + "kl": 0.2817109614610672, + "learning_rate": 1.9962406015037596e-06, + "loss": 0.034, + "num_tokens": 91734631.0, + "reward": -1.4631374210119248, + "reward_std": 6.582286691665649, + "rewards/get_chromagram_reward": 0.6157466650009156, + "rewards/get_chromagram_reward_std": 0.10654039457440376, + "rewards/get_intelligibility_reward": -4.983345782756805, + "rewards/get_intelligibility_reward_std": 10.402837800979615, + "rewards/get_target_len_reward": -0.021813186444342138, + "rewards/get_target_len_reward_std": 0.0684099044650793, + "step": 7990 + }, + { + "advantages": -5.8611242081951787e-08, + "advantages_std": 1.5856661796569824, + "clip_ratio": 0.0, + "completion_length": 87.34166793823242, + "epoch": 6.0172932330827065, + "grad_norm": 8.9375, + "kl": 0.3133195355534554, + "learning_rate": 1.9924812030075188e-06, + "loss": 0.0361, + "num_tokens": 92042898.0, + "reward": -1.4364587038755416, + "reward_std": 6.61562328338623, + "rewards/get_chromagram_reward": 0.6262919783592225, + "rewards/get_chromagram_reward_std": 0.11840886026620864, + "rewards/get_intelligibility_reward": -4.914097237586975, + "rewards/get_intelligibility_reward_std": 10.516135978698731, + "rewards/get_target_len_reward": -0.021570559963583945, + "rewards/get_target_len_reward_std": 0.06995179653167724, + "step": 8000 + }, + { + "advantages": 2.9305617132990848e-08, + "advantages_std": 1.611814260482788, + "clip_ratio": 0.0, + "completion_length": 90.39702529907227, + "epoch": 6.024812030075188, + "grad_norm": 30.75, + "kl": 0.3496403008699417, + "learning_rate": 1.9887218045112784e-06, + "loss": 0.0344, + "num_tokens": 92359567.0, + "reward": -1.0210472345352173, + "reward_std": 6.469062328338623, + "rewards/get_chromagram_reward": 0.6231420278549195, + "rewards/get_chromagram_reward_std": 0.10452277362346649, + "rewards/get_intelligibility_reward": -3.6649996519088743, + "rewards/get_intelligibility_reward_std": 10.687770748138428, + "rewards/get_target_len_reward": -0.021283876802772283, + "rewards/get_target_len_reward_std": 0.05394844859838486, + "step": 8010 + }, + { + "advantages": -6.124377453176066e-07, + "advantages_std": 1.4987279534339906, + "clip_ratio": 0.0, + "completion_length": 83.43869171142578, + "epoch": 6.032330827067669, + "grad_norm": 163.0, + "kl": 0.384149894118309, + "learning_rate": 1.9849624060150376e-06, + "loss": 0.0418, + "num_tokens": 92656476.0, + "reward": -1.8202392339706421, + "reward_std": 6.662428855895996, + "rewards/get_chromagram_reward": 0.6189577221870423, + "rewards/get_chromagram_reward_std": 0.12275770530104638, + "rewards/get_intelligibility_reward": -6.0548642635345455, + "rewards/get_intelligibility_reward_std": 10.261438941955566, + "rewards/get_target_len_reward": -0.024811002239584923, + "rewards/get_target_len_reward_std": 0.06445497255772352, + "step": 8020 + }, + { + "advantages": -1.5820067140737136e-07, + "advantages_std": 1.6390297174453736, + "clip_ratio": 0.0, + "completion_length": 87.66785888671875, + "epoch": 6.0398496240601505, + "grad_norm": 18.0, + "kl": 0.31635782122612, + "learning_rate": 1.9812030075187973e-06, + "loss": 0.0319, + "num_tokens": 92966368.0, + "reward": -1.2295335441827775, + "reward_std": 6.481088876724243, + "rewards/get_chromagram_reward": 0.6162499785423279, + "rewards/get_chromagram_reward_std": 0.10878583490848541, + "rewards/get_intelligibility_reward": -4.289811539649963, + "rewards/get_intelligibility_reward_std": 10.47708339691162, + "rewards/get_target_len_reward": -0.015038707852363586, + "rewards/get_target_len_reward_std": 0.04038047567009926, + "step": 8030 + }, + { + "advantages": -4.823009362553421e-07, + "advantages_std": 1.5642240166664123, + "clip_ratio": 0.0, + "completion_length": 88.17440567016601, + "epoch": 6.0473684210526315, + "grad_norm": 9.375, + "kl": 0.6099935069680213, + "learning_rate": 1.9774436090225565e-06, + "loss": 0.067, + "num_tokens": 93276192.0, + "reward": -1.6536858409643174, + "reward_std": 6.549434661865234, + "rewards/get_chromagram_reward": 0.6186293482780456, + "rewards/get_chromagram_reward_std": 0.125721525400877, + "rewards/get_intelligibility_reward": -5.550586187839508, + "rewards/get_intelligibility_reward_std": 10.16653528213501, + "rewards/get_target_len_reward": -0.0291005146689713, + "rewards/get_target_len_reward_std": 0.09697311259806156, + "step": 8040 + }, + { + "advantages": 2.3705265448370482e-07, + "advantages_std": 1.6255091428756714, + "clip_ratio": 0.0, + "completion_length": 84.94821624755859, + "epoch": 6.0548872180451125, + "grad_norm": 7.90625, + "kl": 0.34752358943223954, + "learning_rate": 1.973684210526316e-06, + "loss": 0.0392, + "num_tokens": 93576988.0, + "reward": -1.528093084692955, + "reward_std": 6.340835666656494, + "rewards/get_chromagram_reward": 0.6186446070671081, + "rewards/get_chromagram_reward_std": 0.12236835733056069, + "rewards/get_intelligibility_reward": -5.179726958274841, + "rewards/get_intelligibility_reward_std": 9.91945195198059, + "rewards/get_target_len_reward": -0.02319667050614953, + "rewards/get_target_len_reward_std": 0.06414449084550142, + "step": 8050 + }, + { + "advantages": -2.868473625738943e-07, + "advantages_std": 1.5047789096832276, + "clip_ratio": 0.0, + "completion_length": 87.94285888671875, + "epoch": 6.062406015037594, + "grad_norm": 6.21875, + "kl": 0.31508910208940505, + "learning_rate": 1.9699248120300754e-06, + "loss": 0.0333, + "num_tokens": 93886561.0, + "reward": -1.2129630863666534, + "reward_std": 6.305604791641235, + "rewards/get_chromagram_reward": 0.6133894443511962, + "rewards/get_chromagram_reward_std": 0.11958390176296234, + "rewards/get_intelligibility_reward": -4.229742407798767, + "rewards/get_intelligibility_reward_std": 10.233415031433106, + "rewards/get_target_len_reward": -0.022536069620400667, + "rewards/get_target_len_reward_std": 0.06264531053602695, + "step": 8060 + }, + { + "advantages": 2.2302072437696553e-07, + "advantages_std": 1.6339765667915345, + "clip_ratio": 0.0, + "completion_length": 86.25595321655274, + "epoch": 6.0699248120300755, + "grad_norm": 7.53125, + "kl": 0.26957926601171495, + "learning_rate": 1.966165413533835e-06, + "loss": 0.0311, + "num_tokens": 94191443.0, + "reward": -1.3586907029151916, + "reward_std": 6.435478687286377, + "rewards/get_chromagram_reward": 0.6246838212013245, + "rewards/get_chromagram_reward_std": 0.10953456312417983, + "rewards/get_intelligibility_reward": -4.679515743255616, + "rewards/get_intelligibility_reward_std": 10.33646240234375, + "rewards/get_target_len_reward": -0.021239984221756457, + "rewards/get_target_len_reward_std": 0.06747781485319138, + "step": 8070 + }, + { + "advantages": 1.822908856752292e-07, + "advantages_std": 1.5817332983016967, + "clip_ratio": 0.0, + "completion_length": 88.4273811340332, + "epoch": 6.0774436090225565, + "grad_norm": 12.375, + "kl": 0.30030532777309416, + "learning_rate": 1.9624060150375942e-06, + "loss": 0.0298, + "num_tokens": 94503323.0, + "reward": -1.3621489562094211, + "reward_std": 6.7359175205230715, + "rewards/get_chromagram_reward": 0.609242957830429, + "rewards/get_chromagram_reward_std": 0.1111322857439518, + "rewards/get_intelligibility_reward": -4.679346296191215, + "rewards/get_intelligibility_reward_std": 10.767370700836182, + "rewards/get_target_len_reward": -0.016343369521200656, + "rewards/get_target_len_reward_std": 0.047802192904055116, + "step": 8080 + }, + { + "advantages": 1.2479723672242925e-07, + "advantages_std": 1.5469775915145874, + "clip_ratio": 0.0, + "completion_length": 88.78869323730468, + "epoch": 6.084962406015038, + "grad_norm": 8.375, + "kl": 0.3110333472490311, + "learning_rate": 1.9586466165413535e-06, + "loss": 0.0299, + "num_tokens": 94815534.0, + "reward": -1.3442703269422054, + "reward_std": 6.71604962348938, + "rewards/get_chromagram_reward": 0.6142023801803589, + "rewards/get_chromagram_reward_std": 0.1096267469227314, + "rewards/get_intelligibility_reward": -4.628188914060592, + "rewards/get_intelligibility_reward_std": 10.82657985687256, + "rewards/get_target_len_reward": -0.018824245547875762, + "rewards/get_target_len_reward_std": 0.04439031798392534, + "step": 8090 + }, + { + "advantages": -8.245309075505247e-08, + "advantages_std": 1.5762543678283691, + "clip_ratio": 0.0, + "completion_length": 87.75893020629883, + "epoch": 6.092481203007519, + "grad_norm": 6.8125, + "kl": 0.6441232696175575, + "learning_rate": 1.9548872180451127e-06, + "loss": 0.0639, + "num_tokens": 95124092.0, + "reward": -1.5475315034389496, + "reward_std": 6.789844131469726, + "rewards/get_chromagram_reward": 0.6069821238517761, + "rewards/get_chromagram_reward_std": 0.10674103200435639, + "rewards/get_intelligibility_reward": -5.235275602340698, + "rewards/get_intelligibility_reward_std": 10.816466999053954, + "rewards/get_target_len_reward": -0.014300601463764906, + "rewards/get_target_len_reward_std": 0.03731911201030016, + "step": 8100 + }, + { + "advantages": 5.935629419084876e-08, + "advantages_std": 1.6695214748382567, + "clip_ratio": 0.0, + "completion_length": 87.73809661865235, + "epoch": 6.1, + "grad_norm": 8.3125, + "kl": 0.3115114450454712, + "learning_rate": 1.9511278195488723e-06, + "loss": 0.0318, + "num_tokens": 95433893.0, + "reward": -0.9923997074365616, + "reward_std": 6.4153810977935795, + "rewards/get_chromagram_reward": 0.6363845229148865, + "rewards/get_chromagram_reward_std": 0.11569565311074256, + "rewards/get_intelligibility_reward": -3.5924025774002075, + "rewards/get_intelligibility_reward_std": 10.606056118011475, + "rewards/get_target_len_reward": -0.02118076141923666, + "rewards/get_target_len_reward_std": 0.05613228138536215, + "step": 8110 + }, + { + "advantages": -4.447996587941816e-07, + "advantages_std": 1.6991694092750549, + "clip_ratio": 0.0, + "completion_length": 89.45952606201172, + "epoch": 6.107518796992482, + "grad_norm": 6.125, + "kl": 0.3138931080698967, + "learning_rate": 1.9473684210526315e-06, + "loss": 0.0337, + "num_tokens": 95747313.0, + "reward": -1.5589805111289023, + "reward_std": 6.462708234786987, + "rewards/get_chromagram_reward": 0.6133163690567016, + "rewards/get_chromagram_reward_std": 0.11356438770890236, + "rewards/get_intelligibility_reward": -5.270368266105652, + "rewards/get_intelligibility_reward_std": 10.066627788543702, + "rewards/get_target_len_reward": -0.01988951340317726, + "rewards/get_target_len_reward_std": 0.05338175091892481, + "step": 8120 + }, + { + "advantages": -3.0820569492107096e-07, + "advantages_std": 1.469941759109497, + "clip_ratio": 0.0, + "completion_length": 85.25238189697265, + "epoch": 6.115037593984963, + "grad_norm": 31.125, + "kl": 0.3752498090267181, + "learning_rate": 1.943609022556391e-06, + "loss": 0.0427, + "num_tokens": 96049192.0, + "reward": -1.7108392238616943, + "reward_std": 7.364109134674072, + "rewards/get_chromagram_reward": 0.6208688795566559, + "rewards/get_chromagram_reward_std": 0.1251021847128868, + "rewards/get_intelligibility_reward": -5.730622339248657, + "rewards/get_intelligibility_reward_std": 11.692827415466308, + "rewards/get_target_len_reward": -0.022763955313712357, + "rewards/get_target_len_reward_std": 0.06721158996224404, + "step": 8130 + }, + { + "advantages": -1.5075008619191976e-07, + "advantages_std": 1.5935341954231261, + "clip_ratio": 0.0, + "completion_length": 88.75476455688477, + "epoch": 6.122556390977444, + "grad_norm": 6.1875, + "kl": 0.3593511641025543, + "learning_rate": 1.9398496240601504e-06, + "loss": 0.0405, + "num_tokens": 96361424.0, + "reward": -1.5784752249717713, + "reward_std": 6.592087030410767, + "rewards/get_chromagram_reward": 0.6113418281078339, + "rewards/get_chromagram_reward_std": 0.11505894362926483, + "rewards/get_intelligibility_reward": -5.325995564460754, + "rewards/get_intelligibility_reward_std": 10.418634796142578, + "rewards/get_target_len_reward": -0.02077172938734293, + "rewards/get_target_len_reward_std": 0.062492662109434605, + "step": 8140 + }, + { + "advantages": 3.998478298683494e-07, + "advantages_std": 1.515890085697174, + "clip_ratio": 0.0, + "completion_length": 84.33035812377929, + "epoch": 6.130075187969925, + "grad_norm": 6.96875, + "kl": 0.2902126759290695, + "learning_rate": 1.93609022556391e-06, + "loss": 0.0302, + "num_tokens": 96661063.0, + "reward": -1.3295504868030548, + "reward_std": 6.396369504928589, + "rewards/get_chromagram_reward": 0.6250507950782775, + "rewards/get_chromagram_reward_std": 0.12326680570840835, + "rewards/get_intelligibility_reward": -4.596553935110569, + "rewards/get_intelligibility_reward_std": 10.163883447647095, + "rewards/get_target_len_reward": -0.017148146592080592, + "rewards/get_target_len_reward_std": 0.04615513402968645, + "step": 8150 + }, + { + "advantages": -2.834946002394645e-07, + "advantages_std": 1.62041996717453, + "clip_ratio": 0.0, + "completion_length": 88.4428596496582, + "epoch": 6.137593984962406, + "grad_norm": 7.25, + "kl": 0.29210630506277085, + "learning_rate": 1.9323308270676693e-06, + "loss": 0.0338, + "num_tokens": 96972724.0, + "reward": -1.1920385241508484, + "reward_std": 6.446213865280152, + "rewards/get_chromagram_reward": 0.624556976556778, + "rewards/get_chromagram_reward_std": 0.10356669053435326, + "rewards/get_intelligibility_reward": -4.179461181163788, + "rewards/get_intelligibility_reward_std": 10.4103515625, + "rewards/get_target_len_reward": -0.02121110763400793, + "rewards/get_target_len_reward_std": 0.06671831868588925, + "step": 8160 + }, + { + "advantages": 2.3022295110308731e-07, + "advantages_std": 1.5382867932319642, + "clip_ratio": 0.0, + "completion_length": 82.35952606201172, + "epoch": 6.145112781954888, + "grad_norm": 8.375, + "kl": 0.3417093217372894, + "learning_rate": 1.928571428571429e-06, + "loss": 0.0426, + "num_tokens": 97266863.0, + "reward": -1.7890142560005189, + "reward_std": 6.895098543167114, + "rewards/get_chromagram_reward": 0.6114492297172547, + "rewards/get_chromagram_reward_std": 0.12038289308547974, + "rewards/get_intelligibility_reward": -5.953407573699951, + "rewards/get_intelligibility_reward_std": 10.688486623764039, + "rewards/get_target_len_reward": -0.025084178801625966, + "rewards/get_target_len_reward_std": 0.0768413070589304, + "step": 8170 + }, + { + "advantages": 2.2798777941090975e-07, + "advantages_std": 1.5938152074813843, + "clip_ratio": 0.0, + "completion_length": 87.05595245361329, + "epoch": 6.152631578947369, + "grad_norm": 7.15625, + "kl": 0.30286179631948473, + "learning_rate": 1.924812030075188e-06, + "loss": 0.0342, + "num_tokens": 97573766.0, + "reward": -1.5368107587099076, + "reward_std": 7.258013725280762, + "rewards/get_chromagram_reward": 0.6315832614898682, + "rewards/get_chromagram_reward_std": 0.11302488297224045, + "rewards/get_intelligibility_reward": -5.219581270217896, + "rewards/get_intelligibility_reward_std": 11.637852764129638, + "rewards/get_target_len_reward": -0.02243395196273923, + "rewards/get_target_len_reward_std": 0.06936036106199026, + "step": 8180 + }, + { + "advantages": -5.406637967553251e-07, + "advantages_std": 1.5901297569274901, + "clip_ratio": 0.0, + "completion_length": 83.93274002075195, + "epoch": 6.16015037593985, + "grad_norm": 31.75, + "kl": 0.3345526769757271, + "learning_rate": 1.9210526315789474e-06, + "loss": 0.0365, + "num_tokens": 97871945.0, + "reward": -1.9582500457763672, + "reward_std": 6.844857597351075, + "rewards/get_chromagram_reward": 0.6026628851890564, + "rewards/get_chromagram_reward_std": 0.1115984320640564, + "rewards/get_intelligibility_reward": -6.46079785823822, + "rewards/get_intelligibility_reward_std": 10.43050413131714, + "rewards/get_target_len_reward": -0.016614936850965024, + "rewards/get_target_len_reward_std": 0.053378655947744845, + "step": 8190 + }, + { + "advantages": 5.277494210531586e-08, + "advantages_std": 1.4953957557678224, + "clip_ratio": 0.0, + "completion_length": 88.58690567016602, + "epoch": 6.167669172932331, + "grad_norm": 8.375, + "kl": 0.49890299141407013, + "learning_rate": 1.917293233082707e-06, + "loss": 0.0509, + "num_tokens": 98183337.0, + "reward": -1.4810203466564418, + "reward_std": 6.8311989307403564, + "rewards/get_chromagram_reward": 0.6411927580833435, + "rewards/get_chromagram_reward_std": 0.11407085806131363, + "rewards/get_intelligibility_reward": -5.062011855840683, + "rewards/get_intelligibility_reward_std": 10.800096607208252, + "rewards/get_target_len_reward": -0.022241707518696786, + "rewards/get_target_len_reward_std": 0.04992542583495378, + "step": 8200 + }, + { + "advantages": 3.0721227730623466e-07, + "advantages_std": 1.7018776655197143, + "clip_ratio": 0.0, + "completion_length": 87.16607208251953, + "epoch": 6.175187969924812, + "grad_norm": 28.5, + "kl": 0.402908644080162, + "learning_rate": 1.9135338345864662e-06, + "loss": 0.0441, + "num_tokens": 98491721.0, + "reward": -1.239027801156044, + "reward_std": 6.582078361511231, + "rewards/get_chromagram_reward": 0.6156240582466126, + "rewards/get_chromagram_reward_std": 0.1105881929397583, + "rewards/get_intelligibility_reward": -4.306832981109619, + "rewards/get_intelligibility_reward_std": 10.683900594711304, + "rewards/get_target_len_reward": -0.025874282885342836, + "rewards/get_target_len_reward_std": 0.07156331483274699, + "step": 8210 + }, + { + "advantages": -2.3345153392639873e-07, + "advantages_std": 1.6137319803237915, + "clip_ratio": 0.0, + "completion_length": 84.79285736083985, + "epoch": 6.182706766917293, + "grad_norm": 7.5625, + "kl": 0.4480709329247475, + "learning_rate": 1.909774436090226e-06, + "loss": 0.0512, + "num_tokens": 98792316.0, + "reward": -1.655766987800598, + "reward_std": 6.44510407447815, + "rewards/get_chromagram_reward": 0.6254911303520203, + "rewards/get_chromagram_reward_std": 0.11293328404426575, + "rewards/get_intelligibility_reward": -5.572954297065735, + "rewards/get_intelligibility_reward_std": 9.979540252685547, + "rewards/get_target_len_reward": -0.01983743775635958, + "rewards/get_target_len_reward_std": 0.05844023115932941, + "step": 8220 + }, + { + "advantages": -7.078050803244196e-08, + "advantages_std": 1.6125649809837341, + "clip_ratio": 0.0, + "completion_length": 85.79345474243163, + "epoch": 6.190225563909775, + "grad_norm": 186.0, + "kl": 3.631215937435627, + "learning_rate": 1.906015037593985e-06, + "loss": 0.3673, + "num_tokens": 99095940.0, + "reward": -1.8117383182048798, + "reward_std": 6.934366941452026, + "rewards/get_chromagram_reward": 0.6273930549621582, + "rewards/get_chromagram_reward_std": 0.1096891388297081, + "rewards/get_intelligibility_reward": -6.042661952972412, + "rewards/get_intelligibility_reward_std": 10.70831527709961, + "rewards/get_target_len_reward": -0.019945572968572377, + "rewards/get_target_len_reward_std": 0.06038584988564253, + "step": 8230 + }, + { + "advantages": 3.5768996617946416e-07, + "advantages_std": 1.748731517791748, + "clip_ratio": 0.0, + "completion_length": 89.18452529907226, + "epoch": 6.197744360902256, + "grad_norm": 5.3125, + "kl": 0.3243556499481201, + "learning_rate": 1.9022556390977445e-06, + "loss": 0.0343, + "num_tokens": 99409450.0, + "reward": -1.232142798602581, + "reward_std": 6.382209587097168, + "rewards/get_chromagram_reward": 0.6276563227176666, + "rewards/get_chromagram_reward_std": 0.11100775673985482, + "rewards/get_intelligibility_reward": -4.3032633543014525, + "rewards/get_intelligibility_reward_std": 10.270160007476807, + "rewards/get_target_len_reward": -0.0208210751414299, + "rewards/get_target_len_reward_std": 0.055809604562819005, + "step": 8240 + }, + { + "advantages": 5.664924970005814e-07, + "advantages_std": 1.6130936205387116, + "clip_ratio": 0.0, + "completion_length": 88.2357162475586, + "epoch": 6.205263157894737, + "grad_norm": 7.0, + "kl": 0.4117813140153885, + "learning_rate": 1.898496240601504e-06, + "loss": 0.0465, + "num_tokens": 99718961.0, + "reward": -1.6371686838567256, + "reward_std": 7.206591939926147, + "rewards/get_chromagram_reward": 0.6129654109477997, + "rewards/get_chromagram_reward_std": 0.10508479103446007, + "rewards/get_intelligibility_reward": -5.503394261002541, + "rewards/get_intelligibility_reward_std": 11.317814826965332, + "rewards/get_target_len_reward": -0.021076952386647464, + "rewards/get_target_len_reward_std": 0.07323625609278679, + "step": 8250 + }, + { + "advantages": 1.3535223644112194e-07, + "advantages_std": 1.5970824837684632, + "clip_ratio": 0.0, + "completion_length": 87.04643173217774, + "epoch": 6.212781954887218, + "grad_norm": 8.1875, + "kl": 0.660678879916668, + "learning_rate": 1.8947368421052634e-06, + "loss": 0.0729, + "num_tokens": 100026364.0, + "reward": -1.2636336654424667, + "reward_std": 6.453917837142944, + "rewards/get_chromagram_reward": 0.6379686415195465, + "rewards/get_chromagram_reward_std": 0.10256423130631447, + "rewards/get_intelligibility_reward": -4.404207837581635, + "rewards/get_intelligibility_reward_std": 10.436718273162843, + "rewards/get_target_len_reward": -0.024661644268780945, + "rewards/get_target_len_reward_std": 0.06927633434534072, + "step": 8260 + }, + { + "advantages": -2.4090209649330064e-08, + "advantages_std": 1.5134394288063049, + "clip_ratio": 0.0, + "completion_length": 88.10119171142578, + "epoch": 6.220300751879699, + "grad_norm": 13.25, + "kl": 0.3364943116903305, + "learning_rate": 1.8909774436090228e-06, + "loss": 0.0371, + "num_tokens": 100336374.0, + "reward": -1.2326872587203979, + "reward_std": 6.58498969078064, + "rewards/get_chromagram_reward": 0.6152431964874268, + "rewards/get_chromagram_reward_std": 0.09777917936444283, + "rewards/get_intelligibility_reward": -4.295182102918625, + "rewards/get_intelligibility_reward_std": 10.663773727416991, + "rewards/get_target_len_reward": -0.0181226521730423, + "rewards/get_target_len_reward_std": 0.06043642610311508, + "step": 8270 + }, + { + "advantages": -2.585351523975987e-07, + "advantages_std": 1.4786474108695984, + "clip_ratio": 0.0, + "completion_length": 87.06488189697265, + "epoch": 6.227819548872181, + "grad_norm": 9.0625, + "kl": 0.32379979491233823, + "learning_rate": 1.8872180451127823e-06, + "loss": 0.0392, + "num_tokens": 100643174.0, + "reward": -1.5286317825317384, + "reward_std": 6.597978019714356, + "rewards/get_chromagram_reward": 0.6205591857433319, + "rewards/get_chromagram_reward_std": 0.11373497024178505, + "rewards/get_intelligibility_reward": -5.185364344716072, + "rewards/get_intelligibility_reward_std": 10.417848777770995, + "rewards/get_target_len_reward": -0.021089773811399936, + "rewards/get_target_len_reward_std": 0.06376664116978645, + "step": 8280 + }, + { + "advantages": 3.129243943078563e-07, + "advantages_std": 1.6236744046211242, + "clip_ratio": 0.0, + "completion_length": 90.57024002075195, + "epoch": 6.235338345864662, + "grad_norm": 6.8125, + "kl": 0.37636581659317014, + "learning_rate": 1.8834586466165413e-06, + "loss": 0.044, + "num_tokens": 100960519.0, + "reward": -1.2806937724351883, + "reward_std": 6.859244298934937, + "rewards/get_chromagram_reward": 0.6241738677024842, + "rewards/get_chromagram_reward_std": 0.11543380096554756, + "rewards/get_intelligibility_reward": -4.445393490791321, + "rewards/get_intelligibility_reward_std": 10.988745975494385, + "rewards/get_target_len_reward": -0.020861545857042075, + "rewards/get_target_len_reward_std": 0.06513547562062741, + "step": 8290 + }, + { + "advantages": 4.6193599700927736e-08, + "advantages_std": 1.6427213549613953, + "clip_ratio": 0.0, + "completion_length": 87.55000305175781, + "epoch": 6.242857142857143, + "grad_norm": 9.125, + "kl": 0.3940353602170944, + "learning_rate": 1.8796992481203007e-06, + "loss": 0.0439, + "num_tokens": 101268639.0, + "reward": -1.496222859621048, + "reward_std": 6.498574066162109, + "rewards/get_chromagram_reward": 0.631698876619339, + "rewards/get_chromagram_reward_std": 0.11189599186182023, + "rewards/get_intelligibility_reward": -5.096735191345215, + "rewards/get_intelligibility_reward_std": 10.293656492233277, + "rewards/get_target_len_reward": -0.023632081225514412, + "rewards/get_target_len_reward_std": 0.06621262319386005, + "step": 8300 + }, + { + "advantages": -3.79979621811799e-07, + "advantages_std": 1.5759154319763184, + "clip_ratio": 0.0, + "completion_length": 83.46607208251953, + "epoch": 6.250375939849624, + "grad_norm": 7.25, + "kl": 12158.842534568907, + "learning_rate": 1.8759398496240601e-06, + "loss": 1215.8889, + "num_tokens": 101566074.0, + "reward": -1.5842579126358032, + "reward_std": 6.644231128692627, + "rewards/get_chromagram_reward": 0.6195006787776947, + "rewards/get_chromagram_reward_std": 0.11031231805682182, + "rewards/get_intelligibility_reward": -5.348876094818115, + "rewards/get_intelligibility_reward_std": 10.512411499023438, + "rewards/get_target_len_reward": -0.023398030642420055, + "rewards/get_target_len_reward_std": 0.06640774458646774, + "step": 8310 + }, + { + "advantages": 1.5286108485668136e-07, + "advantages_std": 1.6544549465179443, + "clip_ratio": 0.0, + "completion_length": 83.01488189697265, + "epoch": 6.257894736842105, + "grad_norm": 7.78125, + "kl": 0.3186744153499603, + "learning_rate": 1.8721804511278196e-06, + "loss": 0.0353, + "num_tokens": 101862129.0, + "reward": -1.4059077441692351, + "reward_std": 6.380411100387573, + "rewards/get_chromagram_reward": 0.6233551204204559, + "rewards/get_chromagram_reward_std": 0.11690683215856552, + "rewards/get_intelligibility_reward": -4.819263887405396, + "rewards/get_intelligibility_reward_std": 10.128415775299072, + "rewards/get_target_len_reward": -0.021814127545803787, + "rewards/get_target_len_reward_std": 0.06665037646889686, + "step": 8320 + }, + { + "advantages": 1.490116545710407e-08, + "advantages_std": 1.5730611205101013, + "clip_ratio": 0.0, + "completion_length": 87.61488189697266, + "epoch": 6.265413533834587, + "grad_norm": 5.09375, + "kl": 0.33536899983882906, + "learning_rate": 1.868421052631579e-06, + "loss": 0.043, + "num_tokens": 102170609.0, + "reward": -1.378728559613228, + "reward_std": 6.750386762619018, + "rewards/get_chromagram_reward": 0.6135457396507263, + "rewards/get_chromagram_reward_std": 0.12062636762857437, + "rewards/get_intelligibility_reward": -4.7219622254371645, + "rewards/get_intelligibility_reward_std": 10.827591705322266, + "rewards/get_target_len_reward": -0.027769038919359446, + "rewards/get_target_len_reward_std": 0.08973861802369357, + "step": 8330 + }, + { + "advantages": 1.559654998928295e-07, + "advantages_std": 1.5384475350379945, + "clip_ratio": 0.0, + "completion_length": 88.77202453613282, + "epoch": 6.272932330827068, + "grad_norm": 6.28125, + "kl": 0.36056165099143983, + "learning_rate": 1.8646616541353384e-06, + "loss": 0.0356, + "num_tokens": 102482310.0, + "reward": -1.124573567509651, + "reward_std": 6.1969846248626705, + "rewards/get_chromagram_reward": 0.6254005432128906, + "rewards/get_chromagram_reward_std": 0.10148084163665771, + "rewards/get_intelligibility_reward": -3.9837652325630186, + "rewards/get_intelligibility_reward_std": 10.051096773147583, + "rewards/get_target_len_reward": -0.015355863701552152, + "rewards/get_target_len_reward_std": 0.04210511483252048, + "step": 8340 + }, + { + "advantages": 1.912315923391361e-08, + "advantages_std": 1.4889129996299744, + "clip_ratio": 0.0, + "completion_length": 89.9702392578125, + "epoch": 6.280451127819549, + "grad_norm": 5.96875, + "kl": 0.33595179915428164, + "learning_rate": 1.8609022556390979e-06, + "loss": 0.0335, + "num_tokens": 102797762.0, + "reward": -1.3522529363632203, + "reward_std": 6.929391670227051, + "rewards/get_chromagram_reward": 0.6337267577648162, + "rewards/get_chromagram_reward_std": 0.1047416977584362, + "rewards/get_intelligibility_reward": -4.6725863218307495, + "rewards/get_intelligibility_reward_std": 11.20484275817871, + "rewards/get_target_len_reward": -0.01789900762960315, + "rewards/get_target_len_reward_std": 0.044453246705234054, + "step": 8350 + }, + { + "advantages": -3.428508954073095e-07, + "advantages_std": 1.634053146839142, + "clip_ratio": 0.0, + "completion_length": 86.97202377319336, + "epoch": 6.28796992481203, + "grad_norm": 10.0, + "kl": 0.35779112577438354, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.0369, + "num_tokens": 103103782.0, + "reward": -1.4777903586626053, + "reward_std": 7.088888502120971, + "rewards/get_chromagram_reward": 0.6264364421367645, + "rewards/get_chromagram_reward_std": 0.11746482402086258, + "rewards/get_intelligibility_reward": -5.032951909303665, + "rewards/get_intelligibility_reward_std": 11.310615634918213, + "rewards/get_target_len_reward": -0.02685513999313116, + "rewards/get_target_len_reward_std": 0.07638096138834953, + "step": 8360 + }, + { + "advantages": 2.1234156264426928e-07, + "advantages_std": 1.5911062717437745, + "clip_ratio": 0.0, + "completion_length": 85.92916946411133, + "epoch": 6.295488721804511, + "grad_norm": 506.0, + "kl": 0.4133845239877701, + "learning_rate": 1.8533834586466167e-06, + "loss": 0.0495, + "num_tokens": 103407520.0, + "reward": -1.6198764503002168, + "reward_std": 6.933388805389404, + "rewards/get_chromagram_reward": 0.6128597974777221, + "rewards/get_chromagram_reward_std": 0.10890985131263733, + "rewards/get_intelligibility_reward": -5.447985672950745, + "rewards/get_intelligibility_reward_std": 10.8856369972229, + "rewards/get_target_len_reward": -0.02450302317738533, + "rewards/get_target_len_reward_std": 0.08764106258749962, + "step": 8370 + }, + { + "advantages": -4.8925475937267035e-08, + "advantages_std": 1.5571423411369323, + "clip_ratio": 0.0, + "completion_length": 84.05119171142579, + "epoch": 6.303007518796992, + "grad_norm": 6.15625, + "kl": 0.3722055435180664, + "learning_rate": 1.8496240601503762e-06, + "loss": 0.0402, + "num_tokens": 103706409.0, + "reward": -1.3056098520755768, + "reward_std": 6.024608469009399, + "rewards/get_chromagram_reward": 0.6267408490180969, + "rewards/get_chromagram_reward_std": 0.11162896826863289, + "rewards/get_intelligibility_reward": -4.523062682151794, + "rewards/get_intelligibility_reward_std": 9.563552713394165, + "rewards/get_target_len_reward": -0.020507614687085153, + "rewards/get_target_len_reward_std": 0.05589599050581455, + "step": 8380 + }, + { + "advantages": 1.0083120045578653e-07, + "advantages_std": 1.600996732711792, + "clip_ratio": 0.0, + "completion_length": 85.49881134033203, + "epoch": 6.310526315789474, + "grad_norm": 7.90625, + "kl": 0.3630474954843521, + "learning_rate": 1.8458646616541354e-06, + "loss": 0.0406, + "num_tokens": 104009587.0, + "reward": -1.3805613562464714, + "reward_std": 6.6248880386352536, + "rewards/get_chromagram_reward": 0.6300054669380188, + "rewards/get_chromagram_reward_std": 0.1087301142513752, + "rewards/get_intelligibility_reward": -4.750552833080292, + "rewards/get_intelligibility_reward_std": 10.588902616500855, + "rewards/get_target_len_reward": -0.021136431582272054, + "rewards/get_target_len_reward_std": 0.05443032290786505, + "step": 8390 + }, + { + "advantages": 1.1796752232839936e-07, + "advantages_std": 1.6699777483940124, + "clip_ratio": 0.0, + "completion_length": 86.6827407836914, + "epoch": 6.318045112781955, + "grad_norm": 5.46875, + "kl": 0.33182170391082766, + "learning_rate": 1.8421052631578948e-06, + "loss": 0.0341, + "num_tokens": 104316038.0, + "reward": -1.4212640821933746, + "reward_std": 6.47331919670105, + "rewards/get_chromagram_reward": 0.6123316049575805, + "rewards/get_chromagram_reward_std": 0.10759163647890091, + "rewards/get_intelligibility_reward": -4.858754765987396, + "rewards/get_intelligibility_reward_std": 10.282232522964478, + "rewards/get_target_len_reward": -0.017369027622044085, + "rewards/get_target_len_reward_std": 0.04712581820785999, + "step": 8400 + }, + { + "advantages": 2.6561320716211866e-07, + "advantages_std": 1.7279439330101014, + "clip_ratio": 0.0, + "completion_length": 87.80238342285156, + "epoch": 6.325563909774436, + "grad_norm": 32.75, + "kl": 0.44390062987804413, + "learning_rate": 1.8383458646616543e-06, + "loss": 0.0476, + "num_tokens": 104624926.0, + "reward": -1.646631732583046, + "reward_std": 7.138958835601807, + "rewards/get_chromagram_reward": 0.6219754099845887, + "rewards/get_chromagram_reward_std": 0.1156666859984398, + "rewards/get_intelligibility_reward": -5.541904759407044, + "rewards/get_intelligibility_reward_std": 11.233854007720947, + "rewards/get_target_len_reward": -0.019965619780123234, + "rewards/get_target_len_reward_std": 0.057146585918962954, + "step": 8410 + }, + { + "advantages": 2.248833581575127e-07, + "advantages_std": 1.6452932000160216, + "clip_ratio": 0.0, + "completion_length": 86.79107360839843, + "epoch": 6.333082706766917, + "grad_norm": 11.625, + "kl": 0.3099846750497818, + "learning_rate": 1.8345864661654137e-06, + "loss": 0.0353, + "num_tokens": 104930820.0, + "reward": -1.535498809814453, + "reward_std": 6.530526494979858, + "rewards/get_chromagram_reward": 0.6166078150272369, + "rewards/get_chromagram_reward_std": 0.11259147003293038, + "rewards/get_intelligibility_reward": -5.201871180534363, + "rewards/get_intelligibility_reward_std": 10.227967166900635, + "rewards/get_target_len_reward": -0.021232699742540717, + "rewards/get_target_len_reward_std": 0.0649916348978877, + "step": 8420 + }, + { + "advantages": -2.1134814573997573e-07, + "advantages_std": 1.6378458976745605, + "clip_ratio": 0.0, + "completion_length": 89.88333511352539, + "epoch": 6.340601503759398, + "grad_norm": 6.03125, + "kl": 0.35197239816188813, + "learning_rate": 1.8308270676691731e-06, + "loss": 0.0441, + "num_tokens": 105245119.0, + "reward": -1.078207679092884, + "reward_std": 6.285722970962524, + "rewards/get_chromagram_reward": 0.6210645318031311, + "rewards/get_chromagram_reward_std": 0.11385365948081017, + "rewards/get_intelligibility_reward": -3.830742156505585, + "rewards/get_intelligibility_reward_std": 10.259477138519287, + "rewards/get_target_len_reward": -0.024945309106260537, + "rewards/get_target_len_reward_std": 0.0796560823917389, + "step": 8430 + }, + { + "advantages": -1.862645206074376e-07, + "advantages_std": 1.616140902042389, + "clip_ratio": 0.0, + "completion_length": 84.11964340209961, + "epoch": 6.34812030075188, + "grad_norm": 10.9375, + "kl": 0.43686449378728864, + "learning_rate": 1.8270676691729326e-06, + "loss": 0.045, + "num_tokens": 105543834.0, + "reward": -1.6489113748073578, + "reward_std": 6.583069133758545, + "rewards/get_chromagram_reward": 0.6107912182807922, + "rewards/get_chromagram_reward_std": 0.10622861087322236, + "rewards/get_intelligibility_reward": -5.538575506210327, + "rewards/get_intelligibility_reward_std": 10.245015335083007, + "rewards/get_target_len_reward": -0.01894954005256295, + "rewards/get_target_len_reward_std": 0.0537716269493103, + "step": 8440 + }, + { + "advantages": -3.558894213995245e-07, + "advantages_std": 1.6489933967590331, + "clip_ratio": 0.0, + "completion_length": 86.92559585571288, + "epoch": 6.355639097744361, + "grad_norm": 6.03125, + "kl": 0.37896920144557955, + "learning_rate": 1.823308270676692e-06, + "loss": 0.0406, + "num_tokens": 105850112.0, + "reward": -1.5943864196538926, + "reward_std": 6.7091076374053955, + "rewards/get_chromagram_reward": 0.6330448150634765, + "rewards/get_chromagram_reward_std": 0.12480418682098389, + "rewards/get_intelligibility_reward": -5.390610149502754, + "rewards/get_intelligibility_reward_std": 10.372951984405518, + "rewards/get_target_len_reward": -0.0255936867557466, + "rewards/get_target_len_reward_std": 0.06997879669070244, + "step": 8450 + }, + { + "advantages": 3.0870238560964934e-07, + "advantages_std": 1.5005107045173645, + "clip_ratio": 0.0, + "completion_length": 89.99821548461914, + "epoch": 6.363157894736842, + "grad_norm": 9.875, + "kl": 0.328534708917141, + "learning_rate": 1.8195488721804514e-06, + "loss": 0.0422, + "num_tokens": 106165542.0, + "reward": -1.3361444085836411, + "reward_std": 6.730158090591431, + "rewards/get_chromagram_reward": 0.6162006616592407, + "rewards/get_chromagram_reward_std": 0.10386288464069367, + "rewards/get_intelligibility_reward": -4.604814183712006, + "rewards/get_intelligibility_reward_std": 10.82592601776123, + "rewards/get_target_len_reward": -0.019819434359669687, + "rewards/get_target_len_reward_std": 0.06969902403652668, + "step": 8460 + }, + { + "advantages": -1.9297004598684e-07, + "advantages_std": 1.5895915508270264, + "clip_ratio": 0.0, + "completion_length": 84.47619323730468, + "epoch": 6.370676691729323, + "grad_norm": 9.0, + "kl": 0.3012957707047462, + "learning_rate": 1.8157894736842109e-06, + "loss": 0.0336, + "num_tokens": 106465645.0, + "reward": -1.5179098486900329, + "reward_std": 6.907446670532226, + "rewards/get_chromagram_reward": 0.6151711463928222, + "rewards/get_chromagram_reward_std": 0.11792625412344933, + "rewards/get_intelligibility_reward": -5.149234783649445, + "rewards/get_intelligibility_reward_std": 11.002441883087158, + "rewards/get_target_len_reward": -0.019665668066591025, + "rewards/get_target_len_reward_std": 0.058271700888872145, + "step": 8470 + }, + { + "advantages": 8.170803766915924e-08, + "advantages_std": 1.5603940725326537, + "clip_ratio": 0.0, + "completion_length": 88.06666717529296, + "epoch": 6.378195488721804, + "grad_norm": 7.40625, + "kl": 0.33412752449512484, + "learning_rate": 1.8120300751879703e-06, + "loss": 0.0359, + "num_tokens": 106775755.0, + "reward": -1.5474644482135773, + "reward_std": 6.983898639678955, + "rewards/get_chromagram_reward": 0.6263640344142913, + "rewards/get_chromagram_reward_std": 0.11608935371041298, + "rewards/get_intelligibility_reward": -5.249857783317566, + "rewards/get_intelligibility_reward_std": 11.141074562072754, + "rewards/get_target_len_reward": -0.018899236246943472, + "rewards/get_target_len_reward_std": 0.04913288354873657, + "step": 8480 + }, + { + "advantages": -2.186745465593276e-07, + "advantages_std": 1.6869441747665406, + "clip_ratio": 0.0, + "completion_length": 84.07559661865234, + "epoch": 6.385714285714286, + "grad_norm": 5.09375, + "kl": 2.093244831264019, + "learning_rate": 1.8082706766917293e-06, + "loss": 0.2162, + "num_tokens": 107074777.0, + "reward": -1.7199529886245728, + "reward_std": 6.821811771392822, + "rewards/get_chromagram_reward": 0.6313170075416565, + "rewards/get_chromagram_reward_std": 0.11584596931934357, + "rewards/get_intelligibility_reward": -5.7700822114944454, + "rewards/get_intelligibility_reward_std": 10.664770889282227, + "rewards/get_target_len_reward": -0.0210935284383595, + "rewards/get_target_len_reward_std": 0.058279063180089, + "step": 8490 + }, + { + "advantages": 6.52546734158932e-07, + "advantages_std": 1.5661371231079102, + "clip_ratio": 0.0, + "completion_length": 90.01190567016602, + "epoch": 6.393233082706767, + "grad_norm": 11.0, + "kl": 1.2664696410298348, + "learning_rate": 1.8045112781954887e-06, + "loss": 0.1318, + "num_tokens": 107389476.0, + "reward": -1.5618727438151836, + "reward_std": 7.49455828666687, + "rewards/get_chromagram_reward": 0.623270982503891, + "rewards/get_chromagram_reward_std": 0.11199977099895478, + "rewards/get_intelligibility_reward": -5.290174907445907, + "rewards/get_intelligibility_reward_std": 11.975935935974121, + "rewards/get_target_len_reward": -0.018713922891765832, + "rewards/get_target_len_reward_std": 0.054541667178273204, + "step": 8500 + }, + { + "advantages": -1.0679164077487258e-07, + "advantages_std": 1.5995036005973815, + "clip_ratio": 0.0, + "completion_length": 82.4601203918457, + "epoch": 6.400751879699248, + "grad_norm": 9.25, + "kl": 0.4671302646398544, + "learning_rate": 1.8007518796992482e-06, + "loss": 0.0484, + "num_tokens": 107684417.0, + "reward": -1.7090104311704635, + "reward_std": 6.771821355819702, + "rewards/get_chromagram_reward": 0.6103951513767243, + "rewards/get_chromagram_reward_std": 0.1157499797642231, + "rewards/get_intelligibility_reward": -5.714370238780975, + "rewards/get_intelligibility_reward_std": 10.493238353729248, + "rewards/get_target_len_reward": -0.023055852763354777, + "rewards/get_target_len_reward_std": 0.05546447858214378, + "step": 8510 + }, + { + "advantages": -5.736946775414253e-08, + "advantages_std": 1.5776451468467712, + "clip_ratio": 0.0, + "completion_length": 86.18928680419921, + "epoch": 6.408270676691729, + "grad_norm": 6.46875, + "kl": 0.3133410021662712, + "learning_rate": 1.7969924812030076e-06, + "loss": 0.0351, + "num_tokens": 107989269.0, + "reward": -1.4148914575576783, + "reward_std": 6.5578773021698, + "rewards/get_chromagram_reward": 0.6112196803092956, + "rewards/get_chromagram_reward_std": 0.12375476881861687, + "rewards/get_intelligibility_reward": -4.83680567741394, + "rewards/get_intelligibility_reward_std": 10.457363033294678, + "rewards/get_target_len_reward": -0.019088097847998142, + "rewards/get_target_len_reward_std": 0.05398804843425751, + "step": 8520 + }, + { + "advantages": 1.7459195049696065e-07, + "advantages_std": 1.5523484230041504, + "clip_ratio": 0.0, + "completion_length": 88.58095321655273, + "epoch": 6.41578947368421, + "grad_norm": 9.4375, + "kl": 0.5049596816301346, + "learning_rate": 1.793233082706767e-06, + "loss": 0.0587, + "num_tokens": 108299956.0, + "reward": -1.4245594978332519, + "reward_std": 6.555758714675903, + "rewards/get_chromagram_reward": 0.6246987998485565, + "rewards/get_chromagram_reward_std": 0.11914677992463112, + "rewards/get_intelligibility_reward": -4.87405880689621, + "rewards/get_intelligibility_reward_std": 10.457990598678588, + "rewards/get_target_len_reward": -0.02431822130456567, + "rewards/get_target_len_reward_std": 0.06859862487763166, + "step": 8530 + }, + { + "advantages": -5.985301498867557e-08, + "advantages_std": 1.5063032269477845, + "clip_ratio": 0.0, + "completion_length": 88.59047775268554, + "epoch": 6.423308270676692, + "grad_norm": 7.5, + "kl": 4.2682260736823086, + "learning_rate": 1.7894736842105265e-06, + "loss": 0.4341, + "num_tokens": 108611511.0, + "reward": -1.5688146725296974, + "reward_std": 7.056678915023804, + "rewards/get_chromagram_reward": 0.6214147567749023, + "rewards/get_chromagram_reward_std": 0.11932239979505539, + "rewards/get_intelligibility_reward": -5.303305222094059, + "rewards/get_intelligibility_reward_std": 11.121390342712402, + "rewards/get_target_len_reward": -0.02455320842564106, + "rewards/get_target_len_reward_std": 0.06936857439577579, + "step": 8540 + }, + { + "advantages": -7.162491741041776e-07, + "advantages_std": 1.7330376267433167, + "clip_ratio": 0.0, + "completion_length": 84.84761962890624, + "epoch": 6.430827067669173, + "grad_norm": 7.03125, + "kl": 0.41429437398910524, + "learning_rate": 1.7857142857142859e-06, + "loss": 0.0445, + "num_tokens": 108911633.0, + "reward": -1.6869840025901794, + "reward_std": 6.495787239074707, + "rewards/get_chromagram_reward": 0.6123341917991638, + "rewards/get_chromagram_reward_std": 0.10488304197788238, + "rewards/get_intelligibility_reward": -5.649365377426148, + "rewards/get_intelligibility_reward_std": 10.102130126953124, + "rewards/get_target_len_reward": -0.02392053948715329, + "rewards/get_target_len_reward_std": 0.07797471843659878, + "step": 8550 + }, + { + "advantages": -5.784134136987973e-07, + "advantages_std": 1.498878812789917, + "clip_ratio": 0.0, + "completion_length": 89.68035888671875, + "epoch": 6.438345864661654, + "grad_norm": 6.8125, + "kl": 0.3487410917878151, + "learning_rate": 1.7819548872180453e-06, + "loss": 0.041, + "num_tokens": 109225082.0, + "reward": -1.1978444576263427, + "reward_std": 6.764661836624145, + "rewards/get_chromagram_reward": 0.6185143291950226, + "rewards/get_chromagram_reward_std": 0.11313116848468781, + "rewards/get_intelligibility_reward": -4.189542031288147, + "rewards/get_intelligibility_reward_std": 11.098699617385865, + "rewards/get_target_len_reward": -0.022505429945886134, + "rewards/get_target_len_reward_std": 0.0727207712829113, + "step": 8560 + }, + { + "advantages": 2.95912244041574e-07, + "advantages_std": 1.5081347227096558, + "clip_ratio": 0.0, + "completion_length": 88.14404907226563, + "epoch": 6.445864661654135, + "grad_norm": 68.5, + "kl": 0.43108378648757933, + "learning_rate": 1.7781954887218048e-06, + "loss": 0.0456, + "num_tokens": 109534683.0, + "reward": -1.529983852803707, + "reward_std": 6.960060405731201, + "rewards/get_chromagram_reward": 0.6225944101810456, + "rewards/get_chromagram_reward_std": 0.10861722603440285, + "rewards/get_intelligibility_reward": -5.19299818277359, + "rewards/get_intelligibility_reward_std": 11.049736833572387, + "rewards/get_target_len_reward": -0.019547457993030547, + "rewards/get_target_len_reward_std": 0.05952412653714419, + "step": 8570 + }, + { + "advantages": 5.5258470865737766e-08, + "advantages_std": 1.6067670226097106, + "clip_ratio": 0.0, + "completion_length": 86.61964340209961, + "epoch": 6.453383458646616, + "grad_norm": 17.875, + "kl": 0.3261327803134918, + "learning_rate": 1.7744360902255642e-06, + "loss": 0.0383, + "num_tokens": 109839985.0, + "reward": -1.5939397394657135, + "reward_std": 6.7601910591125485, + "rewards/get_chromagram_reward": 0.6212305128574371, + "rewards/get_chromagram_reward_std": 0.12474968880414963, + "rewards/get_intelligibility_reward": -5.381693542003632, + "rewards/get_intelligibility_reward_std": 10.635554122924805, + "rewards/get_target_len_reward": -0.02135584419593215, + "rewards/get_target_len_reward_std": 0.06412192359566689, + "step": 8580 + }, + { + "advantages": 1.6068421047066294e-07, + "advantages_std": 1.6900987029075623, + "clip_ratio": 0.0, + "completion_length": 87.32262115478515, + "epoch": 6.460902255639097, + "grad_norm": 6.0, + "kl": 0.32133436053991316, + "learning_rate": 1.7706766917293234e-06, + "loss": 0.0342, + "num_tokens": 110146731.0, + "reward": -1.518751847743988, + "reward_std": 6.533436012268067, + "rewards/get_chromagram_reward": 0.6186932504177094, + "rewards/get_chromagram_reward_std": 0.12964782863855362, + "rewards/get_intelligibility_reward": -5.159471201896667, + "rewards/get_intelligibility_reward_std": 10.372815799713134, + "rewards/get_target_len_reward": -0.015477333776652813, + "rewards/get_target_len_reward_std": 0.04433933421969414, + "step": 8590 + }, + { + "advantages": -3.4384431231160305e-07, + "advantages_std": 1.4812658846378326, + "clip_ratio": 0.0, + "completion_length": 84.97857284545898, + "epoch": 6.468421052631579, + "grad_norm": 6.03125, + "kl": 0.2949420794844627, + "learning_rate": 1.7669172932330828e-06, + "loss": 0.0378, + "num_tokens": 110448240.0, + "reward": -1.9043188631534576, + "reward_std": 6.780496072769165, + "rewards/get_chromagram_reward": 0.6142106592655182, + "rewards/get_chromagram_reward_std": 0.11835580542683602, + "rewards/get_intelligibility_reward": -6.305766224861145, + "rewards/get_intelligibility_reward_std": 10.31535725593567, + "rewards/get_target_len_reward": -0.021400523744523524, + "rewards/get_target_len_reward_std": 0.07350437175482512, + "step": 8600 + }, + { + "advantages": -5.513429854886454e-08, + "advantages_std": 1.6094445586204529, + "clip_ratio": 0.0, + "completion_length": 86.00714416503907, + "epoch": 6.47593984962406, + "grad_norm": 7.90625, + "kl": 2.071778839826584, + "learning_rate": 1.7631578947368423e-06, + "loss": 0.2087, + "num_tokens": 110752197.0, + "reward": -1.7652361631393432, + "reward_std": 6.954381370544434, + "rewards/get_chromagram_reward": 0.6145216584205627, + "rewards/get_chromagram_reward_std": 0.12558116614818574, + "rewards/get_intelligibility_reward": -5.887758874893189, + "rewards/get_intelligibility_reward_std": 10.753602027893066, + "rewards/get_target_len_reward": -0.022470803745090962, + "rewards/get_target_len_reward_std": 0.0651225570589304, + "step": 8610 + }, + { + "advantages": 1.80676577699046e-07, + "advantages_std": 1.4532333254814147, + "clip_ratio": 0.0, + "completion_length": 88.41131057739258, + "epoch": 6.483458646616541, + "grad_norm": 7.5625, + "kl": 0.33889811784029006, + "learning_rate": 1.7593984962406017e-06, + "loss": 0.0396, + "num_tokens": 111063022.0, + "reward": -1.2849171161651611, + "reward_std": 6.4561848640441895, + "rewards/get_chromagram_reward": 0.6181443452835083, + "rewards/get_chromagram_reward_std": 0.1125445008277893, + "rewards/get_intelligibility_reward": -4.449799716472626, + "rewards/get_intelligibility_reward_std": 10.360198545455933, + "rewards/get_target_len_reward": -0.023095874674618246, + "rewards/get_target_len_reward_std": 0.07099482864141464, + "step": 8620 + }, + { + "advantages": 6.780029053743419e-08, + "advantages_std": 1.6173210978507995, + "clip_ratio": 0.0, + "completion_length": 86.86428833007812, + "epoch": 6.490977443609022, + "grad_norm": 18.375, + "kl": 0.35529542416334153, + "learning_rate": 1.755639097744361e-06, + "loss": 0.035, + "num_tokens": 111369702.0, + "reward": -1.5035405695438384, + "reward_std": 6.32957501411438, + "rewards/get_chromagram_reward": 0.6382215857505799, + "rewards/get_chromagram_reward_std": 0.10603488236665726, + "rewards/get_intelligibility_reward": -5.13039231300354, + "rewards/get_intelligibility_reward_std": 9.97192997932434, + "rewards/get_target_len_reward": -0.018450586684048177, + "rewards/get_target_len_reward_std": 0.04384410493075848, + "step": 8630 + }, + { + "advantages": 7.533778996204887e-07, + "advantages_std": 1.6847024321556092, + "clip_ratio": 0.0, + "completion_length": 86.977978515625, + "epoch": 6.498496240601503, + "grad_norm": 5.71875, + "kl": 15.765263549983501, + "learning_rate": 1.7518796992481204e-06, + "loss": 1.5793, + "num_tokens": 111675934.0, + "reward": -1.5743951201438904, + "reward_std": 7.082512235641479, + "rewards/get_chromagram_reward": 0.6116576075553894, + "rewards/get_chromagram_reward_std": 0.1156858630478382, + "rewards/get_intelligibility_reward": -5.314873480796814, + "rewards/get_intelligibility_reward_std": 11.322156143188476, + "rewards/get_target_len_reward": -0.019969225488603116, + "rewards/get_target_len_reward_std": 0.05809299666434527, + "step": 8640 + }, + { + "advantages": -9.288390145911762e-08, + "advantages_std": 1.513525414466858, + "clip_ratio": 0.0, + "completion_length": 87.31547698974609, + "epoch": 6.5060150375939845, + "grad_norm": 14.625, + "kl": 0.31155717074871064, + "learning_rate": 1.7481203007518798e-06, + "loss": 0.0389, + "num_tokens": 111983532.0, + "reward": -1.3458161890506743, + "reward_std": 6.663216876983642, + "rewards/get_chromagram_reward": 0.6161249577999115, + "rewards/get_chromagram_reward_std": 0.11008406803011894, + "rewards/get_intelligibility_reward": -4.63141520023346, + "rewards/get_intelligibility_reward_std": 10.778537368774414, + "rewards/get_target_len_reward": -0.022158146370202303, + "rewards/get_target_len_reward_std": 0.06249944530427456, + "step": 8650 + }, + { + "advantages": 3.9959948026080385e-07, + "advantages_std": 1.6454981327056886, + "clip_ratio": 0.0, + "completion_length": 84.92202606201172, + "epoch": 6.513533834586466, + "grad_norm": 6.96875, + "kl": 0.4470529407262802, + "learning_rate": 1.7443609022556392e-06, + "loss": 0.0528, + "num_tokens": 112284573.0, + "reward": -1.5209301978349685, + "reward_std": 6.529443597793579, + "rewards/get_chromagram_reward": 0.5984456181526184, + "rewards/get_chromagram_reward_std": 0.12049673646688461, + "rewards/get_intelligibility_reward": -5.138624894618988, + "rewards/get_intelligibility_reward_std": 10.2837571144104, + "rewards/get_target_len_reward": -0.022611112985759973, + "rewards/get_target_len_reward_std": 0.07267039511352777, + "step": 8660 + }, + { + "advantages": -2.8436387466967972e-08, + "advantages_std": 1.6120068430900574, + "clip_ratio": 0.0, + "completion_length": 87.93393173217774, + "epoch": 6.521052631578947, + "grad_norm": 9.1875, + "kl": 0.39266557842493055, + "learning_rate": 1.7406015037593987e-06, + "loss": 0.0412, + "num_tokens": 112594646.0, + "reward": -1.500117802619934, + "reward_std": 6.989867258071899, + "rewards/get_chromagram_reward": 0.6125692486763, + "rewards/get_chromagram_reward_std": 0.10713726431131362, + "rewards/get_intelligibility_reward": -5.09507395029068, + "rewards/get_intelligibility_reward_std": 11.139153861999512, + "rewards/get_target_len_reward": -0.01784856105223298, + "rewards/get_target_len_reward_std": 0.05503632873296738, + "step": 8670 + }, + { + "advantages": 2.2525589997712815e-07, + "advantages_std": 1.5830511093139648, + "clip_ratio": 0.0, + "completion_length": 91.49642944335938, + "epoch": 6.5285714285714285, + "grad_norm": 5.0625, + "kl": 0.31605358272790907, + "learning_rate": 1.736842105263158e-06, + "loss": 0.0359, + "num_tokens": 112913111.0, + "reward": -1.2155792593955994, + "reward_std": 6.368030166625976, + "rewards/get_chromagram_reward": 0.6121292293071747, + "rewards/get_chromagram_reward_std": 0.10758508741855621, + "rewards/get_intelligibility_reward": -4.240009331703186, + "rewards/get_intelligibility_reward_std": 10.14575023651123, + "rewards/get_target_len_reward": -0.018857531901448964, + "rewards/get_target_len_reward_std": 0.06237869169563055, + "step": 8680 + }, + { + "advantages": -3.6458176566611655e-07, + "advantages_std": 1.5414014101028441, + "clip_ratio": 0.0, + "completion_length": 87.77916870117187, + "epoch": 6.5360902255639095, + "grad_norm": 6.53125, + "kl": 0.3152762994170189, + "learning_rate": 1.7330827067669173e-06, + "loss": 0.0337, + "num_tokens": 113223411.0, + "reward": -1.2201290145516395, + "reward_std": 6.668260669708252, + "rewards/get_chromagram_reward": 0.6242131114006042, + "rewards/get_chromagram_reward_std": 0.10980811715126038, + "rewards/get_intelligibility_reward": -4.262039077281952, + "rewards/get_intelligibility_reward_std": 10.77245044708252, + "rewards/get_target_len_reward": -0.022560841497033836, + "rewards/get_target_len_reward_std": 0.054543149471282956, + "step": 8690 + }, + { + "advantages": -1.8874803231483383e-08, + "advantages_std": 1.5456940293312074, + "clip_ratio": 0.0, + "completion_length": 91.7428596496582, + "epoch": 6.5436090225563905, + "grad_norm": 6.5625, + "kl": 0.29459398835897443, + "learning_rate": 1.7293233082706767e-06, + "loss": 0.0345, + "num_tokens": 113543909.0, + "reward": -1.447171300649643, + "reward_std": 6.840606307983398, + "rewards/get_chromagram_reward": 0.6198675811290741, + "rewards/get_chromagram_reward_std": 0.1132724367082119, + "rewards/get_intelligibility_reward": -4.9377094268798825, + "rewards/get_intelligibility_reward_std": 10.912199115753173, + "rewards/get_target_len_reward": -0.023671871796250344, + "rewards/get_target_len_reward_std": 0.07286440655589103, + "step": 8700 + }, + { + "advantages": -1.889963925805205e-07, + "advantages_std": 1.6550687193870544, + "clip_ratio": 0.0, + "completion_length": 86.5553581237793, + "epoch": 6.5511278195488725, + "grad_norm": 12.125, + "kl": 0.3258012026548386, + "learning_rate": 1.7255639097744362e-06, + "loss": 0.0406, + "num_tokens": 113849326.0, + "reward": -1.1805256187915802, + "reward_std": 6.319496059417725, + "rewards/get_chromagram_reward": 0.6109031736850739, + "rewards/get_chromagram_reward_std": 0.11574857532978058, + "rewards/get_intelligibility_reward": -4.132873296737671, + "rewards/get_intelligibility_reward_std": 10.294385719299317, + "rewards/get_target_len_reward": -0.019606582634150983, + "rewards/get_target_len_reward_std": 0.05889410562813282, + "step": 8710 + }, + { + "advantages": 2.54809867072936e-07, + "advantages_std": 1.6077858805656433, + "clip_ratio": 0.0, + "completion_length": 86.66369018554687, + "epoch": 6.5586466165413535, + "grad_norm": 13.0625, + "kl": 0.3335361868143082, + "learning_rate": 1.7218045112781956e-06, + "loss": 0.0357, + "num_tokens": 114155062.0, + "reward": -1.6903316497802734, + "reward_std": 6.802506971359253, + "rewards/get_chromagram_reward": 0.6251188158988953, + "rewards/get_chromagram_reward_std": 0.1196857139468193, + "rewards/get_intelligibility_reward": -5.675504660606384, + "rewards/get_intelligibility_reward_std": 10.526002311706543, + "rewards/get_target_len_reward": -0.020608733221888543, + "rewards/get_target_len_reward_std": 0.05494309738278389, + "step": 8720 + }, + { + "advantages": -8.344649558011951e-08, + "advantages_std": 1.6645886659622193, + "clip_ratio": 0.0, + "completion_length": 85.4029769897461, + "epoch": 6.5661654135338345, + "grad_norm": 9.4375, + "kl": 0.36896526366472243, + "learning_rate": 1.718045112781955e-06, + "loss": 0.0415, + "num_tokens": 114457071.0, + "reward": -1.6527364611625672, + "reward_std": 6.956710481643677, + "rewards/get_chromagram_reward": 0.6127317428588868, + "rewards/get_chromagram_reward_std": 0.12270680665969849, + "rewards/get_intelligibility_reward": -5.546628451347351, + "rewards/get_intelligibility_reward_std": 11.009428977966309, + "rewards/get_target_len_reward": -0.02431240752339363, + "rewards/get_target_len_reward_std": 0.07028789706528187, + "step": 8730 + }, + { + "advantages": -2.0960967361816074e-07, + "advantages_std": 1.543196427822113, + "clip_ratio": 0.0, + "completion_length": 87.92678604125976, + "epoch": 6.573684210526316, + "grad_norm": 6.34375, + "kl": 0.34959004521369935, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.0343, + "num_tokens": 114766945.0, + "reward": -1.538115844130516, + "reward_std": 7.157424592971802, + "rewards/get_chromagram_reward": 0.6251393258571625, + "rewards/get_chromagram_reward_std": 0.11151338070631027, + "rewards/get_intelligibility_reward": -5.224161815643311, + "rewards/get_intelligibility_reward_std": 11.43363332748413, + "rewards/get_target_len_reward": -0.015324807818979025, + "rewards/get_target_len_reward_std": 0.04183251298964023, + "step": 8740 + }, + { + "advantages": 2.884616570497656e-07, + "advantages_std": 1.5197386741638184, + "clip_ratio": 0.0, + "completion_length": 87.06071548461914, + "epoch": 6.581203007518797, + "grad_norm": 7.625, + "kl": 0.3010944381356239, + "learning_rate": 1.710526315789474e-06, + "loss": 0.0353, + "num_tokens": 115074311.0, + "reward": -1.662864577770233, + "reward_std": 7.032498836517334, + "rewards/get_chromagram_reward": 0.623716801404953, + "rewards/get_chromagram_reward_std": 0.11941528245806694, + "rewards/get_intelligibility_reward": -5.592788290977478, + "rewards/get_intelligibility_reward_std": 11.116605138778686, + "rewards/get_target_len_reward": -0.019521817099303007, + "rewards/get_target_len_reward_std": 0.06179252229630947, + "step": 8750 + }, + { + "advantages": -3.750125898704937e-08, + "advantages_std": 1.5628470301628112, + "clip_ratio": 0.0, + "completion_length": 86.24881057739258, + "epoch": 6.5887218045112785, + "grad_norm": 5.625, + "kl": 0.520573103427887, + "learning_rate": 1.7067669172932333e-06, + "loss": 0.0546, + "num_tokens": 115379268.0, + "reward": -1.5334375344216824, + "reward_std": 6.752659320831299, + "rewards/get_chromagram_reward": 0.6204401135444642, + "rewards/get_chromagram_reward_std": 0.1138177677989006, + "rewards/get_intelligibility_reward": -5.2039868295192715, + "rewards/get_intelligibility_reward_std": 10.579442930221557, + "rewards/get_target_len_reward": -0.01676576565951109, + "rewards/get_target_len_reward_std": 0.045275353640317914, + "step": 8760 + }, + { + "advantages": -3.0001005200119837e-07, + "advantages_std": 1.6244164824485778, + "clip_ratio": 0.0, + "completion_length": 85.89107131958008, + "epoch": 6.59624060150376, + "grad_norm": 5.28125, + "kl": 0.2966675475239754, + "learning_rate": 1.7030075187969928e-06, + "loss": 0.0297, + "num_tokens": 115682975.0, + "reward": -1.2809839069843292, + "reward_std": 6.5348756313323975, + "rewards/get_chromagram_reward": 0.6395108997821808, + "rewards/get_chromagram_reward_std": 0.12465112805366516, + "rewards/get_intelligibility_reward": -4.461061334609985, + "rewards/get_intelligibility_reward_std": 10.533429145812988, + "rewards/get_target_len_reward": -0.021401012036949395, + "rewards/get_target_len_reward_std": 0.05138088017702103, + "step": 8770 + }, + { + "advantages": -2.5431316998947293e-07, + "advantages_std": 1.5624045133590698, + "clip_ratio": 0.0, + "completion_length": 87.46309661865234, + "epoch": 6.603759398496241, + "grad_norm": 7.96875, + "kl": 0.3417803421616554, + "learning_rate": 1.6992481203007522e-06, + "loss": 0.0377, + "num_tokens": 115990689.0, + "reward": -1.4857405304908753, + "reward_std": 6.962762022018433, + "rewards/get_chromagram_reward": 0.6201785683631897, + "rewards/get_chromagram_reward_std": 0.12068104594945908, + "rewards/get_intelligibility_reward": -5.0505283117294315, + "rewards/get_intelligibility_reward_std": 11.170841598510743, + "rewards/get_target_len_reward": -0.026871402747929096, + "rewards/get_target_len_reward_std": 0.07678173333406449, + "step": 8780 + }, + { + "advantages": 2.59031862270831e-07, + "advantages_std": 1.4772692441940307, + "clip_ratio": 0.0, + "completion_length": 88.14166870117188, + "epoch": 6.611278195488722, + "grad_norm": 4.9375, + "kl": 0.2925388216972351, + "learning_rate": 1.6954887218045112e-06, + "loss": 0.0312, + "num_tokens": 116302271.0, + "reward": -1.315621554851532, + "reward_std": 7.228752565383911, + "rewards/get_chromagram_reward": 0.6382994651794434, + "rewards/get_chromagram_reward_std": 0.1257988005876541, + "rewards/get_intelligibility_reward": -4.557725477218628, + "rewards/get_intelligibility_reward_std": 11.79575719833374, + "rewards/get_target_len_reward": -0.027438334189355373, + "rewards/get_target_len_reward_std": 0.06499854773283005, + "step": 8790 + }, + { + "advantages": -1.0716418898937264e-07, + "advantages_std": 1.5314384937286376, + "clip_ratio": 0.0, + "completion_length": 87.74285888671875, + "epoch": 6.618796992481203, + "grad_norm": 7.6875, + "kl": 0.3825727790594101, + "learning_rate": 1.6917293233082707e-06, + "loss": 0.0434, + "num_tokens": 116611413.0, + "reward": -1.272094513475895, + "reward_std": 6.248283386230469, + "rewards/get_chromagram_reward": 0.6156734883785248, + "rewards/get_chromagram_reward_std": 0.11137211546301842, + "rewards/get_intelligibility_reward": -4.4124711662530895, + "rewards/get_intelligibility_reward_std": 9.994548034667968, + "rewards/get_target_len_reward": -0.0194855909794569, + "rewards/get_target_len_reward_std": 0.05853393040597439, + "step": 8800 + }, + { + "advantages": 3.141661484740155e-07, + "advantages_std": 1.6251811265945435, + "clip_ratio": 0.0, + "completion_length": 87.82083435058594, + "epoch": 6.626315789473685, + "grad_norm": 114.5, + "kl": 0.32971449494361876, + "learning_rate": 1.68796992481203e-06, + "loss": 0.0385, + "num_tokens": 116920595.0, + "reward": -1.8200352877378463, + "reward_std": 7.04195556640625, + "rewards/get_chromagram_reward": 0.6245281338691712, + "rewards/get_chromagram_reward_std": 0.11340516358613968, + "rewards/get_intelligibility_reward": -6.059786748886109, + "rewards/get_intelligibility_reward_std": 10.940639019012451, + "rewards/get_target_len_reward": -0.02484700605273247, + "rewards/get_target_len_reward_std": 0.07456877678632737, + "step": 8810 + }, + { + "advantages": 9.561578622196975e-08, + "advantages_std": 1.5531502604484557, + "clip_ratio": 0.0, + "completion_length": 85.92440795898438, + "epoch": 6.633834586466166, + "grad_norm": 6.21875, + "kl": 0.30954319685697557, + "learning_rate": 1.6842105263157895e-06, + "loss": 0.0382, + "num_tokens": 117225206.0, + "reward": -1.457185184955597, + "reward_std": 6.83298134803772, + "rewards/get_chromagram_reward": 0.622676020860672, + "rewards/get_chromagram_reward_std": 0.11204622760415077, + "rewards/get_intelligibility_reward": -4.975749778747558, + "rewards/get_intelligibility_reward_std": 10.919587230682373, + "rewards/get_target_len_reward": -0.01848159311339259, + "rewards/get_target_len_reward_std": 0.05746168848127127, + "step": 8820 + }, + { + "advantages": -1.3845661754885442e-07, + "advantages_std": 1.5635218024253845, + "clip_ratio": 0.0, + "completion_length": 83.90654907226562, + "epoch": 6.641353383458647, + "grad_norm": 9.125, + "kl": 0.30399465262889863, + "learning_rate": 1.680451127819549e-06, + "loss": 0.0355, + "num_tokens": 117523272.0, + "reward": -1.6465578913688659, + "reward_std": 6.5834332466125485, + "rewards/get_chromagram_reward": 0.6250576794147491, + "rewards/get_chromagram_reward_std": 0.1151248849928379, + "rewards/get_intelligibility_reward": -5.5405457496643065, + "rewards/get_intelligibility_reward_std": 10.319419717788696, + "rewards/get_target_len_reward": -0.024185398779809474, + "rewards/get_target_len_reward_std": 0.08074920866638421, + "step": 8830 + }, + { + "advantages": -1.8986564214173996e-07, + "advantages_std": 1.5144237875938416, + "clip_ratio": 0.0, + "completion_length": 83.20476379394532, + "epoch": 6.648872180451128, + "grad_norm": 164.0, + "kl": 0.4179477095603943, + "learning_rate": 1.6766917293233084e-06, + "loss": 0.0442, + "num_tokens": 117819871.0, + "reward": -1.4703819632530213, + "reward_std": 6.541961574554444, + "rewards/get_chromagram_reward": 0.6243762791156768, + "rewards/get_chromagram_reward_std": 0.11654146909713745, + "rewards/get_intelligibility_reward": -5.014195156097412, + "rewards/get_intelligibility_reward_std": 10.413293981552124, + "rewards/get_target_len_reward": -0.02132681766524911, + "rewards/get_target_len_reward_std": 0.0534250408411026, + "step": 8840 + }, + { + "advantages": -1.1647741686715562e-07, + "advantages_std": 1.5453578233718872, + "clip_ratio": 0.0, + "completion_length": 86.41369247436523, + "epoch": 6.656390977443609, + "grad_norm": 32.75, + "kl": 0.3617649167776108, + "learning_rate": 1.6729323308270678e-06, + "loss": 0.0439, + "num_tokens": 118125840.0, + "reward": -1.111614690721035, + "reward_std": 6.658377170562744, + "rewards/get_chromagram_reward": 0.6361361742019653, + "rewards/get_chromagram_reward_std": 0.11931278705596923, + "rewards/get_intelligibility_reward": -3.9476100608706473, + "rewards/get_intelligibility_reward_std": 10.896197414398193, + "rewards/get_target_len_reward": -0.02336995638906956, + "rewards/get_target_len_reward_std": 0.06786302607506514, + "step": 8850 + }, + { + "advantages": 3.3862889949887174e-07, + "advantages_std": 1.54275141954422, + "clip_ratio": 0.0, + "completion_length": 86.06428756713868, + "epoch": 6.663909774436091, + "grad_norm": 9.3125, + "kl": 0.3179106771945953, + "learning_rate": 1.6691729323308273e-06, + "loss": 0.0318, + "num_tokens": 118430577.0, + "reward": -1.6130412936210632, + "reward_std": 7.0431239128112795, + "rewards/get_chromagram_reward": 0.6181690275669098, + "rewards/get_chromagram_reward_std": 0.11314555704593658, + "rewards/get_intelligibility_reward": -5.437689471244812, + "rewards/get_intelligibility_reward_std": 11.172392845153809, + "rewards/get_target_len_reward": -0.01960300365462899, + "rewards/get_target_len_reward_std": 0.04749173801392317, + "step": 8860 + }, + { + "advantages": 5.066394344055425e-08, + "advantages_std": 1.652000641822815, + "clip_ratio": 0.0, + "completion_length": 87.61964340209961, + "epoch": 6.671428571428572, + "grad_norm": 5.1875, + "kl": 0.2891685277223587, + "learning_rate": 1.6654135338345867e-06, + "loss": 0.0312, + "num_tokens": 118738849.0, + "reward": -1.160328009724617, + "reward_std": 6.376783657073974, + "rewards/get_chromagram_reward": 0.6396682798862457, + "rewards/get_chromagram_reward_std": 0.11093177422881126, + "rewards/get_intelligibility_reward": -4.09717288017273, + "rewards/get_intelligibility_reward_std": 10.407121753692627, + "rewards/get_target_len_reward": -0.023479225765913724, + "rewards/get_target_len_reward_std": 0.07763024345040322, + "step": 8870 + }, + { + "advantages": 2.1879872349472862e-07, + "advantages_std": 1.5195560693740844, + "clip_ratio": 0.0, + "completion_length": 87.31547775268555, + "epoch": 6.678947368421053, + "grad_norm": 7.65625, + "kl": 0.3144701659679413, + "learning_rate": 1.6616541353383461e-06, + "loss": 0.0347, + "num_tokens": 119046694.0, + "reward": -1.609968501329422, + "reward_std": 6.308842182159424, + "rewards/get_chromagram_reward": 0.6176996469497681, + "rewards/get_chromagram_reward_std": 0.1137751154601574, + "rewards/get_intelligibility_reward": -5.427005124092102, + "rewards/get_intelligibility_reward_std": 9.80909767150879, + "rewards/get_target_len_reward": -0.020599970314651727, + "rewards/get_target_len_reward_std": 0.055216855742037295, + "step": 8880 + }, + { + "advantages": 5.116065864285701e-08, + "advantages_std": 1.5125298976898194, + "clip_ratio": 0.0, + "completion_length": 86.89643020629883, + "epoch": 6.686466165413534, + "grad_norm": 5.9375, + "kl": 0.31706361621618273, + "learning_rate": 1.6578947368421053e-06, + "loss": 0.038, + "num_tokens": 119353470.0, + "reward": -1.6176787674427033, + "reward_std": 6.833755254745483, + "rewards/get_chromagram_reward": 0.6201191484928131, + "rewards/get_chromagram_reward_std": 0.11991398110985756, + "rewards/get_intelligibility_reward": -5.450251662731171, + "rewards/get_intelligibility_reward_std": 10.686319255828858, + "rewards/get_target_len_reward": -0.02290344387292862, + "rewards/get_target_len_reward_std": 0.06531772967427969, + "step": 8890 + }, + { + "advantages": -5.1657369048996316e-08, + "advantages_std": 1.5619229674339294, + "clip_ratio": 0.0, + "completion_length": 90.20178833007813, + "epoch": 6.693984962406015, + "grad_norm": 7.03125, + "kl": 0.33849948048591616, + "learning_rate": 1.6541353383458648e-06, + "loss": 0.0357, + "num_tokens": 119669186.0, + "reward": -1.3696957796812057, + "reward_std": 6.409091806411743, + "rewards/get_chromagram_reward": 0.6205049753189087, + "rewards/get_chromagram_reward_std": 0.11114726960659027, + "rewards/get_intelligibility_reward": -4.704723370075226, + "rewards/get_intelligibility_reward_std": 10.246364498138428, + "rewards/get_target_len_reward": -0.0248687282204628, + "rewards/get_target_len_reward_std": 0.07564185373485088, + "step": 8900 + }, + { + "advantages": 4.21206185308165e-07, + "advantages_std": 1.5767785668373109, + "clip_ratio": 0.0, + "completion_length": 87.05000152587891, + "epoch": 6.701503759398497, + "grad_norm": 5.78125, + "kl": 1.301441177725792, + "learning_rate": 1.6503759398496242e-06, + "loss": 0.1376, + "num_tokens": 119975757.0, + "reward": -1.6824484169483185, + "reward_std": 6.476547956466675, + "rewards/get_chromagram_reward": 0.6236434578895569, + "rewards/get_chromagram_reward_std": 0.12051276862621307, + "rewards/get_intelligibility_reward": -5.6518255233764645, + "rewards/get_intelligibility_reward_std": 9.937983512878418, + "rewards/get_target_len_reward": -0.01916287373751402, + "rewards/get_target_len_reward_std": 0.05058997441083193, + "step": 8910 + }, + { + "advantages": -2.3593503462038258e-08, + "advantages_std": 1.636739730834961, + "clip_ratio": 0.0, + "completion_length": 83.20416793823242, + "epoch": 6.709022556390978, + "grad_norm": 6.71875, + "kl": 0.34641715288162234, + "learning_rate": 1.6466165413533836e-06, + "loss": 0.0355, + "num_tokens": 120272973.0, + "reward": -1.6513566374778748, + "reward_std": 7.347185850143433, + "rewards/get_chromagram_reward": 0.6174445629119873, + "rewards/get_chromagram_reward_std": 0.11873769238591195, + "rewards/get_intelligibility_reward": -5.549295377731323, + "rewards/get_intelligibility_reward_std": 11.645557308197022, + "rewards/get_target_len_reward": -0.022218797355890274, + "rewards/get_target_len_reward_std": 0.06003955211490393, + "step": 8920 + }, + { + "advantages": 7.276734050876144e-08, + "advantages_std": 1.5402096390724183, + "clip_ratio": 0.0, + "completion_length": 88.8607177734375, + "epoch": 6.716541353383459, + "grad_norm": 7.40625, + "kl": 0.32854454070329664, + "learning_rate": 1.642857142857143e-06, + "loss": 0.0394, + "num_tokens": 120585221.0, + "reward": -1.6409627513494343, + "reward_std": 7.005185222625732, + "rewards/get_chromagram_reward": 0.6250455319881439, + "rewards/get_chromagram_reward_std": 0.12024707272648812, + "rewards/get_intelligibility_reward": -5.524389547109604, + "rewards/get_intelligibility_reward_std": 11.067870235443115, + "rewards/get_target_len_reward": -0.023543883627280594, + "rewards/get_target_len_reward_std": 0.05823461338877678, + "step": 8930 + }, + { + "advantages": 6.144245844552642e-07, + "advantages_std": 1.5682287096977234, + "clip_ratio": 0.0, + "completion_length": 86.10595397949218, + "epoch": 6.72406015037594, + "grad_norm": 10.5, + "kl": 0.3400934889912605, + "learning_rate": 1.6390977443609025e-06, + "loss": 0.038, + "num_tokens": 120889444.0, + "reward": -1.4442845374345779, + "reward_std": 6.767345428466797, + "rewards/get_chromagram_reward": 0.6401819050312042, + "rewards/get_chromagram_reward_std": 0.12307184860110283, + "rewards/get_intelligibility_reward": -4.949033164978028, + "rewards/get_intelligibility_reward_std": 10.734640312194824, + "rewards/get_target_len_reward": -0.02400211989879608, + "rewards/get_target_len_reward_std": 0.06089174598455429, + "step": 8940 + }, + { + "advantages": -4.656613000975085e-07, + "advantages_std": 1.56749826669693, + "clip_ratio": 0.0, + "completion_length": 85.18333435058594, + "epoch": 6.731578947368421, + "grad_norm": 7.65625, + "kl": 0.39585251808166505, + "learning_rate": 1.635338345864662e-06, + "loss": 0.0463, + "num_tokens": 121191124.0, + "reward": -1.516963106393814, + "reward_std": 6.727704811096191, + "rewards/get_chromagram_reward": 0.6204110085964203, + "rewards/get_chromagram_reward_std": 0.1156246043741703, + "rewards/get_intelligibility_reward": -5.150197815895081, + "rewards/get_intelligibility_reward_std": 10.64694356918335, + "rewards/get_target_len_reward": -0.021102245897054672, + "rewards/get_target_len_reward_std": 0.05872043874114752, + "step": 8950 + }, + { + "advantages": -2.510845703795894e-07, + "advantages_std": 1.6418975114822387, + "clip_ratio": 0.0, + "completion_length": 82.9857162475586, + "epoch": 6.739097744360902, + "grad_norm": 6.75, + "kl": 0.44420134127140043, + "learning_rate": 1.6315789473684212e-06, + "loss": 0.0469, + "num_tokens": 121486792.0, + "reward": -1.8509411275386811, + "reward_std": 7.135700464248657, + "rewards/get_chromagram_reward": 0.6175800204277039, + "rewards/get_chromagram_reward_std": 0.10379650369286537, + "rewards/get_intelligibility_reward": -6.14611246585846, + "rewards/get_intelligibility_reward_std": 11.105937671661376, + "rewards/get_target_len_reward": -0.024290661606937648, + "rewards/get_target_len_reward_std": 0.0705668555572629, + "step": 8960 + }, + { + "advantages": 9.785095400616228e-08, + "advantages_std": 1.5706232070922852, + "clip_ratio": 0.0, + "completion_length": 88.34821548461915, + "epoch": 6.746616541353384, + "grad_norm": 5.84375, + "kl": 9.645041857659816, + "learning_rate": 1.6278195488721806e-06, + "loss": 0.9646, + "num_tokens": 121798026.0, + "reward": -1.499186635017395, + "reward_std": 6.527169799804687, + "rewards/get_chromagram_reward": 0.6356152951717376, + "rewards/get_chromagram_reward_std": 0.114602829515934, + "rewards/get_intelligibility_reward": -5.1114842891693115, + "rewards/get_intelligibility_reward_std": 10.352026510238648, + "rewards/get_target_len_reward": -0.021690726187080143, + "rewards/get_target_len_reward_std": 0.05747928954660893, + "step": 8970 + }, + { + "advantages": -8.46137631071997e-07, + "advantages_std": 1.6054083704948425, + "clip_ratio": 0.0, + "completion_length": 85.80952606201171, + "epoch": 6.754135338345865, + "grad_norm": 7.5625, + "kl": 0.3294598564505577, + "learning_rate": 1.62406015037594e-06, + "loss": 0.0353, + "num_tokens": 122101766.0, + "reward": -1.4127657055854796, + "reward_std": 6.778144502639771, + "rewards/get_chromagram_reward": 0.6286609172821045, + "rewards/get_chromagram_reward_std": 0.1091697208583355, + "rewards/get_intelligibility_reward": -4.847491002082824, + "rewards/get_intelligibility_reward_std": 10.899819278717041, + "rewards/get_target_len_reward": -0.01946706180460751, + "rewards/get_target_len_reward_std": 0.05438558142632246, + "step": 8980 + }, + { + "advantages": 3.8991357875772795e-08, + "advantages_std": 1.491723620891571, + "clip_ratio": 0.0, + "completion_length": 84.7428581237793, + "epoch": 6.761654135338346, + "grad_norm": 8.875, + "kl": 0.34233406037092207, + "learning_rate": 1.6203007518796992e-06, + "loss": 0.0357, + "num_tokens": 122402651.0, + "reward": -1.2671085568144917, + "reward_std": 6.347675132751465, + "rewards/get_chromagram_reward": 0.6184946298599243, + "rewards/get_chromagram_reward_std": 0.11106136739253998, + "rewards/get_intelligibility_reward": -4.399339348077774, + "rewards/get_intelligibility_reward_std": 10.213108444213868, + "rewards/get_target_len_reward": -0.020480694342404605, + "rewards/get_target_len_reward_std": 0.06267086192965507, + "step": 8990 + }, + { + "advantages": -6.9538813818326165e-09, + "advantages_std": 1.7154770612716674, + "clip_ratio": 0.0, + "completion_length": 87.94761962890625, + "epoch": 6.769172932330827, + "grad_norm": 11.5, + "kl": 0.3620362639427185, + "learning_rate": 1.6165413533834587e-06, + "loss": 0.0425, + "num_tokens": 122711859.0, + "reward": -1.6059120416641235, + "reward_std": 7.1333118915557865, + "rewards/get_chromagram_reward": 0.6188837230205536, + "rewards/get_chromagram_reward_std": 0.11426898092031479, + "rewards/get_intelligibility_reward": -5.411304783821106, + "rewards/get_intelligibility_reward_std": 11.316510009765626, + "rewards/get_target_len_reward": -0.025314744096249342, + "rewards/get_target_len_reward_std": 0.07575746681541204, + "step": 9000 + } + ], + "logging_steps": 10, + "max_steps": 13300, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}