diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15903307888040713, + "eval_steps": 0, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "combined_loss": 0.7037124633789062, + "completion_length": 425.0, + "epoch": 0.0003180661577608143, + "grad_norm": 2.1160361766815186, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.7037, + "num_samples": 1.0, + "reward": 3.90625, + "reward_std": 1.062600016593933, + "rewards/gpt4o_holistic_reward": 3.90625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3457083702087402, + "speech_entropy": 2.5810890197753906, + "speech_kl": 0.0, + "step": 1, + "text_entropy": 0.44255462288856506, + "text_kl": 0.0, + "total_entropy": 1.9978519678115845 + }, + { + "combined_loss": 0.7883188724517822, + "completion_length": 347.125, + "epoch": 0.0006361323155216285, + "grad_norm": 2.1822354793548584, + "kl": 0.0, + "learning_rate": 2.3137821315975918e-07, + "loss": 0.7883, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 0.0, + "sft_loss": 2.6277294158935547, + "speech_entropy": 2.6779050827026367, + "speech_kl": 0.0, + "step": 2, + "text_entropy": 0.528403639793396, + "text_kl": 0.0, + "total_entropy": 2.251002311706543 + }, + { + "combined_loss": 0.7728084921836853, + "completion_length": 490.375, + "epoch": 0.0009541984732824427, + "grad_norm": 2.1721348762512207, + "kl": 0.0, + "learning_rate": 3.6672579134208467e-07, + "loss": 0.7728, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 1.3848260641098022, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": -1.862645149230957e-09, + "sft_loss": 2.576028347015381, + "speech_entropy": 2.6699180603027344, + "speech_kl": 0.0, + "step": 3, + "text_entropy": 0.675686240196228, + "text_kl": 0.0, + "total_entropy": 2.2666218280792236 + }, + { + "combined_loss": 0.7510870695114136, + "completion_length": 396.3125, + "epoch": 0.001272264631043257, + "grad_norm": 3.0144259929656982, + "kl": 0.0, + "learning_rate": 4.6275642631951835e-07, + "loss": 0.7511, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.5036234855651855, + "speech_entropy": 2.7026796340942383, + "speech_kl": 0.0, + "step": 4, + "text_entropy": 0.6944292783737183, + "text_kl": 0.0, + "total_entropy": 2.308650493621826 + }, + { + "combined_loss": 0.7991127967834473, + "completion_length": 467.625, + "epoch": 0.0015903307888040711, + "grad_norm": 5.25661039352417, + "kl": 0.0, + "learning_rate": 5.372435736804816e-07, + "loss": 0.7991, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.329224169254303, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.6637091636657715, + "speech_entropy": 2.6493892669677734, + "speech_kl": 0.0, + "step": 5, + "text_entropy": 0.8403033018112183, + "text_kl": 0.0, + "total_entropy": 2.2811856269836426 + }, + { + "combined_loss": 0.7556890249252319, + "completion_length": 261.8125, + "epoch": 0.0019083969465648854, + "grad_norm": 2.2916808128356934, + "kl": 0.0, + "learning_rate": 5.981040045018438e-07, + "loss": 0.7557, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 0.7500999569892883, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": 0.0, + "sft_loss": 2.51896333694458, + "speech_entropy": 2.7437381744384766, + "speech_kl": 0.0, + "step": 6, + "text_entropy": 0.7866218686103821, + "text_kl": 0.0, + "total_entropy": 2.3357348442077637 + }, + { + "combined_loss": 0.7672804594039917, + "completion_length": 284.625, + "epoch": 0.0022264631043256997, + "grad_norm": 8.151514053344727, + "kl": 0.0, + "learning_rate": 6.495607655709434e-07, + "loss": 0.7673, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.36094391345977783, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 0.0, + "sft_loss": 2.5576014518737793, + "speech_entropy": 1.9880790710449219, + "speech_kl": 0.0, + "step": 7, + "text_entropy": 0.5499787926673889, + "text_kl": 0.0, + "total_entropy": 1.618296504020691 + }, + { + "combined_loss": 0.7481317520141602, + "completion_length": 526.5625, + "epoch": 0.002544529262086514, + "grad_norm": 2.451380491256714, + "kl": 0.0, + "learning_rate": 6.941346394792774e-07, + "loss": 0.7481, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.493772506713867, + "speech_entropy": 2.589114189147949, + "speech_kl": 0.0, + "step": 8, + "text_entropy": 0.6615394353866577, + "text_kl": 0.0, + "total_entropy": 2.1917660236358643 + }, + { + "combined_loss": 0.7818739414215088, + "completion_length": 261.625, + "epoch": 0.0028625954198473282, + "grad_norm": 2.807657480239868, + "kl": 0.0, + "learning_rate": 7.334515826841693e-07, + "loss": 0.7819, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.46360161900520325, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": 0.0, + "sft_loss": 2.6062464714050293, + "speech_entropy": 2.7798025608062744, + "speech_kl": 0.0, + "step": 9, + "text_entropy": 0.7685192823410034, + "text_kl": 0.0, + "total_entropy": 2.320415735244751 + }, + { + "combined_loss": 0.74410080909729, + "completion_length": 355.375, + "epoch": 0.0031806615776081423, + "grad_norm": 2.3492045402526855, + "kl": 0.0, + "learning_rate": 7.686217868402409e-07, + "loss": 0.7441, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.6921550035476685, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.4803357124328613, + "speech_entropy": 2.6616315841674805, + "speech_kl": 0.0, + "step": 10, + "text_entropy": 0.7898290753364563, + "text_kl": 0.0, + "total_entropy": 2.2620432376861572 + }, + { + "combined_loss": 0.8018359541893005, + "completion_length": 417.3125, + "epoch": 0.003498727735368957, + "grad_norm": 2.31730580329895, + "kl": 0.0, + "learning_rate": 8.004371064686714e-07, + "loss": 0.8018, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.8644567728042603, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.6727864742279053, + "speech_entropy": 2.7092902660369873, + "speech_kl": 0.0, + "step": 11, + "text_entropy": 0.7865443825721741, + "text_kl": 0.0, + "total_entropy": 2.3307557106018066 + }, + { + "combined_loss": 0.7841310501098633, + "completion_length": 271.0, + "epoch": 0.003816793893129771, + "grad_norm": 4.778654098510742, + "kl": 0.0, + "learning_rate": 8.29482217661603e-07, + "loss": 0.7841, + "num_samples": 1.0, + "reward": 2.65625, + "reward_std": 1.0673450231552124, + "rewards/gpt4o_holistic_reward": 2.65625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.613770008087158, + "speech_entropy": 2.814521312713623, + "speech_kl": 0.0, + "step": 12, + "text_entropy": 1.110842227935791, + "text_kl": 0.0, + "total_entropy": 2.2540793418884277 + }, + { + "combined_loss": 0.788222074508667, + "completion_length": 318.9375, + "epoch": 0.004134860050890585, + "grad_norm": 2.6159579753875732, + "kl": 0.0, + "learning_rate": 8.562011298888888e-07, + "loss": 0.7882, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.8315354585647583, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.627406597137451, + "speech_entropy": 2.816967725753784, + "speech_kl": 0.0, + "step": 13, + "text_entropy": 0.8139775395393372, + "text_kl": 0.0, + "total_entropy": 2.4237911701202393 + }, + { + "combined_loss": 0.7619365453720093, + "completion_length": 598.8125, + "epoch": 0.004452926208651399, + "grad_norm": 1.9425196647644043, + "kl": 0.0, + "learning_rate": 8.809389787307026e-07, + "loss": 0.7619, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.2196787595748901, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 2.60770320892334e-08, + "sft_loss": 2.539788246154785, + "speech_entropy": 2.607710838317871, + "speech_kl": 0.0, + "step": 14, + "text_entropy": 0.6120049357414246, + "text_kl": 0.0, + "total_entropy": 2.184032440185547 + }, + { + "combined_loss": 0.8240371942520142, + "completion_length": 376.6875, + "epoch": 0.004770992366412214, + "grad_norm": 2.834174871444702, + "kl": 0.0, + "learning_rate": 9.039693650225662e-07, + "loss": 0.824, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 1.0792241096496582, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.7467904090881348, + "speech_entropy": 2.7376527786254883, + "speech_kl": 0.0, + "step": 15, + "text_entropy": 1.3147271871566772, + "text_kl": 0.0, + "total_entropy": 2.462172031402588 + }, + { + "combined_loss": 0.7633702754974365, + "completion_length": 241.8125, + "epoch": 0.005089058524173028, + "grad_norm": 3.2188761234283447, + "kl": 0.0, + "learning_rate": 9.255128526390367e-07, + "loss": 0.7634, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.9856985807418823, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 0.0, + "sft_loss": 2.544567584991455, + "speech_entropy": 2.6439318656921387, + "speech_kl": 0.0, + "step": 16, + "text_entropy": 0.9647745490074158, + "text_kl": 0.0, + "total_entropy": 2.306708812713623 + }, + { + "combined_loss": 0.8270055651664734, + "completion_length": 534.5, + "epoch": 0.005407124681933842, + "grad_norm": 2.851463556289673, + "kl": 0.0, + "learning_rate": 9.45749848565416e-07, + "loss": 0.827, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 1.0000998973846436, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 0.0, + "sft_loss": 2.756685256958008, + "speech_entropy": 2.7267394065856934, + "speech_kl": 0.0, + "step": 17, + "text_entropy": 0.9119875431060791, + "text_kl": 0.0, + "total_entropy": 2.3539938926696777 + }, + { + "combined_loss": 0.8502093553543091, + "completion_length": 354.75, + "epoch": 0.0057251908396946565, + "grad_norm": 2.3651652336120605, + "kl": 0.0, + "learning_rate": 9.648297958439284e-07, + "loss": 0.8502, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.834031105041504, + "speech_entropy": 2.6965622901916504, + "speech_kl": 0.0, + "step": 18, + "text_entropy": 1.0353615283966064, + "text_kl": 0.0, + "total_entropy": 2.379913806915283 + }, + { + "combined_loss": 0.8145760297775269, + "completion_length": 331.375, + "epoch": 0.006043256997455471, + "grad_norm": 2.548919916152954, + "kl": 0.0, + "learning_rate": 9.828778776927557e-07, + "loss": 0.8146, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.6985008716583252, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.7152533531188965, + "speech_entropy": 2.7572903633117676, + "speech_kl": 0.0, + "step": 19, + "text_entropy": 0.8444140553474426, + "text_kl": 0.0, + "total_entropy": 2.3438541889190674 + }, + { + "combined_loss": 0.7225195169448853, + "completion_length": 397.3125, + "epoch": 0.006361323155216285, + "grad_norm": 2.0722339153289795, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.7225, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.8872368931770325, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.408398151397705, + "speech_entropy": 2.698948383331299, + "speech_kl": 0.0, + "step": 20, + "text_entropy": 0.8427600860595703, + "text_kl": 0.0, + "total_entropy": 2.3258702754974365 + }, + { + "combined_loss": 0.7821958661079407, + "completion_length": 306.125, + "epoch": 0.006679389312977099, + "grad_norm": 2.6868255138397217, + "kl": 0.0, + "learning_rate": 9.99999433562768e-07, + "loss": 0.7822, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 1.103813648223877, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.6073193550109863, + "speech_entropy": 2.7519540786743164, + "speech_kl": 0.0, + "step": 21, + "text_entropy": 0.9722496271133423, + "text_kl": 0.0, + "total_entropy": 2.407655715942383 + }, + { + "combined_loss": 0.6645753979682922, + "completion_length": 460.1875, + "epoch": 0.006997455470737914, + "grad_norm": 2.0238919258117676, + "kl": 0.0, + "learning_rate": 9.99997734252498e-07, + "loss": 0.6646, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 1.3854628801345825, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2152514457702637, + "speech_entropy": 2.5998382568359375, + "speech_kl": 0.0, + "step": 22, + "text_entropy": 0.31880295276641846, + "text_kl": 0.0, + "total_entropy": 2.1282958984375 + }, + { + "combined_loss": 0.7440503239631653, + "completion_length": 156.5, + "epoch": 0.007315521628498728, + "grad_norm": 2.6382734775543213, + "kl": 0.0, + "learning_rate": 9.999949020734677e-07, + "loss": 0.7441, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.9331126809120178, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.480167865753174, + "speech_entropy": 2.679046630859375, + "speech_kl": 0.0, + "step": 23, + "text_entropy": 0.8172353506088257, + "text_kl": 0.0, + "total_entropy": 2.290531873703003 + }, + { + "combined_loss": 0.8006659746170044, + "completion_length": 273.0, + "epoch": 0.007633587786259542, + "grad_norm": 2.4343767166137695, + "kl": 0.0, + "learning_rate": 9.999909370328077e-07, + "loss": 0.8007, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 1.058112621307373, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.668886423110962, + "speech_entropy": 2.74008846282959, + "speech_kl": 0.0, + "step": 24, + "text_entropy": 0.805307149887085, + "text_kl": 0.0, + "total_entropy": 2.3586955070495605 + }, + { + "combined_loss": 0.74156254529953, + "completion_length": 318.4375, + "epoch": 0.007951653944020356, + "grad_norm": 2.2340893745422363, + "kl": 0.0, + "learning_rate": 9.999858391404998e-07, + "loss": 0.7416, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.9063550233840942, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.4718751907348633, + "speech_entropy": 2.68562388420105, + "speech_kl": 0.0, + "step": 25, + "text_entropy": 0.8011962175369263, + "text_kl": 0.0, + "total_entropy": 2.301424503326416 + }, + { + "combined_loss": 0.7775212526321411, + "completion_length": 471.375, + "epoch": 0.00826972010178117, + "grad_norm": 1.9409180879592896, + "kl": 0.0, + "learning_rate": 9.999796084093777e-07, + "loss": 0.7775, + "num_samples": 1.0, + "reward": 3.90625, + "reward_std": 0.4376000165939331, + "rewards/gpt4o_holistic_reward": 3.90625, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.5917372703552246, + "speech_entropy": 2.6521334648132324, + "speech_kl": 0.0, + "step": 26, + "text_entropy": 0.755419909954071, + "text_kl": 0.0, + "total_entropy": 2.224979877471924 + }, + { + "combined_loss": 0.8118987083435059, + "completion_length": 381.25, + "epoch": 0.008587786259541985, + "grad_norm": 2.630601644515991, + "kl": 0.0, + "learning_rate": 9.999722448551275e-07, + "loss": 0.8119, + "num_samples": 1.0, + "reward": 3.03125, + "reward_std": 1.176088809967041, + "rewards/gpt4o_holistic_reward": 3.03125, + "rl_loss": 0.0, + "sft_loss": 2.706328868865967, + "speech_entropy": 2.7182955741882324, + "speech_kl": 0.0, + "step": 27, + "text_entropy": 0.9594783186912537, + "text_kl": 0.0, + "total_entropy": 2.36698055267334 + }, + { + "combined_loss": 0.7367856502532959, + "completion_length": 396.8125, + "epoch": 0.008905852417302799, + "grad_norm": 2.1226205825805664, + "kl": 0.0, + "learning_rate": 9.999637484962867e-07, + "loss": 0.7368, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.4559521675109863, + "speech_entropy": 2.6961069107055664, + "speech_kl": 0.0, + "step": 28, + "text_entropy": 0.5669960975646973, + "text_kl": 0.0, + "total_entropy": 2.260632038116455 + }, + { + "combined_loss": 0.7663246393203735, + "completion_length": 175.125, + "epoch": 0.009223918575063612, + "grad_norm": 1.594689965248108, + "kl": 0.0, + "learning_rate": 9.99954119354245e-07, + "loss": 0.7663, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 0.0, + "sft_loss": 2.554415225982666, + "speech_entropy": 2.819967746734619, + "speech_kl": 0.0, + "step": 29, + "text_entropy": 0.7421623468399048, + "text_kl": 0.0, + "total_entropy": 2.4235970973968506 + }, + { + "combined_loss": 0.746996283531189, + "completion_length": 368.8125, + "epoch": 0.009541984732824428, + "grad_norm": 3.2165513038635254, + "kl": 0.0, + "learning_rate": 9.999433574532437e-07, + "loss": 0.747, + "num_samples": 1.0, + "reward": 3.21875, + "reward_std": 1.240410566329956, + "rewards/gpt4o_holistic_reward": 3.21875, + "rl_loss": -2.60770320892334e-08, + "sft_loss": 2.48998761177063, + "speech_entropy": 2.657895088195801, + "speech_kl": 0.0, + "step": 30, + "text_entropy": 0.9665651321411133, + "text_kl": 0.0, + "total_entropy": 2.3054580688476562 + }, + { + "combined_loss": 0.7402773499488831, + "completion_length": 272.4375, + "epoch": 0.009860050890585241, + "grad_norm": 1.8161159753799438, + "kl": 0.0, + "learning_rate": 9.99931462820376e-07, + "loss": 0.7403, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.4675910472869873, + "speech_entropy": 2.647010326385498, + "speech_kl": 0.0, + "step": 31, + "text_entropy": 1.1171433925628662, + "text_kl": 0.0, + "total_entropy": 2.3485312461853027 + }, + { + "combined_loss": 0.7246508598327637, + "completion_length": 248.5, + "epoch": 0.010178117048346057, + "grad_norm": 2.0797693729400635, + "kl": 0.0, + "learning_rate": 9.999184354855866e-07, + "loss": 0.7247, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.6144567728042603, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4155025482177734, + "speech_entropy": 2.6706910133361816, + "speech_kl": 0.0, + "step": 32, + "text_entropy": 0.736768901348114, + "text_kl": 0.0, + "total_entropy": 2.322854995727539 + }, + { + "combined_loss": 0.6848204731941223, + "completion_length": 422.125, + "epoch": 0.01049618320610687, + "grad_norm": 2.8221709728240967, + "kl": 0.0, + "learning_rate": 9.999042754816715e-07, + "loss": 0.6848, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.172311544418335, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2827348709106445, + "speech_entropy": 2.7249035835266113, + "speech_kl": 0.0, + "step": 33, + "text_entropy": 1.2600326538085938, + "text_kl": 0.0, + "total_entropy": 2.3216300010681152 + }, + { + "combined_loss": 0.8137314319610596, + "completion_length": 450.6875, + "epoch": 0.010814249363867684, + "grad_norm": 1.7230591773986816, + "kl": 0.0, + "learning_rate": 9.99888982844279e-07, + "loss": 0.8137, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.42705631256103516, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 0.0, + "sft_loss": 2.7124381065368652, + "speech_entropy": 2.5579869747161865, + "speech_kl": 0.0, + "step": 34, + "text_entropy": 1.105285406112671, + "text_kl": 0.0, + "total_entropy": 2.2585811614990234 + }, + { + "combined_loss": 0.6867671012878418, + "completion_length": 296.0625, + "epoch": 0.0111323155216285, + "grad_norm": 2.008439064025879, + "kl": 0.0, + "learning_rate": 9.99872557611908e-07, + "loss": 0.6868, + "num_samples": 1.0, + "reward": 2.125, + "reward_std": 0.8536534309387207, + "rewards/gpt4o_holistic_reward": 2.125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.2892236709594727, + "speech_entropy": 2.624180316925049, + "speech_kl": 0.0, + "step": 35, + "text_entropy": 0.7666749358177185, + "text_kl": 0.0, + "total_entropy": 2.235846996307373 + }, + { + "combined_loss": 0.7378132343292236, + "completion_length": 518.0625, + "epoch": 0.011450381679389313, + "grad_norm": 1.8618037700653076, + "kl": 0.0, + "learning_rate": 9.99854999825909e-07, + "loss": 0.7378, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 0.8944376111030579, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": -1.30385160446167e-08, + "sft_loss": 2.4593772888183594, + "speech_entropy": 2.565446615219116, + "speech_kl": 0.0, + "step": 36, + "text_entropy": 0.864229679107666, + "text_kl": 0.0, + "total_entropy": 2.1934401988983154 + }, + { + "combined_loss": 0.7986043691635132, + "completion_length": 484.375, + "epoch": 0.011768447837150127, + "grad_norm": 1.954567551612854, + "kl": 0.0, + "learning_rate": 9.998363095304839e-07, + "loss": 0.7986, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -9.313225746154785e-09, + "sft_loss": 2.6620142459869385, + "speech_entropy": 2.6134657859802246, + "speech_kl": 0.0, + "step": 37, + "text_entropy": 0.908026933670044, + "text_kl": 0.0, + "total_entropy": 2.264120578765869 + }, + { + "combined_loss": 0.7768511176109314, + "completion_length": 327.375, + "epoch": 0.012086513994910942, + "grad_norm": 2.6139323711395264, + "kl": 0.0, + "learning_rate": 9.99816486772685e-07, + "loss": 0.7769, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.9717878103256226, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.589503765106201, + "speech_entropy": 2.7159786224365234, + "speech_kl": 0.0, + "step": 38, + "text_entropy": 1.0709924697875977, + "text_kl": 0.0, + "total_entropy": 2.3924551010131836 + }, + { + "combined_loss": 0.690489649772644, + "completion_length": 294.0, + "epoch": 0.012404580152671756, + "grad_norm": 2.4900975227355957, + "kl": 0.0, + "learning_rate": 9.997955316024167e-07, + "loss": 0.6905, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5774502754211426, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3016321659088135, + "speech_entropy": 2.509531021118164, + "speech_kl": 0.0, + "step": 39, + "text_entropy": 0.8610185980796814, + "text_kl": 0.0, + "total_entropy": 2.159456729888916 + }, + { + "combined_loss": 0.8176020383834839, + "completion_length": 299.9375, + "epoch": 0.01272264631043257, + "grad_norm": 2.29056453704834, + "kl": 0.0, + "learning_rate": 9.997734440724333e-07, + "loss": 0.8176, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.41377514600753784, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.7253401279449463, + "speech_entropy": 2.6276955604553223, + "speech_kl": 0.0, + "step": 40, + "text_entropy": 0.7860556840896606, + "text_kl": 0.0, + "total_entropy": 2.2730958461761475 + }, + { + "combined_loss": 0.6987892985343933, + "completion_length": 346.4375, + "epoch": 0.013040712468193385, + "grad_norm": 1.9701205492019653, + "kl": 0.0, + "learning_rate": 9.9975022423834e-07, + "loss": 0.6988, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.36445680260658264, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 2.3292975425720215, + "speech_entropy": 2.602095365524292, + "speech_kl": 0.0, + "step": 41, + "text_entropy": 0.8417441844940186, + "text_kl": 0.0, + "total_entropy": 2.2476534843444824 + }, + { + "combined_loss": 0.7409491539001465, + "completion_length": 319.1875, + "epoch": 0.013358778625954198, + "grad_norm": 2.334261894226074, + "kl": 0.0, + "learning_rate": 9.997258721585931e-07, + "loss": 0.7409, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.469830274581909, + "speech_entropy": 2.437565326690674, + "speech_kl": 0.0, + "step": 42, + "text_entropy": 0.7768386602401733, + "text_kl": 0.0, + "total_entropy": 2.0847883224487305 + }, + { + "combined_loss": 0.6795743107795715, + "completion_length": 245.375, + "epoch": 0.013676844783715014, + "grad_norm": 2.3061537742614746, + "kl": 0.0, + "learning_rate": 9.997003878944985e-07, + "loss": 0.6796, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.2652478218078613, + "speech_entropy": 2.602904796600342, + "speech_kl": 0.0, + "step": 43, + "text_entropy": 0.6788486242294312, + "text_kl": 0.0, + "total_entropy": 2.2193617820739746 + }, + { + "combined_loss": 0.7629357576370239, + "completion_length": 426.9375, + "epoch": 0.013994910941475827, + "grad_norm": 1.8447140455245972, + "kl": 0.0, + "learning_rate": 9.996737715102132e-07, + "loss": 0.7629, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.6978486180305481, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 1.6763806343078613e-08, + "sft_loss": 2.543118953704834, + "speech_entropy": 2.4126617908477783, + "speech_kl": 0.0, + "step": 44, + "text_entropy": 0.933107852935791, + "text_kl": 0.0, + "total_entropy": 2.1085753440856934 + }, + { + "combined_loss": 0.7338756322860718, + "completion_length": 539.75, + "epoch": 0.01431297709923664, + "grad_norm": 6.8181939125061035, + "kl": 0.0, + "learning_rate": 9.996460230727435e-07, + "loss": 0.7339, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.5728486180305481, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 0.0, + "sft_loss": 2.44625186920166, + "speech_entropy": 2.5158934593200684, + "speech_kl": 0.0, + "step": 45, + "text_entropy": 0.8566389083862305, + "text_kl": 0.0, + "total_entropy": 2.162264585494995 + }, + { + "combined_loss": 0.8030707240104675, + "completion_length": 346.375, + "epoch": 0.014631043256997456, + "grad_norm": 2.483579397201538, + "kl": 0.0, + "learning_rate": 9.996171426519463e-07, + "loss": 0.8031, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.7394567728042603, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": -2.60770320892334e-08, + "sft_loss": 2.6769022941589355, + "speech_entropy": 2.4880712032318115, + "speech_kl": 0.0, + "step": 46, + "text_entropy": 0.7922484874725342, + "text_kl": 0.0, + "total_entropy": 2.1411781311035156 + }, + { + "combined_loss": 0.721707820892334, + "completion_length": 403.5625, + "epoch": 0.01494910941475827, + "grad_norm": 1.8520557880401611, + "kl": 0.0, + "learning_rate": 9.995871303205279e-07, + "loss": 0.7217, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4056925773620605, + "speech_entropy": 2.421140193939209, + "speech_kl": 0.0, + "step": 47, + "text_entropy": 0.7047562003135681, + "text_kl": 0.0, + "total_entropy": 2.0640478134155273 + }, + { + "combined_loss": 0.7390530109405518, + "completion_length": 239.25, + "epoch": 0.015267175572519083, + "grad_norm": 2.3096864223480225, + "kl": 0.0, + "learning_rate": 9.995559861540447e-07, + "loss": 0.7391, + "num_samples": 1.0, + "reward": 2.5625, + "reward_std": 0.5194376111030579, + "rewards/gpt4o_holistic_reward": 2.5625, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.463510036468506, + "speech_entropy": 2.620394229888916, + "speech_kl": 0.0, + "step": 48, + "text_entropy": 1.4838829040527344, + "text_kl": 0.0, + "total_entropy": 2.386155605316162 + }, + { + "combined_loss": 0.8287366628646851, + "completion_length": 313.125, + "epoch": 0.015585241730279899, + "grad_norm": 2.5821146965026855, + "kl": 0.0, + "learning_rate": 9.995237102309018e-07, + "loss": 0.8287, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 1.2807698249816895, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.762455463409424, + "speech_entropy": 2.669032096862793, + "speech_kl": 0.0, + "step": 49, + "text_entropy": 1.0619488954544067, + "text_kl": 0.0, + "total_entropy": 2.372556447982788 + }, + { + "combined_loss": 0.6661741733551025, + "completion_length": 309.875, + "epoch": 0.015903307888040712, + "grad_norm": 1.8256869316101074, + "kl": 0.0, + "learning_rate": 9.994903026323536e-07, + "loss": 0.6662, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.220580577850342, + "speech_entropy": 2.4839134216308594, + "speech_kl": 0.0, + "step": 50, + "text_entropy": 0.8060861825942993, + "text_kl": 0.0, + "total_entropy": 2.165754556655884 + }, + { + "combined_loss": 0.7401760220527649, + "completion_length": 260.75, + "epoch": 0.016221374045801526, + "grad_norm": 2.784619092941284, + "kl": 0.0, + "learning_rate": 9.994557634425038e-07, + "loss": 0.7402, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 1.1831127405166626, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.4672532081604004, + "speech_entropy": 1.8801207542419434, + "speech_kl": 0.0, + "step": 51, + "text_entropy": 1.1698064804077148, + "text_kl": 0.0, + "total_entropy": 1.9170701503753662 + }, + { + "combined_loss": 0.7516125440597534, + "completion_length": 388.875, + "epoch": 0.01653944020356234, + "grad_norm": 1.9916623830795288, + "kl": 0.0, + "learning_rate": 9.994200927483053e-07, + "loss": 0.7516, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 0.9002986550331116, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.5053749084472656, + "speech_entropy": 2.393984794616699, + "speech_kl": 0.0, + "step": 52, + "text_entropy": 1.078744649887085, + "text_kl": 0.0, + "total_entropy": 2.1226654052734375 + }, + { + "combined_loss": 0.6983101963996887, + "completion_length": 421.9375, + "epoch": 0.016857506361323157, + "grad_norm": 2.4226467609405518, + "kl": 0.0, + "learning_rate": 9.993832906395582e-07, + "loss": 0.6983, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 0.0, + "sft_loss": 2.327700614929199, + "speech_entropy": 2.536123752593994, + "speech_kl": 0.0, + "step": 53, + "text_entropy": 1.049363136291504, + "text_kl": 0.0, + "total_entropy": 2.2463369369506836 + }, + { + "combined_loss": 0.738640308380127, + "completion_length": 277.5625, + "epoch": 0.01717557251908397, + "grad_norm": 2.4906787872314453, + "kl": 0.0, + "learning_rate": 9.993453572089124e-07, + "loss": 0.7386, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.7501000165939331, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 2.9802322387695312e-08, + "sft_loss": 2.46213436126709, + "speech_entropy": 2.3886947631835938, + "speech_kl": 0.0, + "step": 54, + "text_entropy": 1.1047334671020508, + "text_kl": 0.0, + "total_entropy": 2.162165641784668 + }, + { + "combined_loss": 0.6425143480300903, + "completion_length": 306.8125, + "epoch": 0.017493638676844784, + "grad_norm": 1.8969634771347046, + "kl": 0.0, + "learning_rate": 9.99306292551865e-07, + "loss": 0.6425, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 0.0, + "sft_loss": 2.141714572906494, + "speech_entropy": 2.4277968406677246, + "speech_kl": 0.0, + "step": 55, + "text_entropy": 0.7971184253692627, + "text_kl": 0.0, + "total_entropy": 2.088435173034668 + }, + { + "combined_loss": 0.5967005491256714, + "completion_length": 362.5, + "epoch": 0.017811704834605598, + "grad_norm": 1.6407321691513062, + "kl": 0.0, + "learning_rate": 9.99266096766761e-07, + "loss": 0.5967, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.7501000165939331, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 1.989001750946045, + "speech_entropy": 2.3562631607055664, + "speech_kl": 0.0, + "step": 56, + "text_entropy": 0.650113582611084, + "text_kl": 0.0, + "total_entropy": 1.9950945377349854 + }, + { + "combined_loss": 0.6596853137016296, + "completion_length": 356.0, + "epoch": 0.01812977099236641, + "grad_norm": 3.1980700492858887, + "kl": 0.0, + "learning_rate": 9.992247699547936e-07, + "loss": 0.6597, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 1.536826252937317, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.198951244354248, + "speech_entropy": 2.4273312091827393, + "speech_kl": 0.0, + "step": 57, + "text_entropy": 0.5911531448364258, + "text_kl": 0.0, + "total_entropy": 2.0504350662231445 + }, + { + "combined_loss": 0.7111167907714844, + "completion_length": 328.8125, + "epoch": 0.018447837150127225, + "grad_norm": 2.5813424587249756, + "kl": 0.0, + "learning_rate": 9.99182312220003e-07, + "loss": 0.7111, + "num_samples": 1.0, + "reward": 2.5625, + "reward_std": 1.2798004150390625, + "rewards/gpt4o_holistic_reward": 2.5625, + "rl_loss": 0.0, + "sft_loss": 2.370388984680176, + "speech_entropy": 2.387758731842041, + "speech_kl": 0.0, + "step": 58, + "text_entropy": 1.290541410446167, + "text_kl": 0.0, + "total_entropy": 2.1883113384246826 + }, + { + "combined_loss": 0.7116686105728149, + "completion_length": 483.125, + "epoch": 0.018765903307888042, + "grad_norm": 1.8333094120025635, + "kl": 0.0, + "learning_rate": 9.991387236692764e-07, + "loss": 0.7117, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.6038135886192322, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3722286224365234, + "speech_entropy": 2.387399196624756, + "speech_kl": 0.0, + "step": 59, + "text_entropy": 0.8359103798866272, + "text_kl": 0.0, + "total_entropy": 2.049403429031372 + }, + { + "combined_loss": 0.7444514036178589, + "completion_length": 254.6875, + "epoch": 0.019083969465648856, + "grad_norm": 2.073017120361328, + "kl": 0.0, + "learning_rate": 9.990940044123479e-07, + "loss": 0.7445, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.4815046787261963, + "speech_entropy": 2.459671974182129, + "speech_kl": 0.0, + "step": 60, + "text_entropy": 0.8760254383087158, + "text_kl": 0.0, + "total_entropy": 2.171876907348633 + }, + { + "combined_loss": 0.6521174907684326, + "completion_length": 459.6875, + "epoch": 0.01940203562340967, + "grad_norm": 2.9567947387695312, + "kl": 0.0, + "learning_rate": 9.990481545617983e-07, + "loss": 0.6521, + "num_samples": 1.0, + "reward": 2.90625, + "reward_std": 0.6609638333320618, + "rewards/gpt4o_holistic_reward": 2.90625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.173725128173828, + "speech_entropy": 1.9960131645202637, + "speech_kl": 0.0, + "step": 61, + "text_entropy": 0.9018564820289612, + "text_kl": 0.0, + "total_entropy": 1.8020646572113037 + }, + { + "combined_loss": 0.7371933460235596, + "completion_length": 338.1875, + "epoch": 0.019720101781170483, + "grad_norm": 1.8295475244522095, + "kl": 0.0, + "learning_rate": 9.990011742330542e-07, + "loss": 0.7372, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.8081126809120178, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 0.0, + "sft_loss": 2.4573111534118652, + "speech_entropy": 2.409938097000122, + "speech_kl": 0.0, + "step": 62, + "text_entropy": 1.2029674053192139, + "text_kl": 0.0, + "total_entropy": 2.186131238937378 + }, + { + "combined_loss": 0.748369574546814, + "completion_length": 248.625, + "epoch": 0.020038167938931296, + "grad_norm": 2.8785228729248047, + "kl": 0.0, + "learning_rate": 9.98953063544389e-07, + "loss": 0.7484, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 1.7174440622329712, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.49456524848938, + "speech_entropy": 2.4662866592407227, + "speech_kl": 0.0, + "step": 63, + "text_entropy": 1.4685275554656982, + "text_kl": 0.0, + "total_entropy": 2.278367280960083 + }, + { + "combined_loss": 0.668720006942749, + "completion_length": 369.25, + "epoch": 0.020356234096692113, + "grad_norm": 2.220876693725586, + "kl": 0.0, + "learning_rate": 9.989038226169207e-07, + "loss": 0.6687, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 1.019437551498413, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.2290663719177246, + "speech_entropy": 2.3302509784698486, + "speech_kl": 0.0, + "step": 64, + "text_entropy": 0.9201998710632324, + "text_kl": 0.0, + "total_entropy": 2.049210786819458 + }, + { + "combined_loss": 0.7535024881362915, + "completion_length": 459.625, + "epoch": 0.020674300254452927, + "grad_norm": 3.4402458667755127, + "kl": 0.0, + "learning_rate": 9.98853451574614e-07, + "loss": 0.7535, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.161826252937317, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.5116748809814453, + "speech_entropy": 2.1198697090148926, + "speech_kl": 0.0, + "step": 65, + "text_entropy": 0.7827379703521729, + "text_kl": 0.0, + "total_entropy": 1.855495572090149 + }, + { + "combined_loss": 0.6956632733345032, + "completion_length": 267.0, + "epoch": 0.02099236641221374, + "grad_norm": 2.199934244155884, + "kl": 0.0, + "learning_rate": 9.988019505442775e-07, + "loss": 0.6957, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.42705631256103516, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": -9.313225746154785e-09, + "sft_loss": 2.3188774585723877, + "speech_entropy": 2.392827033996582, + "speech_kl": 0.0, + "step": 66, + "text_entropy": 1.4710502624511719, + "text_kl": 0.0, + "total_entropy": 2.220284938812256 + }, + { + "combined_loss": 0.6765873432159424, + "completion_length": 389.0, + "epoch": 0.021310432569974554, + "grad_norm": 2.0207324028015137, + "kl": 0.0, + "learning_rate": 9.987493196555649e-07, + "loss": 0.6766, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.9565354585647583, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.255290985107422, + "speech_entropy": 2.4497034549713135, + "speech_kl": 0.0, + "step": 67, + "text_entropy": 0.9174681901931763, + "text_kl": 0.0, + "total_entropy": 2.17930006980896 + }, + { + "combined_loss": 0.7614186406135559, + "completion_length": 355.125, + "epoch": 0.021628498727735368, + "grad_norm": 2.724764585494995, + "kl": 0.0, + "learning_rate": 9.986955590409747e-07, + "loss": 0.7614, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.9788135290145874, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.53806209564209, + "speech_entropy": 2.3254313468933105, + "speech_kl": 0.0, + "step": 68, + "text_entropy": 0.8083021640777588, + "text_kl": 0.0, + "total_entropy": 2.063969612121582 + }, + { + "combined_loss": 0.6472816467285156, + "completion_length": 407.5, + "epoch": 0.02194656488549618, + "grad_norm": 1.8889881372451782, + "kl": 0.0, + "learning_rate": 9.986406688358491e-07, + "loss": 0.6473, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.2387304306030273, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1576054096221924, + "speech_entropy": 2.329939842224121, + "speech_kl": 0.0, + "step": 69, + "text_entropy": 0.53284752368927, + "text_kl": 0.0, + "total_entropy": 1.9823434352874756 + }, + { + "combined_loss": 0.8229079842567444, + "completion_length": 370.9375, + "epoch": 0.022264631043257, + "grad_norm": 2.0763661861419678, + "kl": 0.0, + "learning_rate": 9.98584649178374e-07, + "loss": 0.8229, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.8538135886192322, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.7430264949798584, + "speech_entropy": 2.232337474822998, + "speech_kl": 0.0, + "step": 70, + "text_entropy": 1.1738722324371338, + "text_kl": 0.0, + "total_entropy": 2.027456283569336 + }, + { + "combined_loss": 0.7078136205673218, + "completion_length": 402.1875, + "epoch": 0.022582697201017812, + "grad_norm": 2.2039167881011963, + "kl": 0.0, + "learning_rate": 9.985275002095789e-07, + "loss": 0.7078, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.183112621307373, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 2.3593788146972656, + "speech_entropy": 2.290937900543213, + "speech_kl": 0.0, + "step": 71, + "text_entropy": 1.0952609777450562, + "text_kl": 0.0, + "total_entropy": 2.054002285003662 + }, + { + "combined_loss": 0.7774462699890137, + "completion_length": 370.0, + "epoch": 0.022900763358778626, + "grad_norm": 2.0490500926971436, + "kl": 0.0, + "learning_rate": 9.984692220733363e-07, + "loss": 0.7774, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.7126991748809814, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.591487407684326, + "speech_entropy": 2.2809252738952637, + "speech_kl": 0.0, + "step": 72, + "text_entropy": 1.4932012557983398, + "text_kl": 0.0, + "total_entropy": 2.1317849159240723 + }, + { + "combined_loss": 0.7310476899147034, + "completion_length": 416.125, + "epoch": 0.02321882951653944, + "grad_norm": 1.7452352046966553, + "kl": 0.0, + "learning_rate": 9.984098149163612e-07, + "loss": 0.731, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.5520563125610352, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 0.0, + "sft_loss": 2.4368255138397217, + "speech_entropy": 2.2939229011535645, + "speech_kl": 0.0, + "step": 73, + "text_entropy": 1.1391348838806152, + "text_kl": 0.0, + "total_entropy": 2.0588903427124023 + }, + { + "combined_loss": 0.6863812208175659, + "completion_length": 426.4375, + "epoch": 0.023536895674300253, + "grad_norm": 1.8024567365646362, + "kl": 0.0, + "learning_rate": 9.98349278888211e-07, + "loss": 0.6864, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": 0.0, + "sft_loss": 2.2879374027252197, + "speech_entropy": 1.6559635400772095, + "speech_kl": 0.0, + "step": 74, + "text_entropy": 0.796265184879303, + "text_kl": 0.0, + "total_entropy": 1.6575778722763062 + }, + { + "combined_loss": 0.6243323683738708, + "completion_length": 371.375, + "epoch": 0.02385496183206107, + "grad_norm": 1.5667005777359009, + "kl": 0.0, + "learning_rate": 9.982876141412855e-07, + "loss": 0.6243, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0811080932617188, + "speech_entropy": 2.248847723007202, + "speech_kl": 0.0, + "step": 75, + "text_entropy": 0.7592964172363281, + "text_kl": 0.0, + "total_entropy": 1.9378535747528076 + }, + { + "combined_loss": 0.745708703994751, + "completion_length": 340.0, + "epoch": 0.024173027989821884, + "grad_norm": 2.7676212787628174, + "kl": 0.0, + "learning_rate": 9.982248208308253e-07, + "loss": 0.7457, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 1.1250998973846436, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 1.30385160446167e-08, + "sft_loss": 2.4856958389282227, + "speech_entropy": 2.3308167457580566, + "speech_kl": 0.0, + "step": 76, + "text_entropy": 1.140113353729248, + "text_kl": 0.0, + "total_entropy": 2.1104745864868164 + }, + { + "combined_loss": 0.6474106907844543, + "completion_length": 482.3125, + "epoch": 0.024491094147582698, + "grad_norm": 2.045401096343994, + "kl": 0.0, + "learning_rate": 9.981608991149123e-07, + "loss": 0.6474, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 1.375100016593933, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.1580355167388916, + "speech_entropy": 2.319225311279297, + "speech_kl": 0.0, + "step": 77, + "text_entropy": 0.8749170899391174, + "text_kl": 0.0, + "total_entropy": 2.047513723373413 + }, + { + "combined_loss": 0.7499513626098633, + "completion_length": 317.375, + "epoch": 0.02480916030534351, + "grad_norm": 2.194958209991455, + "kl": 0.0, + "learning_rate": 9.980958491544697e-07, + "loss": 0.75, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": -5.587935447692871e-09, + "sft_loss": 2.499837875366211, + "speech_entropy": 2.3051891326904297, + "speech_kl": 0.0, + "step": 78, + "text_entropy": 1.2871158123016357, + "text_kl": 0.0, + "total_entropy": 2.1196823120117188 + }, + { + "combined_loss": 0.686776876449585, + "completion_length": 423.0, + "epoch": 0.025127226463104325, + "grad_norm": 2.2229573726654053, + "kl": 0.0, + "learning_rate": 9.980296711132606e-07, + "loss": 0.6868, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 1.2525264024734497, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.2892560958862305, + "speech_entropy": 2.3141417503356934, + "speech_kl": 0.0, + "step": 79, + "text_entropy": 1.1506476402282715, + "text_kl": 0.0, + "total_entropy": 2.09428071975708 + }, + { + "combined_loss": 0.7100934386253357, + "completion_length": 345.375, + "epoch": 0.02544529262086514, + "grad_norm": 2.2556166648864746, + "kl": 0.0, + "learning_rate": 9.97962365157888e-07, + "loss": 0.7101, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 1.7286533117294312, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.366978168487549, + "speech_entropy": 2.3218116760253906, + "speech_kl": 0.0, + "step": 80, + "text_entropy": 1.308366060256958, + "text_kl": 0.0, + "total_entropy": 2.1330385208129883 + }, + { + "combined_loss": 0.7132716774940491, + "completion_length": 592.8125, + "epoch": 0.025763358778625955, + "grad_norm": 2.3514413833618164, + "kl": 0.0, + "learning_rate": 9.97893931457795e-07, + "loss": 0.7133, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 1.0983424186706543, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.3775720596313477, + "speech_entropy": 2.166839838027954, + "speech_kl": 0.0, + "step": 81, + "text_entropy": 0.6873091459274292, + "text_kl": 0.0, + "total_entropy": 1.8318581581115723 + }, + { + "combined_loss": 0.7247699499130249, + "completion_length": 368.875, + "epoch": 0.02608142493638677, + "grad_norm": 2.5765998363494873, + "kl": 0.0, + "learning_rate": 9.978243701852625e-07, + "loss": 0.7248, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.0000998973846436, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4158997535705566, + "speech_entropy": 2.3246705532073975, + "speech_kl": 0.0, + "step": 82, + "text_entropy": 1.0593510866165161, + "text_kl": 0.0, + "total_entropy": 1.8370463848114014 + }, + { + "combined_loss": 0.7713165879249573, + "completion_length": 510.5, + "epoch": 0.026399491094147583, + "grad_norm": 2.231696844100952, + "kl": 0.0, + "learning_rate": 9.977536815154117e-07, + "loss": 0.7713, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.7180101871490479, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.5710554122924805, + "speech_entropy": 2.2227771282196045, + "speech_kl": 0.0, + "step": 83, + "text_entropy": 0.9712114334106445, + "text_kl": 0.0, + "total_entropy": 1.9926376342773438 + }, + { + "combined_loss": 0.66883784532547, + "completion_length": 462.8125, + "epoch": 0.026717557251908396, + "grad_norm": 2.2363080978393555, + "kl": 0.0, + "learning_rate": 9.97681865626201e-07, + "loss": 0.6688, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.017488718032837, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.229459524154663, + "speech_entropy": 2.178924083709717, + "speech_kl": 0.0, + "step": 84, + "text_entropy": 0.9698714017868042, + "text_kl": 0.0, + "total_entropy": 1.9314519166946411 + }, + { + "combined_loss": 0.7741247415542603, + "completion_length": 383.75, + "epoch": 0.02703562340966921, + "grad_norm": 2.050072193145752, + "kl": 0.0, + "learning_rate": 9.97608922698427e-07, + "loss": 0.7741, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.1404881477355957, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.580415725708008, + "speech_entropy": 2.2788493633270264, + "speech_kl": 0.0, + "step": 85, + "text_entropy": 0.9160740971565247, + "text_kl": 0.0, + "total_entropy": 2.0253312587738037 + }, + { + "combined_loss": 0.6860091090202332, + "completion_length": 373.125, + "epoch": 0.027353689567430027, + "grad_norm": 3.053793430328369, + "kl": 0.0, + "learning_rate": 9.975348529157229e-07, + "loss": 0.686, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.8944376111030579, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2866969108581543, + "speech_entropy": 2.2216386795043945, + "speech_kl": 0.0, + "step": 86, + "text_entropy": 0.9611391425132751, + "text_kl": 0.0, + "total_entropy": 1.9948720932006836 + }, + { + "combined_loss": 0.7096176147460938, + "completion_length": 320.625, + "epoch": 0.02767175572519084, + "grad_norm": 2.05188250541687, + "kl": 0.0, + "learning_rate": 9.974596564645598e-07, + "loss": 0.7096, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5520563125610352, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.365391969680786, + "speech_entropy": 2.3104031085968018, + "speech_kl": 0.0, + "step": 87, + "text_entropy": 1.122492790222168, + "text_kl": 0.0, + "total_entropy": 2.0940003395080566 + }, + { + "combined_loss": 0.6567751169204712, + "completion_length": 530.125, + "epoch": 0.027989821882951654, + "grad_norm": 1.9356623888015747, + "kl": 0.0, + "learning_rate": 9.973833335342446e-07, + "loss": 0.6568, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.807937741279602, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1892502307891846, + "speech_entropy": 2.227321147918701, + "speech_kl": 0.0, + "step": 88, + "text_entropy": 1.0134230852127075, + "text_kl": 0.0, + "total_entropy": 1.9499976634979248 + }, + { + "combined_loss": 0.7411503791809082, + "completion_length": 485.9375, + "epoch": 0.028307888040712468, + "grad_norm": 1.8700544834136963, + "kl": 0.0, + "learning_rate": 9.9730588431692e-07, + "loss": 0.7412, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": 5.587935447692871e-09, + "sft_loss": 2.47050142288208, + "speech_entropy": 2.357027530670166, + "speech_kl": 0.0, + "step": 89, + "text_entropy": 0.865372896194458, + "text_kl": 0.0, + "total_entropy": 2.083202600479126 + }, + { + "combined_loss": 0.6864016056060791, + "completion_length": 444.875, + "epoch": 0.02862595419847328, + "grad_norm": 1.8632216453552246, + "kl": 0.0, + "learning_rate": 9.972273090075645e-07, + "loss": 0.6864, + "num_samples": 1.0, + "reward": 2.90625, + "reward_std": 0.7911534309387207, + "rewards/gpt4o_holistic_reward": 2.90625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2880053520202637, + "speech_entropy": 2.133165121078491, + "speech_kl": 0.0, + "step": 90, + "text_entropy": 0.9260656833648682, + "text_kl": 0.0, + "total_entropy": 1.80254328250885 + }, + { + "combined_loss": 0.6964578628540039, + "completion_length": 433.9375, + "epoch": 0.028944020356234095, + "grad_norm": 1.954607367515564, + "kl": 0.0, + "learning_rate": 9.97147607803991e-07, + "loss": 0.6965, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.6983708143234253, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.321526050567627, + "speech_entropy": 2.222456455230713, + "speech_kl": 0.0, + "step": 91, + "text_entropy": 1.1704072952270508, + "text_kl": 0.0, + "total_entropy": 1.9853535890579224 + }, + { + "combined_loss": 0.7411965131759644, + "completion_length": 500.125, + "epoch": 0.029262086513994912, + "grad_norm": 2.046849250793457, + "kl": 0.0, + "learning_rate": 9.970667809068474e-07, + "loss": 0.7412, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.8678992986679077, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.4706549644470215, + "speech_entropy": 2.2015461921691895, + "speech_kl": 0.0, + "step": 92, + "text_entropy": 1.1552786827087402, + "text_kl": 0.0, + "total_entropy": 2.0065386295318604 + }, + { + "combined_loss": 0.7156788110733032, + "completion_length": 403.125, + "epoch": 0.029580152671755726, + "grad_norm": 2.1912262439727783, + "kl": 0.0, + "learning_rate": 9.969848285196157e-07, + "loss": 0.7157, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.4470233917236328, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3855957984924316, + "speech_entropy": 2.2147631645202637, + "speech_kl": 0.0, + "step": 93, + "text_entropy": 1.0130202770233154, + "text_kl": 0.0, + "total_entropy": 1.9480912685394287 + }, + { + "combined_loss": 0.660508394241333, + "completion_length": 303.5, + "epoch": 0.02989821882951654, + "grad_norm": 2.6118781566619873, + "kl": 0.0, + "learning_rate": 9.969017508486105e-07, + "loss": 0.6605, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 1.2233422994613647, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 0.0, + "sft_loss": 2.2016944885253906, + "speech_entropy": 2.45943284034729, + "speech_kl": 0.0, + "step": 94, + "text_entropy": 1.1381962299346924, + "text_kl": 0.0, + "total_entropy": 2.197706460952759 + }, + { + "combined_loss": 0.7968100309371948, + "completion_length": 326.6875, + "epoch": 0.030216284987277353, + "grad_norm": 2.501716375350952, + "kl": 0.0, + "learning_rate": 9.968175481029798e-07, + "loss": 0.7968, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.7887751460075378, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.656033515930176, + "speech_entropy": 2.389923334121704, + "speech_kl": 0.0, + "step": 95, + "text_entropy": 1.3742587566375732, + "text_kl": 0.0, + "total_entropy": 2.177299976348877 + }, + { + "combined_loss": 0.7844936847686768, + "completion_length": 325.375, + "epoch": 0.030534351145038167, + "grad_norm": 2.390887498855591, + "kl": 0.0, + "learning_rate": 9.967322204947038e-07, + "loss": 0.7845, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.3041632175445557, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.6149790287017822, + "speech_entropy": 2.296114206314087, + "speech_kl": 0.0, + "step": 96, + "text_entropy": 1.206712245941162, + "text_kl": 0.0, + "total_entropy": 2.0752415657043457 + }, + { + "combined_loss": 0.7896230220794678, + "completion_length": 317.3125, + "epoch": 0.030852417302798984, + "grad_norm": 2.363767147064209, + "kl": 0.0, + "learning_rate": 9.96645768238595e-07, + "loss": 0.7896, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 1.0173285007476807, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.6320767402648926, + "speech_entropy": 2.3654651641845703, + "speech_kl": 0.0, + "step": 97, + "text_entropy": 1.4019675254821777, + "text_kl": 0.0, + "total_entropy": 2.174586296081543 + }, + { + "combined_loss": 0.6473546028137207, + "completion_length": 494.8125, + "epoch": 0.031170483460559797, + "grad_norm": 1.823893427848816, + "kl": 0.0, + "learning_rate": 9.965581915522964e-07, + "loss": 0.6474, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.9981511235237122, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.157848358154297, + "speech_entropy": 2.229156494140625, + "speech_kl": 0.0, + "step": 98, + "text_entropy": 1.1541342735290527, + "text_kl": 0.0, + "total_entropy": 2.026672840118408 + }, + { + "combined_loss": 0.7282466292381287, + "completion_length": 373.5625, + "epoch": 0.03148854961832061, + "grad_norm": 2.1396214962005615, + "kl": 0.0, + "learning_rate": 9.964694906562826e-07, + "loss": 0.7282, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.5281319618225098, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 0.0, + "sft_loss": 2.4274885654449463, + "speech_entropy": 2.2445669174194336, + "speech_kl": 0.0, + "step": 99, + "text_entropy": 1.1222586631774902, + "text_kl": 0.0, + "total_entropy": 2.014863967895508 + }, + { + "combined_loss": 0.7340562343597412, + "completion_length": 360.875, + "epoch": 0.031806615776081425, + "grad_norm": 2.1381564140319824, + "kl": 0.0, + "learning_rate": 9.96379665773858e-07, + "loss": 0.7341, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.7394567728042603, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.4468541145324707, + "speech_entropy": 2.292088270187378, + "speech_kl": 0.0, + "step": 100, + "text_entropy": 1.465188980102539, + "text_kl": 0.0, + "total_entropy": 2.1374363899230957 + }, + { + "combined_loss": 0.7976064682006836, + "completion_length": 296.25, + "epoch": 0.03212468193384224, + "grad_norm": 1.9820961952209473, + "kl": 0.0, + "learning_rate": 9.962887171311562e-07, + "loss": 0.7976, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.6586880683898926, + "speech_entropy": 2.284330368041992, + "speech_kl": 0.0, + "step": 101, + "text_entropy": 1.2345999479293823, + "text_kl": 0.0, + "total_entropy": 2.0803914070129395 + }, + { + "combined_loss": 0.6758935451507568, + "completion_length": 429.5625, + "epoch": 0.03244274809160305, + "grad_norm": 2.111237049102783, + "kl": 0.0, + "learning_rate": 9.961966449571407e-07, + "loss": 0.6759, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 1.5787245035171509, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": -1.862645149230957e-08, + "sft_loss": 2.2529783248901367, + "speech_entropy": 2.263260841369629, + "speech_kl": 0.0, + "step": 102, + "text_entropy": 0.8939234614372253, + "text_kl": 0.0, + "total_entropy": 1.9995882511138916 + }, + { + "combined_loss": 0.7361711263656616, + "completion_length": 395.9375, + "epoch": 0.03276081424936387, + "grad_norm": 1.850319743156433, + "kl": 0.0, + "learning_rate": 9.961034494836029e-07, + "loss": 0.7362, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.614456832408905, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4539036750793457, + "speech_entropy": 2.2334747314453125, + "speech_kl": 0.0, + "step": 103, + "text_entropy": 1.248520851135254, + "text_kl": 0.0, + "total_entropy": 2.0419020652770996 + }, + { + "combined_loss": 0.6475083827972412, + "completion_length": 475.625, + "epoch": 0.03307888040712468, + "grad_norm": 1.9739583730697632, + "kl": 0.0, + "learning_rate": 9.960091309451625e-07, + "loss": 0.6475, + "num_samples": 1.0, + "reward": 2.375, + "reward_std": 1.1752138137817383, + "rewards/gpt4o_holistic_reward": 2.375, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.1583614349365234, + "speech_entropy": 2.26711368560791, + "speech_kl": 0.0, + "step": 104, + "text_entropy": 0.8396031856536865, + "text_kl": 0.0, + "total_entropy": 1.9932773113250732 + }, + { + "combined_loss": 0.7815308570861816, + "completion_length": 394.0625, + "epoch": 0.033396946564885496, + "grad_norm": 2.9025793075561523, + "kl": 0.0, + "learning_rate": 9.95913689579266e-07, + "loss": 0.7815, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.704224169254303, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.6051025390625, + "speech_entropy": 2.256810188293457, + "speech_kl": 0.0, + "step": 105, + "text_entropy": 1.4908084869384766, + "text_kl": 0.0, + "total_entropy": 2.115090847015381 + }, + { + "combined_loss": 0.7154955863952637, + "completion_length": 475.25, + "epoch": 0.03371501272264631, + "grad_norm": 1.8907688856124878, + "kl": 0.0, + "learning_rate": 9.958171256261873e-07, + "loss": 0.7155, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.7587944269180298, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 5.587935447692871e-09, + "sft_loss": 2.3849852085113525, + "speech_entropy": 2.25828218460083, + "speech_kl": 0.0, + "step": 106, + "text_entropy": 1.0575703382492065, + "text_kl": 0.0, + "total_entropy": 2.039116144180298 + }, + { + "combined_loss": 0.6845788955688477, + "completion_length": 512.5625, + "epoch": 0.034033078880407124, + "grad_norm": 1.646696925163269, + "kl": 0.0, + "learning_rate": 9.957194393290259e-07, + "loss": 0.6846, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.28877514600753784, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": 0.0, + "sft_loss": 2.2819294929504395, + "speech_entropy": 2.166106700897217, + "speech_kl": 0.0, + "step": 107, + "text_entropy": 0.6757108569145203, + "text_kl": 0.0, + "total_entropy": 1.8433654308319092 + }, + { + "combined_loss": 0.7150195837020874, + "completion_length": 491.6875, + "epoch": 0.03435114503816794, + "grad_norm": 1.833224892616272, + "kl": 0.0, + "learning_rate": 9.956206309337066e-07, + "loss": 0.715, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.5581126809120178, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 0.0, + "sft_loss": 2.3833985328674316, + "speech_entropy": 2.1860873699188232, + "speech_kl": 0.0, + "step": 108, + "text_entropy": 1.2409619092941284, + "text_kl": 0.0, + "total_entropy": 2.0030808448791504 + }, + { + "combined_loss": 0.6441072821617126, + "completion_length": 293.3125, + "epoch": 0.03466921119592875, + "grad_norm": 2.175922393798828, + "kl": 0.0, + "learning_rate": 9.9552070068898e-07, + "loss": 0.6441, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.147024154663086, + "speech_entropy": 2.2451610565185547, + "speech_kl": 0.0, + "step": 109, + "text_entropy": 1.1438672542572021, + "text_kl": 0.0, + "total_entropy": 2.044260025024414 + }, + { + "combined_loss": 0.6975011825561523, + "completion_length": 398.8125, + "epoch": 0.03498727735368957, + "grad_norm": 1.7699748277664185, + "kl": 0.0, + "learning_rate": 9.954196488464196e-07, + "loss": 0.6975, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.3250038623809814, + "speech_entropy": 2.3067374229431152, + "speech_kl": 0.0, + "step": 110, + "text_entropy": 1.097214937210083, + "text_kl": 0.0, + "total_entropy": 2.070528268814087 + }, + { + "combined_loss": 0.6993280649185181, + "completion_length": 462.0625, + "epoch": 0.035305343511450385, + "grad_norm": 1.6435576677322388, + "kl": 0.0, + "learning_rate": 9.953174756604242e-07, + "loss": 0.6993, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.8483423590660095, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.3310935497283936, + "speech_entropy": 2.2712697982788086, + "speech_kl": 0.0, + "step": 111, + "text_entropy": 0.8282650113105774, + "text_kl": 0.0, + "total_entropy": 1.9767301082611084 + }, + { + "combined_loss": 0.6761323809623718, + "completion_length": 294.25, + "epoch": 0.035623409669211195, + "grad_norm": 2.934861183166504, + "kl": 0.0, + "learning_rate": 9.95214181388214e-07, + "loss": 0.6761, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 1.14496648311615, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.253774404525757, + "speech_entropy": 2.4185285568237305, + "speech_kl": 0.0, + "step": 112, + "text_entropy": 1.202892541885376, + "text_kl": 0.0, + "total_entropy": 2.171070098876953 + }, + { + "combined_loss": 0.7770819067955017, + "completion_length": 376.625, + "epoch": 0.03594147582697201, + "grad_norm": 73.62300872802734, + "kl": 0.0, + "learning_rate": 9.951097662898325e-07, + "loss": 0.7771, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.590272903442383, + "speech_entropy": 2.2988834381103516, + "speech_kl": 0.0, + "step": 113, + "text_entropy": 1.125475287437439, + "text_kl": 0.0, + "total_entropy": 2.0705349445343018 + }, + { + "combined_loss": 0.6613748073577881, + "completion_length": 509.4375, + "epoch": 0.03625954198473282, + "grad_norm": 2.1758439540863037, + "kl": 0.0, + "learning_rate": 9.950042306281445e-07, + "loss": 0.6614, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 1.4686723947525024, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.204582691192627, + "speech_entropy": 2.27337908744812, + "speech_kl": 0.0, + "step": 114, + "text_entropy": 0.9519417881965637, + "text_kl": 0.0, + "total_entropy": 2.023104667663574 + }, + { + "combined_loss": 0.6832470893859863, + "completion_length": 496.5625, + "epoch": 0.03657760814249364, + "grad_norm": 9.72231674194336, + "kl": 0.0, + "learning_rate": 9.94897574668836e-07, + "loss": 0.6832, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.2774901390075684, + "speech_entropy": 2.29954195022583, + "speech_kl": 0.0, + "step": 115, + "text_entropy": 1.2875080108642578, + "text_kl": 0.0, + "total_entropy": 2.0839648246765137 + }, + { + "combined_loss": 0.6530779600143433, + "completion_length": 418.875, + "epoch": 0.03689567430025445, + "grad_norm": 2.1920166015625, + "kl": 0.0, + "learning_rate": 9.94789798680413e-07, + "loss": 0.6531, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.23945678770542145, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 0.0, + "sft_loss": 2.176926612854004, + "speech_entropy": 2.3347601890563965, + "speech_kl": 0.0, + "step": 116, + "text_entropy": 1.2426201105117798, + "text_kl": 0.0, + "total_entropy": 2.1373891830444336 + }, + { + "combined_loss": 0.6800841689109802, + "completion_length": 381.5625, + "epoch": 0.03721374045801527, + "grad_norm": 2.2477047443389893, + "kl": 0.0, + "learning_rate": 9.94680902934202e-07, + "loss": 0.6801, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.9940415620803833, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.2669472694396973, + "speech_entropy": 2.3144116401672363, + "speech_kl": 0.0, + "step": 117, + "text_entropy": 1.3952128887176514, + "text_kl": 0.0, + "total_entropy": 2.1378135681152344 + }, + { + "combined_loss": 0.7205807566642761, + "completion_length": 266.75, + "epoch": 0.037531806615776084, + "grad_norm": 2.3268606662750244, + "kl": 0.0, + "learning_rate": 9.94570887704347e-07, + "loss": 0.7206, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 1.2288135290145874, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4019358158111572, + "speech_entropy": 2.3998208045959473, + "speech_kl": 0.0, + "step": 118, + "text_entropy": 1.6290910243988037, + "text_kl": 0.0, + "total_entropy": 2.2556941509246826 + }, + { + "combined_loss": 0.7098033428192139, + "completion_length": 314.0, + "epoch": 0.037849872773536894, + "grad_norm": 2.0043420791625977, + "kl": 0.0, + "learning_rate": 9.944597532678119e-07, + "loss": 0.7098, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -1.862645149230957e-08, + "sft_loss": 2.366011142730713, + "speech_entropy": 2.3302297592163086, + "speech_kl": 0.0, + "step": 119, + "text_entropy": 1.2116031646728516, + "text_kl": 0.0, + "total_entropy": 2.1241815090179443 + }, + { + "combined_loss": 0.7127311825752258, + "completion_length": 565.0625, + "epoch": 0.03816793893129771, + "grad_norm": 1.6876592636108398, + "kl": 0.0, + "learning_rate": 9.943474999043775e-07, + "loss": 0.7127, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.7235617637634277, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3757705688476562, + "speech_entropy": 2.1477317810058594, + "speech_kl": 0.0, + "step": 120, + "text_entropy": 0.8258525133132935, + "text_kl": 0.0, + "total_entropy": 1.8741722106933594 + }, + { + "combined_loss": 0.6518399715423584, + "completion_length": 460.875, + "epoch": 0.03848600508905852, + "grad_norm": 1.9149906635284424, + "kl": 0.0, + "learning_rate": 9.94234127896641e-07, + "loss": 0.6518, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 1.2335585355758667, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.172799825668335, + "speech_entropy": 2.247347593307495, + "speech_kl": 0.0, + "step": 121, + "text_entropy": 0.7082687020301819, + "text_kl": 0.0, + "total_entropy": 1.955895185470581 + }, + { + "combined_loss": 0.6932583451271057, + "completion_length": 415.9375, + "epoch": 0.03880407124681934, + "grad_norm": 1.803132176399231, + "kl": 0.0, + "learning_rate": 9.94119637530017e-07, + "loss": 0.6933, + "num_samples": 1.0, + "reward": 2.5625, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 2.5625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.310861110687256, + "speech_entropy": 2.2421672344207764, + "speech_kl": 0.0, + "step": 122, + "text_entropy": 1.031731367111206, + "text_kl": 0.0, + "total_entropy": 2.0151727199554443 + }, + { + "combined_loss": 0.6039379239082336, + "completion_length": 606.0625, + "epoch": 0.039122137404580155, + "grad_norm": 1.545535683631897, + "kl": 0.0, + "learning_rate": 9.940040290927343e-07, + "loss": 0.6039, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 1.2654881477355957, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.0131263732910156, + "speech_entropy": 2.1528759002685547, + "speech_kl": 0.0, + "step": 123, + "text_entropy": 0.7468642592430115, + "text_kl": 0.0, + "total_entropy": 1.8691860437393188 + }, + { + "combined_loss": 0.637839674949646, + "completion_length": 377.375, + "epoch": 0.039440203562340966, + "grad_norm": 1.8493309020996094, + "kl": 0.0, + "learning_rate": 9.938873028758374e-07, + "loss": 0.6378, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 1.0646765232086182, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.126132011413574, + "speech_entropy": 2.3589277267456055, + "speech_kl": 0.0, + "step": 124, + "text_entropy": 1.0232412815093994, + "text_kl": 0.0, + "total_entropy": 2.1352920532226562 + }, + { + "combined_loss": 0.6103811860084534, + "completion_length": 330.75, + "epoch": 0.03975826972010178, + "grad_norm": 1.9315379858016968, + "kl": 0.0, + "learning_rate": 9.93769459173184e-07, + "loss": 0.6104, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 2.60770320892334e-08, + "sft_loss": 2.0346038341522217, + "speech_entropy": 2.236910343170166, + "speech_kl": 0.0, + "step": 125, + "text_entropy": 0.7762662172317505, + "text_kl": 0.0, + "total_entropy": 1.9460570812225342 + }, + { + "combined_loss": 0.6840636730194092, + "completion_length": 393.25, + "epoch": 0.04007633587786259, + "grad_norm": 2.2317352294921875, + "kl": 0.0, + "learning_rate": 9.936504982814457e-07, + "loss": 0.6841, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.579224169254303, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.28021240234375, + "speech_entropy": 2.2817130088806152, + "speech_kl": 0.0, + "step": 126, + "text_entropy": 0.9976484775543213, + "text_kl": 0.0, + "total_entropy": 2.0223848819732666 + }, + { + "combined_loss": 0.6637799739837646, + "completion_length": 422.9375, + "epoch": 0.04039440203562341, + "grad_norm": 1.987329125404358, + "kl": 0.0, + "learning_rate": 9.935304205001066e-07, + "loss": 0.6638, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.9001989364624023, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.212599754333496, + "speech_entropy": 2.2056527137756348, + "speech_kl": 0.0, + "step": 127, + "text_entropy": 0.9696847200393677, + "text_kl": 0.0, + "total_entropy": 1.965734839439392 + }, + { + "combined_loss": 0.7613767981529236, + "completion_length": 356.6875, + "epoch": 0.04071246819338423, + "grad_norm": 2.8389971256256104, + "kl": 0.0, + "learning_rate": 9.934092261314617e-07, + "loss": 0.7614, + "num_samples": 1.0, + "reward": 2.375, + "reward_std": 1.0087943077087402, + "rewards/gpt4o_holistic_reward": 2.375, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.5379223823547363, + "speech_entropy": 2.238985538482666, + "speech_kl": 0.0, + "step": 128, + "text_entropy": 0.8710644841194153, + "text_kl": 0.0, + "total_entropy": 1.9456055164337158 + }, + { + "combined_loss": 0.7112630009651184, + "completion_length": 368.5, + "epoch": 0.04103053435114504, + "grad_norm": 1.772113561630249, + "kl": 0.0, + "learning_rate": 9.932869154806185e-07, + "loss": 0.7113, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3708765506744385, + "speech_entropy": 2.2802562713623047, + "speech_kl": 0.0, + "step": 129, + "text_entropy": 1.0544356107711792, + "text_kl": 0.0, + "total_entropy": 2.0886013507843018 + }, + { + "combined_loss": 0.6580800414085388, + "completion_length": 434.4375, + "epoch": 0.041348600508905854, + "grad_norm": 1.6276386976242065, + "kl": 0.0, + "learning_rate": 9.931634888554935e-07, + "loss": 0.6581, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.28877514600753784, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1935999393463135, + "speech_entropy": 2.217723846435547, + "speech_kl": 0.0, + "step": 130, + "text_entropy": 0.9710134267807007, + "text_kl": 0.0, + "total_entropy": 1.9742225408554077 + }, + { + "combined_loss": 0.7769891023635864, + "completion_length": 299.3125, + "epoch": 0.041666666666666664, + "grad_norm": 2.6481971740722656, + "kl": 0.0, + "learning_rate": 9.930389465668132e-07, + "loss": 0.777, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 1.250100016593933, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.589963436126709, + "speech_entropy": 2.583065986633301, + "speech_kl": 0.0, + "step": 131, + "text_entropy": 1.1438889503479004, + "text_kl": 0.0, + "total_entropy": 2.3853845596313477 + }, + { + "combined_loss": 0.6638728380203247, + "completion_length": 330.6875, + "epoch": 0.04198473282442748, + "grad_norm": 1.9016973972320557, + "kl": 0.0, + "learning_rate": 9.929132889281126e-07, + "loss": 0.6639, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 0.0, + "sft_loss": 2.212909460067749, + "speech_entropy": 2.2528860569000244, + "speech_kl": 0.0, + "step": 132, + "text_entropy": 1.0528066158294678, + "text_kl": 0.0, + "total_entropy": 2.020822525024414 + }, + { + "combined_loss": 0.7390019297599792, + "completion_length": 362.9375, + "epoch": 0.0423027989821883, + "grad_norm": 1.9329123497009277, + "kl": 0.0, + "learning_rate": 9.927865162557345e-07, + "loss": 0.739, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4633398056030273, + "speech_entropy": 2.1905789375305176, + "speech_kl": 0.0, + "step": 133, + "text_entropy": 1.1139158010482788, + "text_kl": 0.0, + "total_entropy": 1.9759125709533691 + }, + { + "combined_loss": 0.8078758716583252, + "completion_length": 340.375, + "epoch": 0.04262086513994911, + "grad_norm": 2.5768580436706543, + "kl": 0.0, + "learning_rate": 9.926586288688295e-07, + "loss": 0.8079, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 1.0792241096496582, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.6929194927215576, + "speech_entropy": 2.2723331451416016, + "speech_kl": 0.0, + "step": 134, + "text_entropy": 1.9749469757080078, + "text_kl": 0.0, + "total_entropy": 2.219290256500244 + }, + { + "combined_loss": 0.6676887273788452, + "completion_length": 364.0625, + "epoch": 0.042938931297709926, + "grad_norm": 1.688926339149475, + "kl": 0.0, + "learning_rate": 9.925296270893531e-07, + "loss": 0.6677, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.8538135886192322, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.2256290912628174, + "speech_entropy": 2.1805365085601807, + "speech_kl": 0.0, + "step": 135, + "text_entropy": 0.9880690574645996, + "text_kl": 0.0, + "total_entropy": 1.9571375846862793 + }, + { + "combined_loss": 0.7317559123039246, + "completion_length": 442.375, + "epoch": 0.043256997455470736, + "grad_norm": 2.035095691680908, + "kl": 0.0, + "learning_rate": 9.923995112420679e-07, + "loss": 0.7318, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4391863346099854, + "speech_entropy": 2.255812644958496, + "speech_kl": 0.0, + "step": 136, + "text_entropy": 1.4666073322296143, + "text_kl": 0.0, + "total_entropy": 2.107949733734131 + }, + { + "combined_loss": 0.624023973941803, + "completion_length": 389.125, + "epoch": 0.04357506361323155, + "grad_norm": 2.0001988410949707, + "kl": 0.0, + "learning_rate": 9.922682816545399e-07, + "loss": 0.624, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.9396764636039734, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0800797939300537, + "speech_entropy": 2.1931605339050293, + "speech_kl": 0.0, + "step": 137, + "text_entropy": 1.0324208736419678, + "text_kl": 0.0, + "total_entropy": 2.005070924758911 + }, + { + "combined_loss": 0.6598743796348572, + "completion_length": 486.0625, + "epoch": 0.04389312977099236, + "grad_norm": 2.193516731262207, + "kl": 0.0, + "learning_rate": 9.9213593865714e-07, + "loss": 0.6599, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 1.183112621307373, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1995811462402344, + "speech_entropy": 2.289968252182007, + "speech_kl": 0.0, + "step": 138, + "text_entropy": 1.0836520195007324, + "text_kl": 0.0, + "total_entropy": 2.0700204372406006 + }, + { + "combined_loss": 0.6657881736755371, + "completion_length": 326.8125, + "epoch": 0.04421119592875318, + "grad_norm": 1.7214406728744507, + "kl": 0.0, + "learning_rate": 9.920024825830406e-07, + "loss": 0.6658, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.3146764636039734, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.2192935943603516, + "speech_entropy": 2.253314971923828, + "speech_kl": 0.0, + "step": 139, + "text_entropy": 1.1099263429641724, + "text_kl": 0.0, + "total_entropy": 2.0267200469970703 + }, + { + "combined_loss": 0.6005537509918213, + "completion_length": 402.875, + "epoch": 0.044529262086514, + "grad_norm": 2.0783207416534424, + "kl": 0.0, + "learning_rate": 9.91867913768218e-07, + "loss": 0.6006, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.0774502754211426, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0018458366394043, + "speech_entropy": 2.27315354347229, + "speech_kl": 0.0, + "step": 140, + "text_entropy": 1.074316143989563, + "text_kl": 0.0, + "total_entropy": 2.044334888458252 + }, + { + "combined_loss": 0.6734490394592285, + "completion_length": 382.6875, + "epoch": 0.04484732824427481, + "grad_norm": 1.4923752546310425, + "kl": 0.0, + "learning_rate": 9.917322325514487e-07, + "loss": 0.6734, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.4565354883670807, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 0.0, + "sft_loss": 2.2448298931121826, + "speech_entropy": 2.2860617637634277, + "speech_kl": 0.0, + "step": 141, + "text_entropy": 0.9306349754333496, + "text_kl": 0.0, + "total_entropy": 2.0320706367492676 + }, + { + "combined_loss": 0.6445150375366211, + "completion_length": 401.875, + "epoch": 0.045165394402035625, + "grad_norm": 2.9365267753601074, + "kl": 0.0, + "learning_rate": 9.915954392743102e-07, + "loss": 0.6445, + "num_samples": 1.0, + "reward": 2.25, + "reward_std": 0.9002986550331116, + "rewards/gpt4o_holistic_reward": 2.25, + "rl_loss": -1.862645149230957e-08, + "sft_loss": 2.148383617401123, + "speech_entropy": 2.395698070526123, + "speech_kl": 0.0, + "step": 142, + "text_entropy": 1.1935876607894897, + "text_kl": 0.0, + "total_entropy": 2.16239070892334 + }, + { + "combined_loss": 0.6403375864028931, + "completion_length": 475.5, + "epoch": 0.045483460559796435, + "grad_norm": 1.8146846294403076, + "kl": 0.0, + "learning_rate": 9.914575342811792e-07, + "loss": 0.6403, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 1.096787929534912, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": 0.0, + "sft_loss": 2.134458541870117, + "speech_entropy": 2.208829641342163, + "speech_kl": 0.0, + "step": 143, + "text_entropy": 0.6652034521102905, + "text_kl": 0.0, + "total_entropy": 1.8854793310165405 + }, + { + "combined_loss": 0.6461049914360046, + "completion_length": 507.3125, + "epoch": 0.04580152671755725, + "grad_norm": 1.7192074060440063, + "kl": 0.0, + "learning_rate": 9.913185179192316e-07, + "loss": 0.6461, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.8536533713340759, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1536831855773926, + "speech_entropy": 2.2175731658935547, + "speech_kl": 0.0, + "step": 144, + "text_entropy": 0.8226357698440552, + "text_kl": 0.0, + "total_entropy": 1.9434648752212524 + }, + { + "combined_loss": 0.6483294367790222, + "completion_length": 542.5, + "epoch": 0.04611959287531807, + "grad_norm": 1.73550546169281, + "kl": 0.0, + "learning_rate": 9.911783905384405e-07, + "loss": 0.6483, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.9137751460075378, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": -1.862645149230957e-08, + "sft_loss": 2.161098003387451, + "speech_entropy": 2.1762547492980957, + "speech_kl": 0.0, + "step": 145, + "text_entropy": 1.2476625442504883, + "text_kl": 0.0, + "total_entropy": 2.0006935596466064 + }, + { + "combined_loss": 0.6429945230484009, + "completion_length": 354.8125, + "epoch": 0.04643765903307888, + "grad_norm": 1.9572664499282837, + "kl": 0.0, + "learning_rate": 9.910371524915768e-07, + "loss": 0.643, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.6229909658432007, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.143314838409424, + "speech_entropy": 2.268470287322998, + "speech_kl": 0.0, + "step": 146, + "text_entropy": 0.9877128601074219, + "text_kl": 0.0, + "total_entropy": 2.013819694519043 + }, + { + "combined_loss": 0.725261926651001, + "completion_length": 361.375, + "epoch": 0.046755725190839696, + "grad_norm": 2.1626226902008057, + "kl": 0.0, + "learning_rate": 9.908948041342072e-07, + "loss": 0.7253, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.1963939666748047, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.417539596557617, + "speech_entropy": 2.394669532775879, + "speech_kl": 0.0, + "step": 147, + "text_entropy": 1.3553166389465332, + "text_kl": 0.0, + "total_entropy": 2.1874074935913086 + }, + { + "combined_loss": 0.7557258009910583, + "completion_length": 520.1875, + "epoch": 0.047073791348600506, + "grad_norm": 1.967831015586853, + "kl": 0.0, + "learning_rate": 9.907513458246934e-07, + "loss": 0.7557, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.9733423590660095, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.5190858840942383, + "speech_entropy": 2.2636446952819824, + "speech_kl": 0.0, + "step": 148, + "text_entropy": 1.526688814163208, + "text_kl": 0.0, + "total_entropy": 2.1319243907928467 + }, + { + "combined_loss": 0.6749532222747803, + "completion_length": 338.5, + "epoch": 0.04739185750636132, + "grad_norm": 1.4147047996520996, + "kl": 0.0, + "learning_rate": 9.90606777924191e-07, + "loss": 0.675, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.23945678770542145, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2498438358306885, + "speech_entropy": 2.2806315422058105, + "speech_kl": 0.0, + "step": 149, + "text_entropy": 0.9889430999755859, + "text_kl": 0.0, + "total_entropy": 2.014362096786499 + }, + { + "combined_loss": 0.7438491582870483, + "completion_length": 432.8125, + "epoch": 0.04770992366412214, + "grad_norm": 1.7533581256866455, + "kl": 0.0, + "learning_rate": 9.904611007966504e-07, + "loss": 0.7438, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.479497194290161, + "speech_entropy": 2.2352869510650635, + "speech_kl": 0.0, + "step": 150, + "text_entropy": 1.373764991760254, + "text_kl": 0.0, + "total_entropy": 2.0701396465301514 + }, + { + "combined_loss": 0.6471817493438721, + "completion_length": 439.8125, + "epoch": 0.04802798982188295, + "grad_norm": 1.6037753820419312, + "kl": 0.0, + "learning_rate": 9.90314314808813e-07, + "loss": 0.6472, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1572723388671875, + "speech_entropy": 2.206624746322632, + "speech_kl": 0.0, + "step": 151, + "text_entropy": 1.0710368156433105, + "text_kl": 0.0, + "total_entropy": 1.9989676475524902 + }, + { + "combined_loss": 0.713241696357727, + "completion_length": 396.6875, + "epoch": 0.04834605597964377, + "grad_norm": 1.923511266708374, + "kl": 0.0, + "learning_rate": 9.901664203302124e-07, + "loss": 0.7132, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.8536533713340759, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.377472162246704, + "speech_entropy": 2.1870193481445312, + "speech_kl": 0.0, + "step": 152, + "text_entropy": 1.3757061958312988, + "text_kl": 0.0, + "total_entropy": 2.0416908264160156 + }, + { + "combined_loss": 0.671237587928772, + "completion_length": 504.0, + "epoch": 0.04866412213740458, + "grad_norm": 2.226810932159424, + "kl": 0.0, + "learning_rate": 9.90017417733173e-07, + "loss": 0.6712, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 1.125100016593933, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2374584674835205, + "speech_entropy": 2.2338881492614746, + "speech_kl": 0.0, + "step": 153, + "text_entropy": 1.1020745038986206, + "text_kl": 0.0, + "total_entropy": 2.044203758239746 + }, + { + "combined_loss": 0.6561381816864014, + "completion_length": 230.1875, + "epoch": 0.048982188295165395, + "grad_norm": 2.5940654277801514, + "kl": 0.0, + "learning_rate": 9.898673073928087e-07, + "loss": 0.6561, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.9524502158164978, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 0.0, + "sft_loss": 2.187127113342285, + "speech_entropy": 2.3985471725463867, + "speech_kl": 0.0, + "step": 154, + "text_entropy": 1.0082178115844727, + "text_kl": 0.0, + "total_entropy": 2.182006359100342 + }, + { + "combined_loss": 0.73064124584198, + "completion_length": 516.375, + "epoch": 0.04930025445292621, + "grad_norm": 1.78936767578125, + "kl": 0.0, + "learning_rate": 9.897160896870217e-07, + "loss": 0.7306, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 9.313225746154785e-09, + "sft_loss": 2.4354705810546875, + "speech_entropy": 2.2467494010925293, + "speech_kl": 0.0, + "step": 155, + "text_entropy": 1.0586869716644287, + "text_kl": 0.0, + "total_entropy": 2.0154693126678467 + }, + { + "combined_loss": 0.721272349357605, + "completion_length": 523.8125, + "epoch": 0.04961832061068702, + "grad_norm": 2.0883450508117676, + "kl": 0.0, + "learning_rate": 9.895637649965028e-07, + "loss": 0.7213, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 1.1250998973846436, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4042410850524902, + "speech_entropy": 2.18511700630188, + "speech_kl": 0.0, + "step": 156, + "text_entropy": 1.0734648704528809, + "text_kl": 0.0, + "total_entropy": 1.961314082145691 + }, + { + "combined_loss": 0.6512343883514404, + "completion_length": 284.4375, + "epoch": 0.04993638676844784, + "grad_norm": 2.0043251514434814, + "kl": 0.0, + "learning_rate": 9.89410333704729e-07, + "loss": 0.6512, + "num_samples": 1.0, + "reward": 4.8125, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 4.8125, + "rl_loss": 0.0, + "sft_loss": 2.170781135559082, + "speech_entropy": 2.359740734100342, + "speech_kl": 0.0, + "step": 157, + "text_entropy": 1.0422818660736084, + "text_kl": 0.0, + "total_entropy": 2.0986175537109375 + }, + { + "combined_loss": 0.6856322884559631, + "completion_length": 369.625, + "epoch": 0.05025445292620865, + "grad_norm": 2.0799102783203125, + "kl": 0.0, + "learning_rate": 9.892557961979634e-07, + "loss": 0.6856, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 1.183112621307373, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.2854409217834473, + "speech_entropy": 2.2864654064178467, + "speech_kl": 0.0, + "step": 158, + "text_entropy": 1.1000885963439941, + "text_kl": 0.0, + "total_entropy": 2.0594735145568848 + }, + { + "combined_loss": 0.6959141492843628, + "completion_length": 417.0625, + "epoch": 0.05057251908396947, + "grad_norm": 1.7902482748031616, + "kl": 0.0, + "learning_rate": 9.891001528652542e-07, + "loss": 0.6959, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.7042241096496582, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.319713592529297, + "speech_entropy": 2.2585508823394775, + "speech_kl": 0.0, + "step": 159, + "text_entropy": 1.5893566608428955, + "text_kl": 0.0, + "total_entropy": 2.1322367191314697 + }, + { + "combined_loss": 0.7123146057128906, + "completion_length": 478.875, + "epoch": 0.05089058524173028, + "grad_norm": 1.8146113157272339, + "kl": 0.0, + "learning_rate": 9.889434040984331e-07, + "loss": 0.7123, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.8020563125610352, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 1.30385160446167e-08, + "sft_loss": 2.3743817806243896, + "speech_entropy": 2.307361602783203, + "speech_kl": 0.0, + "step": 160, + "text_entropy": 1.4639774560928345, + "text_kl": 0.0, + "total_entropy": 2.144629955291748 + }, + { + "combined_loss": 0.6143687963485718, + "completion_length": 426.8125, + "epoch": 0.051208651399491094, + "grad_norm": 1.7007486820220947, + "kl": 0.0, + "learning_rate": 9.88785550292115e-07, + "loss": 0.6144, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.1308612823486328, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.047895908355713, + "speech_entropy": 2.206613540649414, + "speech_kl": 0.0, + "step": 161, + "text_entropy": 1.1258774995803833, + "text_kl": 0.0, + "total_entropy": 1.9874699115753174 + }, + { + "combined_loss": 0.7055187821388245, + "completion_length": 364.375, + "epoch": 0.05152671755725191, + "grad_norm": 1.8136767148971558, + "kl": 0.0, + "learning_rate": 9.886265918436966e-07, + "loss": 0.7055, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.8920267820358276, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.351729154586792, + "speech_entropy": 2.2987916469573975, + "speech_kl": 0.0, + "step": 162, + "text_entropy": 1.06236732006073, + "text_kl": 0.0, + "total_entropy": 2.0641303062438965 + }, + { + "combined_loss": 0.61933434009552, + "completion_length": 491.375, + "epoch": 0.05184478371501272, + "grad_norm": 1.4536187648773193, + "kl": 0.0, + "learning_rate": 9.88466529153356e-07, + "loss": 0.6193, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.0644476413726807, + "speech_entropy": 2.1868205070495605, + "speech_kl": 0.0, + "step": 163, + "text_entropy": 0.7075154781341553, + "text_kl": 0.0, + "total_entropy": 1.885782241821289 + }, + { + "combined_loss": 0.6253555417060852, + "completion_length": 537.5625, + "epoch": 0.05216284987277354, + "grad_norm": 1.6687992811203003, + "kl": 0.0, + "learning_rate": 9.883053626240501e-07, + "loss": 0.6254, + "num_samples": 1.0, + "reward": 2.5625, + "reward_std": 0.829224169254303, + "rewards/gpt4o_holistic_reward": 2.5625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.0845184326171875, + "speech_entropy": 2.169955015182495, + "speech_kl": 0.0, + "step": 164, + "text_entropy": 0.7777245044708252, + "text_kl": 0.0, + "total_entropy": 1.884574055671692 + }, + { + "combined_loss": 0.6416522264480591, + "completion_length": 443.125, + "epoch": 0.05248091603053435, + "grad_norm": 2.1316330432891846, + "kl": 0.0, + "learning_rate": 9.88143092661516e-07, + "loss": 0.6417, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.138840675354004, + "speech_entropy": 2.191802501678467, + "speech_kl": 0.0, + "step": 165, + "text_entropy": 0.8017352819442749, + "text_kl": 0.0, + "total_entropy": 1.9171819686889648 + }, + { + "combined_loss": 0.6333335638046265, + "completion_length": 309.625, + "epoch": 0.052798982188295165, + "grad_norm": 1.602042317390442, + "kl": 0.0, + "learning_rate": 9.87979719674268e-07, + "loss": 0.6333, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 0.0, + "sft_loss": 2.111111640930176, + "speech_entropy": 2.29636549949646, + "speech_kl": 0.0, + "step": 166, + "text_entropy": 1.0871713161468506, + "text_kl": 0.0, + "total_entropy": 2.0691018104553223 + }, + { + "combined_loss": 0.753852903842926, + "completion_length": 504.5625, + "epoch": 0.05311704834605598, + "grad_norm": 1.7408075332641602, + "kl": 0.0, + "learning_rate": 9.878152440735971e-07, + "loss": 0.7539, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.7286534309387207, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.512842893600464, + "speech_entropy": 2.2807769775390625, + "speech_kl": 0.0, + "step": 167, + "text_entropy": 1.8450149297714233, + "text_kl": 0.0, + "total_entropy": 2.1998775005340576 + }, + { + "combined_loss": 0.7053524255752563, + "completion_length": 346.375, + "epoch": 0.05343511450381679, + "grad_norm": 1.8231476545333862, + "kl": 0.0, + "learning_rate": 9.876496662735711e-07, + "loss": 0.7054, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3511745929718018, + "speech_entropy": 2.353151559829712, + "speech_kl": 0.0, + "step": 168, + "text_entropy": 1.3969731330871582, + "text_kl": 0.0, + "total_entropy": 2.1840548515319824 + }, + { + "combined_loss": 0.6700061559677124, + "completion_length": 525.125, + "epoch": 0.05375318066157761, + "grad_norm": 1.6231902837753296, + "kl": 0.0, + "learning_rate": 9.874829866910313e-07, + "loss": 0.67, + "num_samples": 1.0, + "reward": 2.8125, + "reward_std": 0.6637752056121826, + "rewards/gpt4o_holistic_reward": 2.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.233353853225708, + "speech_entropy": 2.205387592315674, + "speech_kl": 0.0, + "step": 169, + "text_entropy": 0.9943252801895142, + "text_kl": 0.0, + "total_entropy": 1.9630248546600342 + }, + { + "combined_loss": 0.6313989162445068, + "completion_length": 331.25, + "epoch": 0.05407124681933842, + "grad_norm": 1.6013562679290771, + "kl": 0.0, + "learning_rate": 9.873152057455938e-07, + "loss": 0.6314, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1046628952026367, + "speech_entropy": 2.343129873275757, + "speech_kl": 0.0, + "step": 170, + "text_entropy": 0.897804856300354, + "text_kl": 0.0, + "total_entropy": 2.0444600582122803 + }, + { + "combined_loss": 0.6534004211425781, + "completion_length": 617.3125, + "epoch": 0.05438931297709924, + "grad_norm": 1.932045578956604, + "kl": 0.0, + "learning_rate": 9.871463238596464e-07, + "loss": 0.6534, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.1780014038085938, + "speech_entropy": 2.1928927898406982, + "speech_kl": 0.0, + "step": 171, + "text_entropy": 0.8436852693557739, + "text_kl": 0.0, + "total_entropy": 1.9311038255691528 + }, + { + "combined_loss": 0.6976642608642578, + "completion_length": 512.0625, + "epoch": 0.054707379134860054, + "grad_norm": 1.7346135377883911, + "kl": 0.0, + "learning_rate": 9.869763414583495e-07, + "loss": 0.6977, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.5646764636039734, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.325547218322754, + "speech_entropy": 2.1841320991516113, + "speech_kl": 0.0, + "step": 172, + "text_entropy": 1.026000738143921, + "text_kl": 0.0, + "total_entropy": 1.9609410762786865 + }, + { + "combined_loss": 0.7238253355026245, + "completion_length": 598.0625, + "epoch": 0.055025445292620864, + "grad_norm": 1.9262490272521973, + "kl": 0.0, + "learning_rate": 9.868052589696336e-07, + "loss": 0.7238, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.4928992986679077, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4127509593963623, + "speech_entropy": 2.1995720863342285, + "speech_kl": 0.0, + "step": 173, + "text_entropy": 1.5878872871398926, + "text_kl": 0.0, + "total_entropy": 2.0725741386413574 + }, + { + "combined_loss": 0.6825557947158813, + "completion_length": 435.9375, + "epoch": 0.05534351145038168, + "grad_norm": 1.9291430711746216, + "kl": 0.0, + "learning_rate": 9.866330768241983e-07, + "loss": 0.6826, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.7548449039459229, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2751858234405518, + "speech_entropy": 2.231337070465088, + "speech_kl": 0.0, + "step": 174, + "text_entropy": 0.8735387325286865, + "text_kl": 0.0, + "total_entropy": 1.9939302206039429 + }, + { + "combined_loss": 0.7245073914527893, + "completion_length": 432.0, + "epoch": 0.05566157760814249, + "grad_norm": 2.2042038440704346, + "kl": 0.0, + "learning_rate": 9.864597954555122e-07, + "loss": 0.7245, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.6403796672821045, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.415024757385254, + "speech_entropy": 2.3350276947021484, + "speech_kl": 0.0, + "step": 175, + "text_entropy": 1.081442952156067, + "text_kl": 0.0, + "total_entropy": 2.1579785346984863 + }, + { + "combined_loss": 0.7414748668670654, + "completion_length": 358.9375, + "epoch": 0.05597964376590331, + "grad_norm": 1.7152729034423828, + "kl": 0.0, + "learning_rate": 9.86285415299811e-07, + "loss": 0.7415, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.4715828895568848, + "speech_entropy": 2.273726463317871, + "speech_kl": 0.0, + "step": 176, + "text_entropy": 1.5496562719345093, + "text_kl": 0.0, + "total_entropy": 2.137073040008545 + }, + { + "combined_loss": 0.6711795330047607, + "completion_length": 557.6875, + "epoch": 0.05629770992366412, + "grad_norm": 1.7282371520996094, + "kl": 0.0, + "learning_rate": 9.861099367960964e-07, + "loss": 0.6712, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 1.1298449039459229, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.237265110015869, + "speech_entropy": 2.1773886680603027, + "speech_kl": 0.0, + "step": 177, + "text_entropy": 0.8734384775161743, + "text_kl": 0.0, + "total_entropy": 1.9134694337844849 + }, + { + "combined_loss": 0.7701910138130188, + "completion_length": 591.25, + "epoch": 0.056615776081424936, + "grad_norm": 1.7133898735046387, + "kl": 0.0, + "learning_rate": 9.859333603861353e-07, + "loss": 0.7702, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.5194375514984131, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.56730318069458, + "speech_entropy": 2.189483165740967, + "speech_kl": 0.0, + "step": 178, + "text_entropy": 1.366645097732544, + "text_kl": 0.0, + "total_entropy": 2.039158582687378 + }, + { + "combined_loss": 0.5972847938537598, + "completion_length": 284.375, + "epoch": 0.05693384223918575, + "grad_norm": 2.5166637897491455, + "kl": 0.0, + "learning_rate": 9.857556865144585e-07, + "loss": 0.5973, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 1.7565135955810547, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 1.990949273109436, + "speech_entropy": 2.3300843238830566, + "speech_kl": 0.0, + "step": 179, + "text_entropy": 0.771490752696991, + "text_kl": 0.0, + "total_entropy": 2.0337142944335938 + }, + { + "combined_loss": 0.6814907193183899, + "completion_length": 378.6875, + "epoch": 0.05725190839694656, + "grad_norm": 1.7109546661376953, + "kl": 0.0, + "learning_rate": 9.855769156283603e-07, + "loss": 0.6815, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2716357707977295, + "speech_entropy": 2.227330207824707, + "speech_kl": 0.0, + "step": 180, + "text_entropy": 1.1663092374801636, + "text_kl": 0.0, + "total_entropy": 2.016590118408203 + }, + { + "combined_loss": 0.5935865640640259, + "completion_length": 368.4375, + "epoch": 0.05756997455470738, + "grad_norm": 1.8667573928833008, + "kl": 0.0, + "learning_rate": 9.853970481778956e-07, + "loss": 0.5936, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 1.9786219596862793, + "speech_entropy": 2.1767773628234863, + "speech_kl": 0.0, + "step": 181, + "text_entropy": 0.6917097568511963, + "text_kl": 0.0, + "total_entropy": 1.8641599416732788 + }, + { + "combined_loss": 0.716184675693512, + "completion_length": 458.625, + "epoch": 0.05788804071246819, + "grad_norm": 1.799391508102417, + "kl": 0.0, + "learning_rate": 9.852160846158806e-07, + "loss": 0.7162, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.454224169254303, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.387282133102417, + "speech_entropy": 2.258867025375366, + "speech_kl": 0.0, + "step": 182, + "text_entropy": 1.5640207529067993, + "text_kl": 0.0, + "total_entropy": 2.1286048889160156 + }, + { + "combined_loss": 0.7314043045043945, + "completion_length": 255.3125, + "epoch": 0.05820610687022901, + "grad_norm": 2.2445545196533203, + "kl": 0.0, + "learning_rate": 9.850340253978911e-07, + "loss": 0.7314, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 1.125100016593933, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.438014268875122, + "speech_entropy": 2.723639726638794, + "speech_kl": 0.0, + "step": 183, + "text_entropy": 0.9034126400947571, + "text_kl": 0.0, + "total_entropy": 2.436861991882324 + }, + { + "combined_loss": 0.6868402361869812, + "completion_length": 410.8125, + "epoch": 0.058524173027989825, + "grad_norm": 1.6614896059036255, + "kl": 0.0, + "learning_rate": 9.848508709822607e-07, + "loss": 0.6868, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.2894670963287354, + "speech_entropy": 2.232311964035034, + "speech_kl": 0.0, + "step": 184, + "text_entropy": 1.3743724822998047, + "text_kl": 0.0, + "total_entropy": 2.072235584259033 + }, + { + "combined_loss": 0.6055930852890015, + "completion_length": 391.9375, + "epoch": 0.058842239185750635, + "grad_norm": 1.8519771099090576, + "kl": 0.0, + "learning_rate": 9.846666218300807e-07, + "loss": 0.6056, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.9712333679199219, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.018643379211426, + "speech_entropy": 2.2869997024536133, + "speech_kl": 0.0, + "step": 185, + "text_entropy": 0.664115846157074, + "text_kl": 0.0, + "total_entropy": 1.9729156494140625 + }, + { + "combined_loss": 0.6875672936439514, + "completion_length": 485.3125, + "epoch": 0.05916030534351145, + "grad_norm": 2.1656157970428467, + "kl": 0.0, + "learning_rate": 9.844812784051978e-07, + "loss": 0.6876, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.9129188060760498, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.291890859603882, + "speech_entropy": 2.182652473449707, + "speech_kl": 0.0, + "step": 186, + "text_entropy": 0.9604874849319458, + "text_kl": 0.0, + "total_entropy": 1.9440966844558716 + }, + { + "combined_loss": 0.6967759728431702, + "completion_length": 383.75, + "epoch": 0.05947837150127226, + "grad_norm": 1.8742365837097168, + "kl": 0.0, + "learning_rate": 9.84294841174214e-07, + "loss": 0.6968, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 1.0308762788772583, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.3225862979888916, + "speech_entropy": 2.2666873931884766, + "speech_kl": 0.0, + "step": 187, + "text_entropy": 1.5268785953521729, + "text_kl": 0.0, + "total_entropy": 2.1126818656921387 + }, + { + "combined_loss": 0.6987001895904541, + "completion_length": 427.75, + "epoch": 0.05979643765903308, + "grad_norm": 1.6652710437774658, + "kl": 0.0, + "learning_rate": 9.841073106064852e-07, + "loss": 0.6987, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.5387751460075378, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.329000473022461, + "speech_entropy": 2.187225103378296, + "speech_kl": 0.0, + "step": 188, + "text_entropy": 1.3647143840789795, + "text_kl": 0.0, + "total_entropy": 2.0151681900024414 + }, + { + "combined_loss": 0.6457899808883667, + "completion_length": 442.75, + "epoch": 0.060114503816793896, + "grad_norm": 3.32289719581604, + "kl": 0.0, + "learning_rate": 9.839186871741186e-07, + "loss": 0.6458, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.8536533117294312, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.1526331901550293, + "speech_entropy": 2.330132007598877, + "speech_kl": 0.0, + "step": 189, + "text_entropy": 1.0898842811584473, + "text_kl": 0.0, + "total_entropy": 2.0880885124206543 + }, + { + "combined_loss": 0.7040784955024719, + "completion_length": 552.4375, + "epoch": 0.060432569974554706, + "grad_norm": 1.7851852178573608, + "kl": 0.0, + "learning_rate": 9.83728971351974e-07, + "loss": 0.7041, + "num_samples": 1.0, + "reward": 2.25, + "reward_std": 0.6444376111030579, + "rewards/gpt4o_holistic_reward": 2.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.346928119659424, + "speech_entropy": 2.2672841548919678, + "speech_kl": 0.0, + "step": 190, + "text_entropy": 1.2936159372329712, + "text_kl": 0.0, + "total_entropy": 2.0743160247802734 + }, + { + "combined_loss": 0.6943396329879761, + "completion_length": 251.625, + "epoch": 0.06075063613231552, + "grad_norm": 1.8841829299926758, + "kl": 0.0, + "learning_rate": 9.835381636176605e-07, + "loss": 0.6943, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 1.0983422994613647, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.3144655227661133, + "speech_entropy": 2.4599857330322266, + "speech_kl": 0.0, + "step": 191, + "text_entropy": 1.357433557510376, + "text_kl": 0.0, + "total_entropy": 2.25075101852417 + }, + { + "combined_loss": 0.6926529407501221, + "completion_length": 340.9375, + "epoch": 0.061068702290076333, + "grad_norm": 1.9702279567718506, + "kl": 0.0, + "learning_rate": 9.833462644515366e-07, + "loss": 0.6927, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3088431358337402, + "speech_entropy": 2.233442783355713, + "speech_kl": 0.0, + "step": 192, + "text_entropy": 1.2242528200149536, + "text_kl": 0.0, + "total_entropy": 2.0358939170837402 + }, + { + "combined_loss": 0.6026841998100281, + "completion_length": 382.8125, + "epoch": 0.06138676844783715, + "grad_norm": 1.9015692472457886, + "kl": 0.0, + "learning_rate": 9.83153274336708e-07, + "loss": 0.6027, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.0089473724365234, + "speech_entropy": 2.1782498359680176, + "speech_kl": 0.0, + "step": 193, + "text_entropy": 0.8885223269462585, + "text_kl": 0.0, + "total_entropy": 1.9228941202163696 + }, + { + "combined_loss": 0.8226222991943359, + "completion_length": 641.9375, + "epoch": 0.06170483460559797, + "grad_norm": 4.10353946685791, + "kl": 0.0, + "learning_rate": 9.829591937590273e-07, + "loss": 0.8226, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 1.288775086402893, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.7420742511749268, + "speech_entropy": 2.501152992248535, + "speech_kl": 0.0, + "step": 194, + "text_entropy": 1.702864646911621, + "text_kl": 0.0, + "total_entropy": 2.362520217895508 + }, + { + "combined_loss": 0.6262680292129517, + "completion_length": 441.6875, + "epoch": 0.06202290076335878, + "grad_norm": 1.9847943782806396, + "kl": 0.0, + "learning_rate": 9.82764023207092e-07, + "loss": 0.6263, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 1.1531318426132202, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.087559938430786, + "speech_entropy": 2.365201473236084, + "speech_kl": 0.0, + "step": 195, + "text_entropy": 1.2258142232894897, + "text_kl": 0.0, + "total_entropy": 2.138317584991455 + }, + { + "combined_loss": 0.7117223739624023, + "completion_length": 181.375, + "epoch": 0.062340966921119595, + "grad_norm": 1.908617377281189, + "kl": 0.0, + "learning_rate": 9.825677631722435e-07, + "loss": 0.7117, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.20422415435314178, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.372407913208008, + "speech_entropy": 2.5207109451293945, + "speech_kl": 0.0, + "step": 196, + "text_entropy": 0.8852477073669434, + "text_kl": 0.0, + "total_entropy": 2.2032618522644043 + }, + { + "combined_loss": 0.6184705495834351, + "completion_length": 488.5625, + "epoch": 0.0626590330788804, + "grad_norm": 1.9325580596923828, + "kl": 0.0, + "learning_rate": 9.823704141485666e-07, + "loss": 0.6185, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 0.0, + "sft_loss": 2.061568260192871, + "speech_entropy": 2.212630033493042, + "speech_kl": 0.0, + "step": 197, + "text_entropy": 1.0164635181427002, + "text_kl": 0.0, + "total_entropy": 1.9818271398544312 + }, + { + "combined_loss": 0.6890236139297485, + "completion_length": 352.3125, + "epoch": 0.06297709923664122, + "grad_norm": 2.0383756160736084, + "kl": 0.0, + "learning_rate": 9.82171976632887e-07, + "loss": 0.689, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2967453002929688, + "speech_entropy": 2.279099702835083, + "speech_kl": 0.0, + "step": 198, + "text_entropy": 1.1877176761627197, + "text_kl": 0.0, + "total_entropy": 2.078038215637207 + }, + { + "combined_loss": 0.727849006652832, + "completion_length": 459.625, + "epoch": 0.06329516539440204, + "grad_norm": 4.476938724517822, + "kl": 0.0, + "learning_rate": 9.81972451124771e-07, + "loss": 0.7278, + "num_samples": 1.0, + "reward": 2.5625, + "reward_std": 1.036826252937317, + "rewards/gpt4o_holistic_reward": 2.5625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.4261631965637207, + "speech_entropy": 2.485858201980591, + "speech_kl": 0.0, + "step": 199, + "text_entropy": 0.7858097553253174, + "text_kl": 0.0, + "total_entropy": 2.1179397106170654 + }, + { + "combined_loss": 0.6614863276481628, + "completion_length": 468.25, + "epoch": 0.06361323155216285, + "grad_norm": 1.9755204916000366, + "kl": 0.0, + "learning_rate": 9.817718381265238e-07, + "loss": 0.6615, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.6144567728042603, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2049543857574463, + "speech_entropy": 2.3594043254852295, + "speech_kl": 0.0, + "step": 200, + "text_entropy": 1.0148909091949463, + "text_kl": 0.0, + "total_entropy": 2.100245475769043 + }, + { + "combined_loss": 0.6087247133255005, + "completion_length": 624.625, + "epoch": 0.06393129770992366, + "grad_norm": 1.7758878469467163, + "kl": 0.0, + "learning_rate": 9.815701381431885e-07, + "loss": 0.6087, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.68720543384552, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0290822982788086, + "speech_entropy": 2.1871001720428467, + "speech_kl": 0.0, + "step": 201, + "text_entropy": 1.0028703212738037, + "text_kl": 0.0, + "total_entropy": 1.9624230861663818 + }, + { + "combined_loss": 0.7581361532211304, + "completion_length": 428.5625, + "epoch": 0.06424936386768448, + "grad_norm": 2.9266483783721924, + "kl": 0.0, + "learning_rate": 9.813673516825443e-07, + "loss": 0.7581, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.8376991748809814, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.527120351791382, + "speech_entropy": 2.388430595397949, + "speech_kl": 0.0, + "step": 202, + "text_entropy": 1.3267887830734253, + "text_kl": 0.0, + "total_entropy": 2.1770803928375244 + }, + { + "combined_loss": 0.679368257522583, + "completion_length": 652.9375, + "epoch": 0.0645674300254453, + "grad_norm": 1.8700424432754517, + "kl": 0.0, + "learning_rate": 9.81163479255106e-07, + "loss": 0.6794, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.7171862125396729, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 0.0, + "sft_loss": 2.2645606994628906, + "speech_entropy": 2.1289772987365723, + "speech_kl": 0.0, + "step": 203, + "text_entropy": 1.0961881875991821, + "text_kl": 0.0, + "total_entropy": 1.903523325920105 + }, + { + "combined_loss": 0.7537906169891357, + "completion_length": 480.3125, + "epoch": 0.0648854961832061, + "grad_norm": 2.1203622817993164, + "kl": 0.0, + "learning_rate": 9.809585213741224e-07, + "loss": 0.7538, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.7288135886192322, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.5126352310180664, + "speech_entropy": 2.323160171508789, + "speech_kl": 0.0, + "step": 204, + "text_entropy": 1.5344562530517578, + "text_kl": 0.0, + "total_entropy": 2.1817140579223633 + }, + { + "combined_loss": 0.7854565978050232, + "completion_length": 404.1875, + "epoch": 0.06520356234096693, + "grad_norm": 6.252560138702393, + "kl": 0.0, + "learning_rate": 9.807524785555744e-07, + "loss": 0.7855, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.23945678770542145, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.6181886196136475, + "speech_entropy": 2.286557674407959, + "speech_kl": 0.0, + "step": 205, + "text_entropy": 1.3187462091445923, + "text_kl": 0.0, + "total_entropy": 2.110858917236328 + }, + { + "combined_loss": 0.6578105092048645, + "completion_length": 464.8125, + "epoch": 0.06552162849872774, + "grad_norm": 1.7753384113311768, + "kl": 0.0, + "learning_rate": 9.805453513181746e-07, + "loss": 0.6578, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.4331127107143402, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.192701816558838, + "speech_entropy": 2.21907639503479, + "speech_kl": 0.0, + "step": 206, + "text_entropy": 1.3251252174377441, + "text_kl": 0.0, + "total_entropy": 2.0617318153381348 + }, + { + "combined_loss": 0.7920005321502686, + "completion_length": 623.25, + "epoch": 0.06583969465648855, + "grad_norm": 1.7488422393798828, + "kl": 0.0, + "learning_rate": 9.80337140183366e-07, + "loss": 0.792, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.5194375514984131, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.6400017738342285, + "speech_entropy": 2.1968274116516113, + "speech_kl": 0.0, + "step": 207, + "text_entropy": 1.44332754611969, + "text_kl": 0.0, + "total_entropy": 2.0511021614074707 + }, + { + "combined_loss": 0.6618804931640625, + "completion_length": 418.75, + "epoch": 0.06615776081424936, + "grad_norm": 1.7858362197875977, + "kl": 0.0, + "learning_rate": 9.801278456753193e-07, + "loss": 0.6619, + "num_samples": 1.0, + "reward": 4.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 4.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.206268310546875, + "speech_entropy": 2.241903781890869, + "speech_kl": 0.0, + "step": 208, + "text_entropy": 0.9072421193122864, + "text_kl": 0.0, + "total_entropy": 1.9820117950439453 + }, + { + "combined_loss": 0.6754240989685059, + "completion_length": 353.1875, + "epoch": 0.06647582697201018, + "grad_norm": 2.1248185634613037, + "kl": 0.0, + "learning_rate": 9.799174683209336e-07, + "loss": 0.6754, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 1.2371759414672852, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.251413583755493, + "speech_entropy": 2.267850875854492, + "speech_kl": 0.0, + "step": 209, + "text_entropy": 1.2034542560577393, + "text_kl": 0.0, + "total_entropy": 2.069960594177246 + }, + { + "combined_loss": 0.6498122215270996, + "completion_length": 532.8125, + "epoch": 0.06679389312977099, + "grad_norm": 1.8640666007995605, + "kl": 0.0, + "learning_rate": 9.797060086498332e-07, + "loss": 0.6498, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 1.0983422994613647, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 0.0, + "sft_loss": 2.1660404205322266, + "speech_entropy": 2.172290563583374, + "speech_kl": 0.0, + "step": 210, + "text_entropy": 1.1589062213897705, + "text_kl": 0.0, + "total_entropy": 1.9814873933792114 + }, + { + "combined_loss": 0.6711292862892151, + "completion_length": 657.9375, + "epoch": 0.0671119592875318, + "grad_norm": 1.5342586040496826, + "kl": 0.0, + "learning_rate": 9.79493467194368e-07, + "loss": 0.6711, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2370975017547607, + "speech_entropy": 2.2588064670562744, + "speech_kl": 0.0, + "step": 211, + "text_entropy": 0.9491192102432251, + "text_kl": 0.0, + "total_entropy": 2.00757098197937 + }, + { + "combined_loss": 0.7253336906433105, + "completion_length": 462.125, + "epoch": 0.06743002544529263, + "grad_norm": 1.837734341621399, + "kl": 0.0, + "learning_rate": 9.792798444896107e-07, + "loss": 0.7253, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.6831126809120178, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.417778730392456, + "speech_entropy": 2.2486274242401123, + "speech_kl": 0.0, + "step": 212, + "text_entropy": 1.3472001552581787, + "text_kl": 0.0, + "total_entropy": 2.0750081539154053 + }, + { + "combined_loss": 0.7301596999168396, + "completion_length": 306.8125, + "epoch": 0.06774809160305344, + "grad_norm": 2.5512921810150146, + "kl": 0.0, + "learning_rate": 9.790651410733562e-07, + "loss": 0.7302, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.4331127107143402, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": 0.0, + "sft_loss": 2.433865547180176, + "speech_entropy": 2.319303512573242, + "speech_kl": 0.0, + "step": 213, + "text_entropy": 0.9959409832954407, + "text_kl": 0.0, + "total_entropy": 2.0706756114959717 + }, + { + "combined_loss": 0.7285110950469971, + "completion_length": 426.6875, + "epoch": 0.06806615776081425, + "grad_norm": 2.5493366718292236, + "kl": 0.0, + "learning_rate": 9.788493574861199e-07, + "loss": 0.7285, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.0774502754211426, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.428370237350464, + "speech_entropy": 2.256551742553711, + "speech_kl": 0.0, + "step": 214, + "text_entropy": 0.9938225150108337, + "text_kl": 0.0, + "total_entropy": 2.015477180480957 + }, + { + "combined_loss": 0.7399945855140686, + "completion_length": 371.375, + "epoch": 0.06838422391857506, + "grad_norm": 1.7615910768508911, + "kl": 0.0, + "learning_rate": 9.786324942711371e-07, + "loss": 0.74, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.466648578643799, + "speech_entropy": 2.2756552696228027, + "speech_kl": 0.0, + "step": 215, + "text_entropy": 1.2351531982421875, + "text_kl": 0.0, + "total_entropy": 2.0899648666381836 + }, + { + "combined_loss": 0.6048213243484497, + "completion_length": 541.9375, + "epoch": 0.06870229007633588, + "grad_norm": 1.555175542831421, + "kl": 0.0, + "learning_rate": 9.784145519743606e-07, + "loss": 0.6048, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.4331127107143402, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.016071081161499, + "speech_entropy": 2.1028213500976562, + "speech_kl": 0.0, + "step": 216, + "text_entropy": 0.6853211522102356, + "text_kl": 0.0, + "total_entropy": 1.8233641386032104 + }, + { + "combined_loss": 0.6289892196655273, + "completion_length": 317.1875, + "epoch": 0.06902035623409669, + "grad_norm": 1.876192569732666, + "kl": 0.0, + "learning_rate": 9.781955311444596e-07, + "loss": 0.629, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.096630573272705, + "speech_entropy": 2.2578630447387695, + "speech_kl": 0.0, + "step": 217, + "text_entropy": 1.1112196445465088, + "text_kl": 0.0, + "total_entropy": 2.032492160797119 + }, + { + "combined_loss": 0.6842765808105469, + "completion_length": 477.25, + "epoch": 0.0693384223918575, + "grad_norm": 2.3141274452209473, + "kl": 0.0, + "learning_rate": 9.779754323328192e-07, + "loss": 0.6843, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.269437551498413, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 0.0, + "sft_loss": 2.2809219360351562, + "speech_entropy": 2.205554485321045, + "speech_kl": 0.0, + "step": 218, + "text_entropy": 1.3427551984786987, + "text_kl": 0.0, + "total_entropy": 2.0352625846862793 + }, + { + "combined_loss": 0.6563563346862793, + "completion_length": 386.4375, + "epoch": 0.06965648854961833, + "grad_norm": 1.7859476804733276, + "kl": 0.0, + "learning_rate": 9.777542560935373e-07, + "loss": 0.6564, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.5983423590660095, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.187854290008545, + "speech_entropy": 2.2751975059509277, + "speech_kl": 0.0, + "step": 219, + "text_entropy": 0.9894624352455139, + "text_kl": 0.0, + "total_entropy": 2.025278091430664 + }, + { + "combined_loss": 0.6714828610420227, + "completion_length": 397.6875, + "epoch": 0.06997455470737914, + "grad_norm": 2.1150624752044678, + "kl": 0.0, + "learning_rate": 9.775320029834254e-07, + "loss": 0.6715, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 0.9565354585647583, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.238276243209839, + "speech_entropy": 2.2403571605682373, + "speech_kl": 0.0, + "step": 220, + "text_entropy": 1.4980268478393555, + "text_kl": 0.0, + "total_entropy": 2.0874156951904297 + }, + { + "combined_loss": 0.7247896790504456, + "completion_length": 497.375, + "epoch": 0.07029262086513995, + "grad_norm": 1.8192265033721924, + "kl": 0.0, + "learning_rate": 9.773086735620053e-07, + "loss": 0.7248, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.4435809552669525, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 0.0, + "sft_loss": 2.4159655570983887, + "speech_entropy": 2.3147919178009033, + "speech_kl": 0.0, + "step": 221, + "text_entropy": 1.4983744621276855, + "text_kl": 0.0, + "total_entropy": 2.1691160202026367 + }, + { + "combined_loss": 0.616317868232727, + "completion_length": 594.25, + "epoch": 0.07061068702290077, + "grad_norm": 1.6542634963989258, + "kl": 0.0, + "learning_rate": 9.770842683915082e-07, + "loss": 0.6163, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.47356173396110535, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0543928146362305, + "speech_entropy": 2.261496067047119, + "speech_kl": 0.0, + "step": 222, + "text_entropy": 0.8812452554702759, + "text_kl": 0.0, + "total_entropy": 1.996401309967041 + }, + { + "combined_loss": 0.8017942905426025, + "completion_length": 413.75, + "epoch": 0.07092875318066158, + "grad_norm": 2.075211763381958, + "kl": 0.0, + "learning_rate": 9.768587880368742e-07, + "loss": 0.8018, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 0.0, + "sft_loss": 2.672647714614868, + "speech_entropy": 2.2792246341705322, + "speech_kl": 0.0, + "step": 223, + "text_entropy": 1.4464610815048218, + "text_kl": 0.0, + "total_entropy": 2.1342384815216064 + }, + { + "combined_loss": 0.733474612236023, + "completion_length": 342.875, + "epoch": 0.07124681933842239, + "grad_norm": 1.4014809131622314, + "kl": 0.0, + "learning_rate": 9.766322330657497e-07, + "loss": 0.7335, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": 0.0, + "sft_loss": 2.444915294647217, + "speech_entropy": 2.490835428237915, + "speech_kl": 0.0, + "step": 224, + "text_entropy": 1.531280755996704, + "text_kl": 0.0, + "total_entropy": 2.3087196350097656 + }, + { + "combined_loss": 0.7604819536209106, + "completion_length": 377.125, + "epoch": 0.0715648854961832, + "grad_norm": 2.309183120727539, + "kl": 0.0, + "learning_rate": 9.764046040484868e-07, + "loss": 0.7605, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 0.0, + "sft_loss": 2.534939765930176, + "speech_entropy": 2.4606456756591797, + "speech_kl": 0.0, + "step": 225, + "text_entropy": 0.7890236377716064, + "text_kl": 0.0, + "total_entropy": 2.0891623497009277 + }, + { + "combined_loss": 0.6990571022033691, + "completion_length": 528.3125, + "epoch": 0.07188295165394402, + "grad_norm": 1.9987963438034058, + "kl": 0.0, + "learning_rate": 9.76175901558141e-07, + "loss": 0.6991, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.6770563125610352, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 0.0, + "sft_loss": 2.3301901817321777, + "speech_entropy": 2.320341110229492, + "speech_kl": 0.0, + "step": 226, + "text_entropy": 1.3003857135772705, + "text_kl": 0.0, + "total_entropy": 2.1371474266052246 + }, + { + "combined_loss": 0.7233133316040039, + "completion_length": 355.9375, + "epoch": 0.07220101781170483, + "grad_norm": 1.6554374694824219, + "kl": 0.0, + "learning_rate": 9.759461261704705e-07, + "loss": 0.7233, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.4110443592071533, + "speech_entropy": 2.268995761871338, + "speech_kl": 0.0, + "step": 227, + "text_entropy": 1.413590431213379, + "text_kl": 0.0, + "total_entropy": 2.0987539291381836 + }, + { + "combined_loss": 0.7177764773368835, + "completion_length": 507.4375, + "epoch": 0.07251908396946564, + "grad_norm": 1.8407859802246094, + "kl": 0.0, + "learning_rate": 9.757152784639347e-07, + "loss": 0.7178, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.7622368335723877, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3925881385803223, + "speech_entropy": 2.314845561981201, + "speech_kl": 0.0, + "step": 228, + "text_entropy": 1.0042717456817627, + "text_kl": 0.0, + "total_entropy": 2.067537784576416 + }, + { + "combined_loss": 0.7055172324180603, + "completion_length": 615.5625, + "epoch": 0.07283715012722647, + "grad_norm": 1.6229463815689087, + "kl": 0.0, + "learning_rate": 9.754833590196926e-07, + "loss": 0.7055, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3517239093780518, + "speech_entropy": 2.1739797592163086, + "speech_kl": 0.0, + "step": 229, + "text_entropy": 1.4586985111236572, + "text_kl": 0.0, + "total_entropy": 2.04360294342041 + }, + { + "combined_loss": 0.7200095057487488, + "completion_length": 461.0, + "epoch": 0.07315521628498728, + "grad_norm": 2.110440254211426, + "kl": 0.0, + "learning_rate": 9.752503684216007e-07, + "loss": 0.72, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 0.0, + "sft_loss": 2.400031566619873, + "speech_entropy": 2.22104811668396, + "speech_kl": 0.0, + "step": 230, + "text_entropy": 1.356811761856079, + "text_kl": 0.0, + "total_entropy": 2.0665719509124756 + }, + { + "combined_loss": 0.6246628761291504, + "completion_length": 591.875, + "epoch": 0.07347328244274809, + "grad_norm": 2.518479585647583, + "kl": 0.0, + "learning_rate": 9.75016307256213e-07, + "loss": 0.6247, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.9550646543502808, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.082209587097168, + "speech_entropy": 2.3805904388427734, + "speech_kl": 0.0, + "step": 231, + "text_entropy": 0.864353597164154, + "text_kl": 0.0, + "total_entropy": 2.0375137329101562 + }, + { + "combined_loss": 0.6500042676925659, + "completion_length": 494.125, + "epoch": 0.0737913486005089, + "grad_norm": 1.7023345232009888, + "kl": 0.0, + "learning_rate": 9.74781176112778e-07, + "loss": 0.65, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.7674887180328369, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1666808128356934, + "speech_entropy": 2.172966957092285, + "speech_kl": 0.0, + "step": 232, + "text_entropy": 1.2742629051208496, + "text_kl": 0.0, + "total_entropy": 2.008732318878174 + }, + { + "combined_loss": 0.6174441576004028, + "completion_length": 380.1875, + "epoch": 0.07410941475826972, + "grad_norm": 1.9199665784835815, + "kl": 0.0, + "learning_rate": 9.74544975583238e-07, + "loss": 0.6174, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 1.6082265377044678, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.0581471920013428, + "speech_entropy": 2.2164857387542725, + "speech_kl": 0.0, + "step": 233, + "text_entropy": 0.900662899017334, + "text_kl": 0.0, + "total_entropy": 1.9652466773986816 + }, + { + "combined_loss": 0.7314097285270691, + "completion_length": 307.0625, + "epoch": 0.07442748091603053, + "grad_norm": 2.08520245552063, + "kl": 0.0, + "learning_rate": 9.743077062622278e-07, + "loss": 0.7314, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.8014019727706909, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.438032388687134, + "speech_entropy": 2.3524227142333984, + "speech_kl": 0.0, + "step": 234, + "text_entropy": 0.8163133859634399, + "text_kl": 0.0, + "total_entropy": 2.0464541912078857 + }, + { + "combined_loss": 0.6391720175743103, + "completion_length": 435.5625, + "epoch": 0.07474554707379134, + "grad_norm": 1.9668989181518555, + "kl": 0.0, + "learning_rate": 9.740693687470722e-07, + "loss": 0.6392, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.7694376111030579, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.130573272705078, + "speech_entropy": 2.305225372314453, + "speech_kl": 0.0, + "step": 235, + "text_entropy": 0.7647875547409058, + "text_kl": 0.0, + "total_entropy": 2.0108447074890137 + }, + { + "combined_loss": 0.630230724811554, + "completion_length": 401.0625, + "epoch": 0.07506361323155217, + "grad_norm": 2.1346487998962402, + "kl": 0.0, + "learning_rate": 9.738299636377862e-07, + "loss": 0.6302, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.8859703540802002, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 9.313225746154785e-09, + "sft_loss": 2.10076904296875, + "speech_entropy": 2.27579402923584, + "speech_kl": 0.0, + "step": 236, + "text_entropy": 0.8570226430892944, + "text_kl": 0.0, + "total_entropy": 2.0057263374328613 + }, + { + "combined_loss": 0.6507729291915894, + "completion_length": 424.8125, + "epoch": 0.07538167938931298, + "grad_norm": 1.7010408639907837, + "kl": 0.0, + "learning_rate": 9.735894915370712e-07, + "loss": 0.6508, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.989456832408905, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.169243097305298, + "speech_entropy": 2.329667329788208, + "speech_kl": 0.0, + "step": 237, + "text_entropy": 1.0157864093780518, + "text_kl": 0.0, + "total_entropy": 2.0907399654388428 + }, + { + "combined_loss": 0.6825940608978271, + "completion_length": 556.8125, + "epoch": 0.07569974554707379, + "grad_norm": 2.043261766433716, + "kl": 0.0, + "learning_rate": 9.73347953050316e-07, + "loss": 0.6826, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.7705972194671631, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.275313377380371, + "speech_entropy": 2.336167812347412, + "speech_kl": 0.0, + "step": 238, + "text_entropy": 1.4063137769699097, + "text_kl": 0.0, + "total_entropy": 2.168437957763672 + }, + { + "combined_loss": 0.6205179691314697, + "completion_length": 378.75, + "epoch": 0.07601781170483461, + "grad_norm": 2.229231595993042, + "kl": 0.0, + "learning_rate": 9.731053487855932e-07, + "loss": 0.6205, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.8837943077087402, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.0683932304382324, + "speech_entropy": 2.4868783950805664, + "speech_kl": 0.0, + "step": 239, + "text_entropy": 1.320270299911499, + "text_kl": 0.0, + "total_entropy": 2.2788162231445312 + }, + { + "combined_loss": 0.7069031596183777, + "completion_length": 469.5625, + "epoch": 0.07633587786259542, + "grad_norm": 2.225677728652954, + "kl": 0.0, + "learning_rate": 9.728616793536587e-07, + "loss": 0.7069, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 1.3122053146362305, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.3563437461853027, + "speech_entropy": 2.316706657409668, + "speech_kl": 0.0, + "step": 240, + "text_entropy": 1.6138790845870972, + "text_kl": 0.0, + "total_entropy": 2.198500156402588 + }, + { + "combined_loss": 0.6990001797676086, + "completion_length": 413.25, + "epoch": 0.07665394402035623, + "grad_norm": 1.7011535167694092, + "kl": 0.0, + "learning_rate": 9.726169453679502e-07, + "loss": 0.699, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.6764019727706909, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 2.60770320892334e-08, + "sft_loss": 2.330000400543213, + "speech_entropy": 2.288140296936035, + "speech_kl": 0.0, + "step": 241, + "text_entropy": 1.567537546157837, + "text_kl": 0.0, + "total_entropy": 2.162627696990967 + }, + { + "combined_loss": 0.7738334536552429, + "completion_length": 296.8125, + "epoch": 0.07697201017811704, + "grad_norm": 2.249540328979492, + "kl": 0.0, + "learning_rate": 9.72371147444585e-07, + "loss": 0.7738, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 1.8020561933517456, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.579444646835327, + "speech_entropy": 2.2558746337890625, + "speech_kl": 0.0, + "step": 242, + "text_entropy": 1.140458583831787, + "text_kl": 0.0, + "total_entropy": 2.0467886924743652 + }, + { + "combined_loss": 0.6218153834342957, + "completion_length": 417.1875, + "epoch": 0.07729007633587787, + "grad_norm": 1.888468861579895, + "kl": 0.0, + "learning_rate": 9.721242862023591e-07, + "loss": 0.6218, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": 0.0, + "sft_loss": 2.0727176666259766, + "speech_entropy": 2.2084412574768066, + "speech_kl": 0.0, + "step": 243, + "text_entropy": 0.5448473691940308, + "text_kl": 0.0, + "total_entropy": 1.8867276906967163 + }, + { + "combined_loss": 0.6377319097518921, + "completion_length": 424.375, + "epoch": 0.07760814249363868, + "grad_norm": 1.7285096645355225, + "kl": 0.0, + "learning_rate": 9.718763622627458e-07, + "loss": 0.6377, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.7286534309387207, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1257729530334473, + "speech_entropy": 2.167391538619995, + "speech_kl": 0.0, + "step": 244, + "text_entropy": 1.0646146535873413, + "text_kl": 0.0, + "total_entropy": 1.956799030303955 + }, + { + "combined_loss": 0.6482560634613037, + "completion_length": 407.4375, + "epoch": 0.07792620865139949, + "grad_norm": 1.660476803779602, + "kl": 0.0, + "learning_rate": 9.716273762498929e-07, + "loss": 0.6483, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.42705631256103516, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.160853385925293, + "speech_entropy": 2.198376417160034, + "speech_kl": 0.0, + "step": 245, + "text_entropy": 1.3145016431808472, + "text_kl": 0.0, + "total_entropy": 2.031963348388672 + }, + { + "combined_loss": 0.6421551704406738, + "completion_length": 586.875, + "epoch": 0.07824427480916031, + "grad_norm": 1.4380934238433838, + "kl": 0.0, + "learning_rate": 9.71377328790622e-07, + "loss": 0.6422, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.5774502754211426, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 2.140516996383667, + "speech_entropy": 2.2290802001953125, + "speech_kl": 0.0, + "step": 246, + "text_entropy": 0.9156054258346558, + "text_kl": 0.0, + "total_entropy": 1.9700312614440918 + }, + { + "combined_loss": 0.7020258903503418, + "completion_length": 419.125, + "epoch": 0.07856234096692112, + "grad_norm": 3.0993080139160156, + "kl": 0.0, + "learning_rate": 9.711262205144285e-07, + "loss": 0.702, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.340085983276367, + "speech_entropy": 2.3030998706817627, + "speech_kl": 0.0, + "step": 247, + "text_entropy": 1.1675899028778076, + "text_kl": 0.0, + "total_entropy": 2.082179307937622 + }, + { + "combined_loss": 0.686911940574646, + "completion_length": 387.25, + "epoch": 0.07888040712468193, + "grad_norm": 1.6759203672409058, + "kl": 0.0, + "learning_rate": 9.70874052053476e-07, + "loss": 0.6869, + "num_samples": 1.0, + "reward": 4.9375, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.9375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2897064685821533, + "speech_entropy": 2.1984457969665527, + "speech_kl": 0.0, + "step": 248, + "text_entropy": 1.1239951848983765, + "text_kl": 0.0, + "total_entropy": 1.9890978336334229 + }, + { + "combined_loss": 0.6796283721923828, + "completion_length": 495.375, + "epoch": 0.07919847328244274, + "grad_norm": 1.5949386358261108, + "kl": 0.0, + "learning_rate": 9.706208240425988e-07, + "loss": 0.6796, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.265427589416504, + "speech_entropy": 2.263617992401123, + "speech_kl": 0.0, + "step": 249, + "text_entropy": 1.3017940521240234, + "text_kl": 0.0, + "total_entropy": 2.0769906044006348 + }, + { + "combined_loss": 0.6568690538406372, + "completion_length": 432.3125, + "epoch": 0.07951653944020357, + "grad_norm": 1.5095460414886475, + "kl": 0.0, + "learning_rate": 9.70366537119298e-07, + "loss": 0.6569, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.3536534011363983, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.189563274383545, + "speech_entropy": 2.2465767860412598, + "speech_kl": 0.0, + "step": 250, + "text_entropy": 0.8271604776382446, + "text_kl": 0.0, + "total_entropy": 1.9808815717697144 + }, + { + "combined_loss": 0.691437840461731, + "completion_length": 455.625, + "epoch": 0.07983460559796438, + "grad_norm": 2.107602119445801, + "kl": 0.0, + "learning_rate": 9.701111919237408e-07, + "loss": 0.6914, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.228813648223877, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.304792881011963, + "speech_entropy": 2.464122772216797, + "speech_kl": 0.0, + "step": 251, + "text_entropy": 1.3184483051300049, + "text_kl": 0.0, + "total_entropy": 2.2695252895355225 + }, + { + "combined_loss": 0.6778163909912109, + "completion_length": 400.5625, + "epoch": 0.08015267175572519, + "grad_norm": 1.9228991270065308, + "kl": 0.0, + "learning_rate": 9.698547890987584e-07, + "loss": 0.6778, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.259387969970703, + "speech_entropy": 2.1710927486419678, + "speech_kl": 0.0, + "step": 252, + "text_entropy": 1.0632684230804443, + "text_kl": 0.0, + "total_entropy": 1.970045566558838 + }, + { + "combined_loss": 0.7383031845092773, + "completion_length": 435.9375, + "epoch": 0.08047073791348601, + "grad_norm": 1.9669862985610962, + "kl": 0.0, + "learning_rate": 9.695973292898442e-07, + "loss": 0.7383, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 1.0876991748809814, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.461010456085205, + "speech_entropy": 2.2386293411254883, + "speech_kl": 0.0, + "step": 253, + "text_entropy": 1.4104411602020264, + "text_kl": 0.0, + "total_entropy": 2.0837998390197754 + }, + { + "combined_loss": 0.6110467910766602, + "completion_length": 388.125, + "epoch": 0.08078880407124682, + "grad_norm": 1.7974542379379272, + "kl": 0.0, + "learning_rate": 9.693388131451536e-07, + "loss": 0.611, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 1.3661253452301025, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.036822557449341, + "speech_entropy": 2.301140785217285, + "speech_kl": 0.0, + "step": 254, + "text_entropy": 0.9246535897254944, + "text_kl": 0.0, + "total_entropy": 2.01608943939209 + }, + { + "combined_loss": 0.6638992428779602, + "completion_length": 567.8125, + "epoch": 0.08110687022900763, + "grad_norm": 1.5331536531448364, + "kl": 0.0, + "learning_rate": 9.690792413155002e-07, + "loss": 0.6639, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.5646764636039734, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 0.0, + "sft_loss": 2.2129974365234375, + "speech_entropy": 2.180696964263916, + "speech_kl": 0.0, + "step": 255, + "text_entropy": 1.1481618881225586, + "text_kl": 0.0, + "total_entropy": 1.9820756912231445 + }, + { + "combined_loss": 0.6851294040679932, + "completion_length": 547.0, + "epoch": 0.08142493638676845, + "grad_norm": 1.8251949548721313, + "kl": 0.0, + "learning_rate": 9.688186144543558e-07, + "loss": 0.6851, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8081126809120178, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.2837648391723633, + "speech_entropy": 2.2099359035491943, + "speech_kl": 0.0, + "step": 256, + "text_entropy": 1.4614191055297852, + "text_kl": 0.0, + "total_entropy": 2.0672175884246826 + }, + { + "combined_loss": 0.7618996500968933, + "completion_length": 372.0625, + "epoch": 0.08174300254452926, + "grad_norm": 2.1775362491607666, + "kl": 0.0, + "learning_rate": 9.685569332178487e-07, + "loss": 0.7619, + "num_samples": 1.0, + "reward": 2.8125, + "reward_std": 0.5194376111030579, + "rewards/gpt4o_holistic_reward": 2.8125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.539665460586548, + "speech_entropy": 2.2694239616394043, + "speech_kl": 0.0, + "step": 257, + "text_entropy": 0.980126142501831, + "text_kl": 0.0, + "total_entropy": 2.0079169273376465 + }, + { + "combined_loss": 0.6269736289978027, + "completion_length": 546.5, + "epoch": 0.08206106870229007, + "grad_norm": 1.405476689338684, + "kl": 0.0, + "learning_rate": 9.682941982647605e-07, + "loss": 0.627, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.089912176132202, + "speech_entropy": 2.1415200233459473, + "speech_kl": 0.0, + "step": 258, + "text_entropy": 1.16708242893219, + "text_kl": 0.0, + "total_entropy": 1.9402116537094116 + }, + { + "combined_loss": 0.6667758226394653, + "completion_length": 376.0625, + "epoch": 0.08237913486005088, + "grad_norm": 1.9283385276794434, + "kl": 0.0, + "learning_rate": 9.680304102565265e-07, + "loss": 0.6668, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.7394567728042603, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.222586154937744, + "speech_entropy": 2.2938990592956543, + "speech_kl": 0.0, + "step": 259, + "text_entropy": 0.9376010894775391, + "text_kl": 0.0, + "total_entropy": 2.0546226501464844 + }, + { + "combined_loss": 0.6579493880271912, + "completion_length": 224.5, + "epoch": 0.08269720101781171, + "grad_norm": 2.025418519973755, + "kl": 0.0, + "learning_rate": 9.677655698572325e-07, + "loss": 0.6579, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.3228486180305481, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.193164587020874, + "speech_entropy": 2.3803346157073975, + "speech_kl": 0.0, + "step": 260, + "text_entropy": 1.0623462200164795, + "text_kl": 0.0, + "total_entropy": 2.11057186126709 + }, + { + "combined_loss": 0.7478048205375671, + "completion_length": 362.0625, + "epoch": 0.08301526717557252, + "grad_norm": 3.3174071311950684, + "kl": 0.0, + "learning_rate": 9.674996777336142e-07, + "loss": 0.7478, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.3146764636039734, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 0.0, + "sft_loss": 2.492682456970215, + "speech_entropy": 2.2766928672790527, + "speech_kl": 0.0, + "step": 261, + "text_entropy": 1.202368140220642, + "text_kl": 0.0, + "total_entropy": 2.069495677947998 + }, + { + "combined_loss": 0.6093226075172424, + "completion_length": 404.625, + "epoch": 0.08333333333333333, + "grad_norm": 2.024925470352173, + "kl": 0.0, + "learning_rate": 9.672327345550543e-07, + "loss": 0.6093, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.1161253452301025, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.0310750007629395, + "speech_entropy": 2.2833635807037354, + "speech_kl": 0.0, + "step": 262, + "text_entropy": 0.996048629283905, + "text_kl": 0.0, + "total_entropy": 2.037139654159546 + }, + { + "combined_loss": 0.6289666891098022, + "completion_length": 487.0625, + "epoch": 0.08365139949109415, + "grad_norm": 2.06803297996521, + "kl": 0.0, + "learning_rate": 9.669647409935822e-07, + "loss": 0.629, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 1.2440414428710938, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.096555709838867, + "speech_entropy": 2.283952236175537, + "speech_kl": 0.0, + "step": 263, + "text_entropy": 1.3200794458389282, + "text_kl": 0.0, + "total_entropy": 2.095597267150879 + }, + { + "combined_loss": 0.6302919387817383, + "completion_length": 503.0625, + "epoch": 0.08396946564885496, + "grad_norm": 1.8565832376480103, + "kl": 0.0, + "learning_rate": 9.666956977238711e-07, + "loss": 0.6303, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.6144567728042603, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 0.0, + "sft_loss": 2.100973129272461, + "speech_entropy": 2.1800198554992676, + "speech_kl": 0.0, + "step": 264, + "text_entropy": 0.9526975154876709, + "text_kl": 0.0, + "total_entropy": 1.956667184829712 + }, + { + "combined_loss": 0.7350342273712158, + "completion_length": 456.9375, + "epoch": 0.08428753180661577, + "grad_norm": 1.7183167934417725, + "kl": 0.0, + "learning_rate": 9.664256054232374e-07, + "loss": 0.735, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4501140117645264, + "speech_entropy": 2.2488903999328613, + "speech_kl": 0.0, + "step": 265, + "text_entropy": 1.4711058139801025, + "text_kl": 0.0, + "total_entropy": 2.073747396469116 + }, + { + "combined_loss": 0.6489673256874084, + "completion_length": 426.8125, + "epoch": 0.0846055979643766, + "grad_norm": 1.6044869422912598, + "kl": 0.0, + "learning_rate": 9.66154464771638e-07, + "loss": 0.649, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 0.0, + "sft_loss": 2.163224220275879, + "speech_entropy": 2.525622606277466, + "speech_kl": 0.0, + "step": 266, + "text_entropy": 1.200254201889038, + "text_kl": 0.0, + "total_entropy": 2.274986505508423 + }, + { + "combined_loss": 0.7551906108856201, + "completion_length": 445.6875, + "epoch": 0.08492366412213741, + "grad_norm": 1.658619999885559, + "kl": 0.0, + "learning_rate": 9.658822764516693e-07, + "loss": 0.7552, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.8081126809120178, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.5173017978668213, + "speech_entropy": 2.1905436515808105, + "speech_kl": 0.0, + "step": 267, + "text_entropy": 1.1561870574951172, + "text_kl": 0.0, + "total_entropy": 1.9924242496490479 + }, + { + "combined_loss": 0.6897430419921875, + "completion_length": 678.3125, + "epoch": 0.08524173027989822, + "grad_norm": 1.7225435972213745, + "kl": 0.0, + "learning_rate": 9.65609041148565e-07, + "loss": 0.6897, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.9435809850692749, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2991433143615723, + "speech_entropy": 2.1389427185058594, + "speech_kl": 0.0, + "step": 268, + "text_entropy": 1.3221979141235352, + "text_kl": 0.0, + "total_entropy": 1.9796137809753418 + }, + { + "combined_loss": 0.5930161476135254, + "completion_length": 548.0, + "epoch": 0.08555979643765903, + "grad_norm": 1.4510716199874878, + "kl": 0.0, + "learning_rate": 9.653347595501946e-07, + "loss": 0.593, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 0.0, + "sft_loss": 1.9767203330993652, + "speech_entropy": 2.1469504833221436, + "speech_kl": 0.0, + "step": 269, + "text_entropy": 0.8947275876998901, + "text_kl": 0.0, + "total_entropy": 1.9000020027160645 + }, + { + "combined_loss": 0.652092456817627, + "completion_length": 484.625, + "epoch": 0.08587786259541985, + "grad_norm": 1.9512776136398315, + "kl": 0.0, + "learning_rate": 9.650594323470617e-07, + "loss": 0.6521, + "num_samples": 1.0, + "reward": 2.8125, + "reward_std": 0.8808612823486328, + "rewards/gpt4o_holistic_reward": 2.8125, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.1736412048339844, + "speech_entropy": 2.1839957237243652, + "speech_kl": 0.0, + "step": 270, + "text_entropy": 1.3368381261825562, + "text_kl": 0.0, + "total_entropy": 2.01955246925354 + }, + { + "combined_loss": 0.6888371706008911, + "completion_length": 471.3125, + "epoch": 0.08619592875318066, + "grad_norm": 1.8091537952423096, + "kl": 0.0, + "learning_rate": 9.64783060232302e-07, + "loss": 0.6888, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.183112621307373, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.296123743057251, + "speech_entropy": 2.1703720092773438, + "speech_kl": 0.0, + "step": 271, + "text_entropy": 1.033645749092102, + "text_kl": 0.0, + "total_entropy": 1.927350401878357 + }, + { + "combined_loss": 0.6634478569030762, + "completion_length": 618.1875, + "epoch": 0.08651399491094147, + "grad_norm": 1.6202704906463623, + "kl": 0.0, + "learning_rate": 9.645056439016825e-07, + "loss": 0.6634, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 1.5379188060760498, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2114930152893066, + "speech_entropy": 2.063384532928467, + "speech_kl": 0.0, + "step": 272, + "text_entropy": 0.6551350951194763, + "text_kl": 0.0, + "total_entropy": 1.7709801197052002 + }, + { + "combined_loss": 0.6863433718681335, + "completion_length": 390.0625, + "epoch": 0.0868320610687023, + "grad_norm": 1.5252995491027832, + "kl": 0.0, + "learning_rate": 9.64227184053598e-07, + "loss": 0.6863, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.4788135886192322, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 0.0, + "sft_loss": 2.287811040878296, + "speech_entropy": 2.296483039855957, + "speech_kl": 0.0, + "step": 273, + "text_entropy": 1.0703375339508057, + "text_kl": 0.0, + "total_entropy": 2.0609984397888184 + }, + { + "combined_loss": 0.7279566526412964, + "completion_length": 336.625, + "epoch": 0.0871501272264631, + "grad_norm": 2.0608808994293213, + "kl": 0.0, + "learning_rate": 9.639476813890713e-07, + "loss": 0.728, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 0.0, + "sft_loss": 2.4265217781066895, + "speech_entropy": 2.2220163345336914, + "speech_kl": 0.0, + "step": 274, + "text_entropy": 1.446899175643921, + "text_kl": 0.0, + "total_entropy": 2.081493377685547 + }, + { + "combined_loss": 0.7374498248100281, + "completion_length": 533.1875, + "epoch": 0.08746819338422392, + "grad_norm": 1.9891971349716187, + "kl": 0.0, + "learning_rate": 9.636671366117494e-07, + "loss": 0.7374, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.4581661224365234, + "speech_entropy": 2.133476495742798, + "speech_kl": 0.0, + "step": 275, + "text_entropy": 1.2225779294967651, + "text_kl": 0.0, + "total_entropy": 1.9578973054885864 + }, + { + "combined_loss": 0.6486621499061584, + "completion_length": 366.875, + "epoch": 0.08778625954198473, + "grad_norm": 3.3881635665893555, + "kl": 0.0, + "learning_rate": 9.63385550427904e-07, + "loss": 0.6487, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.9331126809120178, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1622071266174316, + "speech_entropy": 2.344599723815918, + "speech_kl": 0.0, + "step": 276, + "text_entropy": 1.1587448120117188, + "text_kl": 0.0, + "total_entropy": 2.122767448425293 + }, + { + "combined_loss": 0.6684514284133911, + "completion_length": 424.9375, + "epoch": 0.08810432569974555, + "grad_norm": 1.9074612855911255, + "kl": 0.0, + "learning_rate": 9.631029235464278e-07, + "loss": 0.6685, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.5774502158164978, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2281713485717773, + "speech_entropy": 2.4562625885009766, + "speech_kl": 0.0, + "step": 277, + "text_entropy": 1.2592930793762207, + "text_kl": 0.0, + "total_entropy": 2.2500598430633545 + }, + { + "combined_loss": 0.671592116355896, + "completion_length": 438.0, + "epoch": 0.08842239185750636, + "grad_norm": 1.861733078956604, + "kl": 0.0, + "learning_rate": 9.628192566788335e-07, + "loss": 0.6716, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.6404881477355957, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.238640308380127, + "speech_entropy": 2.177516222000122, + "speech_kl": 0.0, + "step": 278, + "text_entropy": 1.346944808959961, + "text_kl": 0.0, + "total_entropy": 2.020207643508911 + }, + { + "combined_loss": 0.638625979423523, + "completion_length": 528.4375, + "epoch": 0.08874045801526717, + "grad_norm": 1.7982735633850098, + "kl": 0.0, + "learning_rate": 9.625345505392522e-07, + "loss": 0.6386, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.0474694967269897, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.128753185272217, + "speech_entropy": 2.180100679397583, + "speech_kl": 0.0, + "step": 279, + "text_entropy": 0.8083711862564087, + "text_kl": 0.0, + "total_entropy": 1.9105725288391113 + }, + { + "combined_loss": 0.7118488550186157, + "completion_length": 514.25, + "epoch": 0.089058524173028, + "grad_norm": 1.5744655132293701, + "kl": 0.0, + "learning_rate": 9.622488058444313e-07, + "loss": 0.7118, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.8228486180305481, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 5.587935447692871e-09, + "sft_loss": 2.3728294372558594, + "speech_entropy": 2.1545000076293945, + "speech_kl": 0.0, + "step": 280, + "text_entropy": 1.114195466041565, + "text_kl": 0.0, + "total_entropy": 1.9586902856826782 + }, + { + "combined_loss": 0.6681440472602844, + "completion_length": 377.4375, + "epoch": 0.0893765903307888, + "grad_norm": 1.993377447128296, + "kl": 0.0, + "learning_rate": 9.619620233137326e-07, + "loss": 0.6681, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.6724694967269897, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.227146863937378, + "speech_entropy": 2.207726240158081, + "speech_kl": 0.0, + "step": 281, + "text_entropy": 1.0726655721664429, + "text_kl": 0.0, + "total_entropy": 1.990494728088379 + }, + { + "combined_loss": 0.6539067029953003, + "completion_length": 420.5625, + "epoch": 0.08969465648854962, + "grad_norm": 1.9349361658096313, + "kl": 0.0, + "learning_rate": 9.61674203669131e-07, + "loss": 0.6539, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 1.250100016593933, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.1796889305114746, + "speech_entropy": 2.3027138710021973, + "speech_kl": 0.0, + "step": 282, + "text_entropy": 1.0392296314239502, + "text_kl": 0.0, + "total_entropy": 2.0374677181243896 + }, + { + "combined_loss": 0.7105068564414978, + "completion_length": 469.1875, + "epoch": 0.09001272264631044, + "grad_norm": 1.9159810543060303, + "kl": 0.0, + "learning_rate": 9.61385347635212e-07, + "loss": 0.7105, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.7288135886192322, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.368356227874756, + "speech_entropy": 2.4519176483154297, + "speech_kl": 0.0, + "step": 283, + "text_entropy": 2.145763397216797, + "text_kl": 0.0, + "total_entropy": 2.3996076583862305 + }, + { + "combined_loss": 0.562969982624054, + "completion_length": 570.3125, + "epoch": 0.09033078880407125, + "grad_norm": 1.6282283067703247, + "kl": 0.0, + "learning_rate": 9.610954559391704e-07, + "loss": 0.563, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.9788135886192322, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 1.8765664100646973, + "speech_entropy": 2.180119514465332, + "speech_kl": 0.0, + "step": 284, + "text_entropy": 0.6460127830505371, + "text_kl": 0.0, + "total_entropy": 1.8707494735717773 + }, + { + "combined_loss": 0.691946268081665, + "completion_length": 618.0625, + "epoch": 0.09064885496183206, + "grad_norm": 1.7233694791793823, + "kl": 0.0, + "learning_rate": 9.60804529310808e-07, + "loss": 0.6919, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.3536533117294312, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.306487560272217, + "speech_entropy": 2.1861276626586914, + "speech_kl": 0.0, + "step": 285, + "text_entropy": 0.9004356861114502, + "text_kl": 0.0, + "total_entropy": 1.9308792352676392 + }, + { + "combined_loss": 0.6357396841049194, + "completion_length": 506.5625, + "epoch": 0.09096692111959287, + "grad_norm": 1.5584510564804077, + "kl": 0.0, + "learning_rate": 9.605125684825322e-07, + "loss": 0.6357, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1191322803497314, + "speech_entropy": 2.424971580505371, + "speech_kl": 0.0, + "step": 286, + "text_entropy": 1.4699835777282715, + "text_kl": 0.0, + "total_entropy": 2.243284225463867 + }, + { + "combined_loss": 0.8120319843292236, + "completion_length": 579.6875, + "epoch": 0.0912849872773537, + "grad_norm": 1.81868577003479, + "kl": 0.0, + "learning_rate": 9.602195741893546e-07, + "loss": 0.812, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.6115237474441528, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": -1.862645149230957e-09, + "sft_loss": 2.706773042678833, + "speech_entropy": 2.2252883911132812, + "speech_kl": 0.0, + "step": 287, + "text_entropy": 1.768620491027832, + "text_kl": 0.0, + "total_entropy": 2.1409504413604736 + }, + { + "combined_loss": 0.7210448384284973, + "completion_length": 399.75, + "epoch": 0.0916030534351145, + "grad_norm": 3.3182334899902344, + "kl": 0.0, + "learning_rate": 9.59925547168887e-07, + "loss": 0.721, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 1.0731656551361084, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.403482675552368, + "speech_entropy": 2.526559829711914, + "speech_kl": 0.0, + "step": 288, + "text_entropy": 1.5308034420013428, + "text_kl": 0.0, + "total_entropy": 2.3434336185455322 + }, + { + "combined_loss": 0.7331863641738892, + "completion_length": 562.9375, + "epoch": 0.09192111959287531, + "grad_norm": 1.7665117979049683, + "kl": 0.0, + "learning_rate": 9.596304881613432e-07, + "loss": 0.7332, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 1.0327467918395996, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4439544677734375, + "speech_entropy": 2.499924659729004, + "speech_kl": 0.0, + "step": 289, + "text_entropy": 1.3738188743591309, + "text_kl": 0.0, + "total_entropy": 2.2846546173095703 + }, + { + "combined_loss": 0.7444977760314941, + "completion_length": 483.3125, + "epoch": 0.09223918575063614, + "grad_norm": 1.8501068353652954, + "kl": 0.0, + "learning_rate": 9.593343979095332e-07, + "loss": 0.7445, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5000999569892883, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.481659173965454, + "speech_entropy": 2.394073486328125, + "speech_kl": 0.0, + "step": 290, + "text_entropy": 1.5317790508270264, + "text_kl": 0.0, + "total_entropy": 2.2343432903289795 + }, + { + "combined_loss": 0.6787593364715576, + "completion_length": 557.375, + "epoch": 0.09255725190839695, + "grad_norm": 2.0699431896209717, + "kl": 0.0, + "learning_rate": 9.59037277158864e-07, + "loss": 0.6788, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.9478486180305481, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.26253080368042, + "speech_entropy": 2.463019609451294, + "speech_kl": 0.0, + "step": 291, + "text_entropy": 1.1903096437454224, + "text_kl": 0.0, + "total_entropy": 2.2193331718444824 + }, + { + "combined_loss": 0.6866003274917603, + "completion_length": 340.25, + "epoch": 0.09287531806615776, + "grad_norm": 1.7148224115371704, + "kl": 0.0, + "learning_rate": 9.587391266573366e-07, + "loss": 0.6866, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.288667678833008, + "speech_entropy": 2.2371129989624023, + "speech_kl": 0.0, + "step": 292, + "text_entropy": 1.372521162033081, + "text_kl": 0.0, + "total_entropy": 2.084916591644287 + }, + { + "combined_loss": 0.6642424464225769, + "completion_length": 414.375, + "epoch": 0.09319338422391857, + "grad_norm": 1.98994779586792, + "kl": 0.0, + "learning_rate": 9.584399471555449e-07, + "loss": 0.6642, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 1.1036534309387207, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 0.0, + "sft_loss": 2.214141368865967, + "speech_entropy": 2.3391613960266113, + "speech_kl": 0.0, + "step": 293, + "text_entropy": 1.0432538986206055, + "text_kl": 0.0, + "total_entropy": 2.0753378868103027 + }, + { + "combined_loss": 0.6237722635269165, + "completion_length": 518.5, + "epoch": 0.09351145038167939, + "grad_norm": 1.5434069633483887, + "kl": 0.0, + "learning_rate": 9.581397394066726e-07, + "loss": 0.6238, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.0792407989501953, + "speech_entropy": 2.4281280040740967, + "speech_kl": 0.0, + "step": 294, + "text_entropy": 1.0986557006835938, + "text_kl": 0.0, + "total_entropy": 2.148758888244629 + }, + { + "combined_loss": 0.6445981860160828, + "completion_length": 476.0625, + "epoch": 0.0938295165394402, + "grad_norm": 1.9953515529632568, + "kl": 0.0, + "learning_rate": 9.578385041664925e-07, + "loss": 0.6446, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.5774502754211426, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.148660659790039, + "speech_entropy": 2.1423118114471436, + "speech_kl": 0.0, + "step": 295, + "text_entropy": 1.1786762475967407, + "text_kl": 0.0, + "total_entropy": 1.9594595432281494 + }, + { + "combined_loss": 0.698508620262146, + "completion_length": 513.6875, + "epoch": 0.09414758269720101, + "grad_norm": 1.7260363101959229, + "kl": 0.0, + "learning_rate": 9.575362421933638e-07, + "loss": 0.6985, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.9786533117294312, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.328361988067627, + "speech_entropy": 2.1992037296295166, + "speech_kl": 0.0, + "step": 296, + "text_entropy": 1.1559290885925293, + "text_kl": 0.0, + "total_entropy": 1.9949164390563965 + }, + { + "combined_loss": 0.6678205132484436, + "completion_length": 562.1875, + "epoch": 0.09446564885496184, + "grad_norm": 2.3985586166381836, + "kl": 0.0, + "learning_rate": 9.572329542482309e-07, + "loss": 0.6678, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.5622053742408752, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.2260682582855225, + "speech_entropy": 2.17387056350708, + "speech_kl": 0.0, + "step": 297, + "text_entropy": 1.249056339263916, + "text_kl": 0.0, + "total_entropy": 1.9902275800704956 + }, + { + "combined_loss": 0.7157855033874512, + "completion_length": 452.0, + "epoch": 0.09478371501272265, + "grad_norm": 1.746224045753479, + "kl": 0.0, + "learning_rate": 9.569286410946207e-07, + "loss": 0.7158, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.5774502158164978, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.385951519012451, + "speech_entropy": 2.403526782989502, + "speech_kl": 0.0, + "step": 298, + "text_entropy": 1.6002991199493408, + "text_kl": 0.0, + "total_entropy": 2.2620410919189453 + }, + { + "combined_loss": 0.7848547101020813, + "completion_length": 352.8125, + "epoch": 0.09510178117048346, + "grad_norm": 2.679422616958618, + "kl": 0.0, + "learning_rate": 9.566233034986411e-07, + "loss": 0.7849, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.7500999569892883, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.6161820888519287, + "speech_entropy": 2.4429244995117188, + "speech_kl": 0.0, + "step": 299, + "text_entropy": 1.1291344165802002, + "text_kl": 0.0, + "total_entropy": 2.160275936126709 + }, + { + "combined_loss": 0.7183820009231567, + "completion_length": 509.0, + "epoch": 0.09541984732824428, + "grad_norm": 2.244758129119873, + "kl": 0.0, + "learning_rate": 9.563169422289796e-07, + "loss": 0.7184, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.478813648223877, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.394606590270996, + "speech_entropy": 2.7869982719421387, + "speech_kl": 0.0, + "step": 300, + "text_entropy": 1.4828916788101196, + "text_kl": 0.0, + "total_entropy": 2.5221667289733887 + }, + { + "combined_loss": 0.6260841488838196, + "completion_length": 436.25, + "epoch": 0.09573791348600509, + "grad_norm": 2.182548761367798, + "kl": 0.0, + "learning_rate": 9.560095580568996e-07, + "loss": 0.6261, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.395711898803711, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.086947202682495, + "speech_entropy": 2.514786720275879, + "speech_kl": 0.0, + "step": 301, + "text_entropy": 1.6524386405944824, + "text_kl": 0.0, + "total_entropy": 2.2878293991088867 + }, + { + "combined_loss": 0.7308363914489746, + "completion_length": 391.375, + "epoch": 0.0960559796437659, + "grad_norm": 1.8132033348083496, + "kl": 0.0, + "learning_rate": 9.55701151756241e-07, + "loss": 0.7308, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.4361209869384766, + "speech_entropy": 2.2866296768188477, + "speech_kl": 0.0, + "step": 302, + "text_entropy": 1.3407695293426514, + "text_kl": 0.0, + "total_entropy": 2.0895495414733887 + }, + { + "combined_loss": 0.6096498966217041, + "completion_length": 303.9375, + "epoch": 0.09637404580152671, + "grad_norm": 2.869903087615967, + "kl": 0.0, + "learning_rate": 9.55391724103416e-07, + "loss": 0.6096, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.7394567728042603, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 0.0, + "sft_loss": 2.0321662425994873, + "speech_entropy": 2.8348755836486816, + "speech_kl": 0.0, + "step": 303, + "text_entropy": 1.151402235031128, + "text_kl": 0.0, + "total_entropy": 2.51552152633667 + }, + { + "combined_loss": 0.6735842227935791, + "completion_length": 553.4375, + "epoch": 0.09669211195928754, + "grad_norm": 1.5486979484558105, + "kl": 0.0, + "learning_rate": 9.550812758774085e-07, + "loss": 0.6736, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.4788135886192322, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 0.0, + "sft_loss": 2.2452807426452637, + "speech_entropy": 2.6690659523010254, + "speech_kl": 0.0, + "step": 304, + "text_entropy": 1.2094488143920898, + "text_kl": 0.0, + "total_entropy": 2.3848817348480225 + }, + { + "combined_loss": 0.6192010641098022, + "completion_length": 465.4375, + "epoch": 0.09701017811704835, + "grad_norm": 1.6173670291900635, + "kl": 0.0, + "learning_rate": 9.547698078597713e-07, + "loss": 0.6192, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.6770563125610352, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0640034675598145, + "speech_entropy": 2.154529094696045, + "speech_kl": 0.0, + "step": 305, + "text_entropy": 1.051210641860962, + "text_kl": 0.0, + "total_entropy": 1.921095848083496 + }, + { + "combined_loss": 0.6843652725219727, + "completion_length": 401.125, + "epoch": 0.09732824427480916, + "grad_norm": 2.191951036453247, + "kl": 0.0, + "learning_rate": 9.54457320834625e-07, + "loss": 0.6844, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.5581127405166626, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.281217575073242, + "speech_entropy": 2.282792091369629, + "speech_kl": 0.0, + "step": 306, + "text_entropy": 1.1090407371520996, + "text_kl": 0.0, + "total_entropy": 2.0719239711761475 + }, + { + "combined_loss": 0.6455174684524536, + "completion_length": 423.625, + "epoch": 0.09764631043256998, + "grad_norm": 2.207777261734009, + "kl": 0.0, + "learning_rate": 9.541438155886554e-07, + "loss": 0.6455, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 0.8644567728042603, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.1517248153686523, + "speech_entropy": 2.6509978771209717, + "speech_kl": 0.0, + "step": 307, + "text_entropy": 1.262976884841919, + "text_kl": 0.0, + "total_entropy": 2.38793683052063 + }, + { + "combined_loss": 0.7142800092697144, + "completion_length": 549.875, + "epoch": 0.09796437659033079, + "grad_norm": 1.5794309377670288, + "kl": 0.0, + "learning_rate": 9.538292929111112e-07, + "loss": 0.7143, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.3809332847595215, + "speech_entropy": 2.319915294647217, + "speech_kl": 0.0, + "step": 308, + "text_entropy": 1.5507756471633911, + "text_kl": 0.0, + "total_entropy": 2.1751065254211426 + }, + { + "combined_loss": 0.6554994583129883, + "completion_length": 314.3125, + "epoch": 0.0982824427480916, + "grad_norm": 1.8395949602127075, + "kl": 0.0, + "learning_rate": 9.535137535938031e-07, + "loss": 0.6555, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.5646764636039734, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1849982738494873, + "speech_entropy": 2.135890245437622, + "speech_kl": 0.0, + "step": 309, + "text_entropy": 0.8897652626037598, + "text_kl": 0.0, + "total_entropy": 1.9156931638717651 + }, + { + "combined_loss": 0.738502025604248, + "completion_length": 449.25, + "epoch": 0.09860050890585242, + "grad_norm": 1.810171127319336, + "kl": 0.0, + "learning_rate": 9.531971984311011e-07, + "loss": 0.7385, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.7654882073402405, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4616734981536865, + "speech_entropy": 2.2290310859680176, + "speech_kl": 0.0, + "step": 310, + "text_entropy": 1.629712462425232, + "text_kl": 0.0, + "total_entropy": 2.121129035949707 + }, + { + "combined_loss": 0.7250782251358032, + "completion_length": 504.125, + "epoch": 0.09891857506361323, + "grad_norm": 1.8087352514266968, + "kl": 0.0, + "learning_rate": 9.528796282199321e-07, + "loss": 0.7251, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4169273376464844, + "speech_entropy": 2.1521968841552734, + "speech_kl": 0.0, + "step": 311, + "text_entropy": 1.3652535676956177, + "text_kl": 0.0, + "total_entropy": 2.007472038269043 + }, + { + "combined_loss": 0.6740373373031616, + "completion_length": 274.0625, + "epoch": 0.09923664122137404, + "grad_norm": 2.0441370010375977, + "kl": 0.0, + "learning_rate": 9.52561043759779e-07, + "loss": 0.674, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 1.183112621307373, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.246790885925293, + "speech_entropy": 2.477415084838867, + "speech_kl": 0.0, + "step": 312, + "text_entropy": 0.9889883399009705, + "text_kl": 0.0, + "total_entropy": 2.1920857429504395 + }, + { + "combined_loss": 0.64084392786026, + "completion_length": 396.0625, + "epoch": 0.09955470737913485, + "grad_norm": 1.5913077592849731, + "kl": 0.0, + "learning_rate": 9.522414458526778e-07, + "loss": 0.6408, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1361465454101562, + "speech_entropy": 2.176445484161377, + "speech_kl": 0.0, + "step": 313, + "text_entropy": 1.2821910381317139, + "text_kl": 0.0, + "total_entropy": 2.0197858810424805 + }, + { + "combined_loss": 0.6571515202522278, + "completion_length": 449.0625, + "epoch": 0.09987277353689568, + "grad_norm": 1.8189066648483276, + "kl": 0.0, + "learning_rate": 9.519208353032158e-07, + "loss": 0.6572, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.4733423590660095, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.190505027770996, + "speech_entropy": 2.1687145233154297, + "speech_kl": 0.0, + "step": 314, + "text_entropy": 1.071582555770874, + "text_kl": 0.0, + "total_entropy": 1.9608268737792969 + }, + { + "combined_loss": 0.5773699283599854, + "completion_length": 375.3125, + "epoch": 0.10019083969465649, + "grad_norm": 1.6602267026901245, + "kl": 0.0, + "learning_rate": 9.515992129185294e-07, + "loss": 0.5774, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.8274502158164978, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 1.924566388130188, + "speech_entropy": 2.1676058769226074, + "speech_kl": 0.0, + "step": 315, + "text_entropy": 0.8978569507598877, + "text_kl": 0.0, + "total_entropy": 1.9022667407989502 + }, + { + "combined_loss": 0.591927170753479, + "completion_length": 374.9375, + "epoch": 0.1005089058524173, + "grad_norm": 1.8265076875686646, + "kl": 0.0, + "learning_rate": 9.512765795083029e-07, + "loss": 0.5919, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 0.0, + "sft_loss": 1.973090410232544, + "speech_entropy": 2.2445173263549805, + "speech_kl": 0.0, + "step": 316, + "text_entropy": 0.9660072922706604, + "text_kl": 0.0, + "total_entropy": 1.9937914609909058 + }, + { + "combined_loss": 0.7159188985824585, + "completion_length": 387.875, + "epoch": 0.10082697201017812, + "grad_norm": 2.4045403003692627, + "kl": 0.0, + "learning_rate": 9.509529358847654e-07, + "loss": 0.7159, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.3863961696624756, + "speech_entropy": 2.341677665710449, + "speech_kl": 0.0, + "step": 317, + "text_entropy": 1.6361427307128906, + "text_kl": 0.0, + "total_entropy": 2.214456796646118 + }, + { + "combined_loss": 0.6382442712783813, + "completion_length": 391.0625, + "epoch": 0.10114503816793893, + "grad_norm": 1.9414576292037964, + "kl": 0.0, + "learning_rate": 9.506282828626894e-07, + "loss": 0.6382, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.1274807453155518, + "speech_entropy": 2.3970725536346436, + "speech_kl": 0.0, + "step": 318, + "text_entropy": 1.257880449295044, + "text_kl": 0.0, + "total_entropy": 2.180114269256592 + }, + { + "combined_loss": 0.6456592082977295, + "completion_length": 357.625, + "epoch": 0.10146310432569974, + "grad_norm": 1.8857872486114502, + "kl": 0.0, + "learning_rate": 9.503026212593886e-07, + "loss": 0.6457, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.1521973609924316, + "speech_entropy": 2.1984481811523438, + "speech_kl": 0.0, + "step": 319, + "text_entropy": 1.3871957063674927, + "text_kl": 0.0, + "total_entropy": 2.054154634475708 + }, + { + "combined_loss": 0.6763216257095337, + "completion_length": 374.75, + "epoch": 0.10178117048346055, + "grad_norm": 1.619836688041687, + "kl": 0.0, + "learning_rate": 9.499759518947154e-07, + "loss": 0.6763, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.2544054985046387, + "speech_entropy": 2.144301414489746, + "speech_kl": 0.0, + "step": 320, + "text_entropy": 0.933074951171875, + "text_kl": 0.0, + "total_entropy": 1.9263477325439453 + }, + { + "combined_loss": 0.8536103963851929, + "completion_length": 433.5625, + "epoch": 0.10209923664122138, + "grad_norm": 3.0396976470947266, + "kl": 0.0, + "learning_rate": 9.496482755910599e-07, + "loss": 0.8536, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.772705078125, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.8453681468963623, + "speech_entropy": 2.4692482948303223, + "speech_kl": 0.0, + "step": 321, + "text_entropy": 1.5737974643707275, + "text_kl": 0.0, + "total_entropy": 2.3104732036590576 + }, + { + "combined_loss": 0.6165826320648193, + "completion_length": 455.8125, + "epoch": 0.10241730279898219, + "grad_norm": 1.8582490682601929, + "kl": 0.0, + "learning_rate": 9.493195931733465e-07, + "loss": 0.6166, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0552754402160645, + "speech_entropy": 2.3463516235351562, + "speech_kl": 0.0, + "step": 322, + "text_entropy": 0.7413707971572876, + "text_kl": 0.0, + "total_entropy": 2.0313949584960938 + }, + { + "combined_loss": 0.7083429098129272, + "completion_length": 455.0625, + "epoch": 0.102735368956743, + "grad_norm": 1.5822250843048096, + "kl": 0.0, + "learning_rate": 9.489899054690329e-07, + "loss": 0.7083, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.3536534011363983, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 2.361143112182617, + "speech_entropy": 2.1958041191101074, + "speech_kl": 0.0, + "step": 323, + "text_entropy": 1.5856623649597168, + "text_kl": 0.0, + "total_entropy": 2.0761876106262207 + }, + { + "combined_loss": 0.6362742185592651, + "completion_length": 430.3125, + "epoch": 0.10305343511450382, + "grad_norm": 2.0028886795043945, + "kl": 0.0, + "learning_rate": 9.486592133081075e-07, + "loss": 0.6363, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.4478486180305481, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1209139823913574, + "speech_entropy": 2.1673266887664795, + "speech_kl": 0.0, + "step": 324, + "text_entropy": 1.05299711227417, + "text_kl": 0.0, + "total_entropy": 1.9691715240478516 + }, + { + "combined_loss": 0.7382842302322388, + "completion_length": 476.875, + "epoch": 0.10337150127226463, + "grad_norm": 2.1232645511627197, + "kl": 0.0, + "learning_rate": 9.483275175230874e-07, + "loss": 0.7383, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.0774502754211426, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.460947275161743, + "speech_entropy": 2.690502882003784, + "speech_kl": 0.0, + "step": 325, + "text_entropy": 1.2036354541778564, + "text_kl": 0.0, + "total_entropy": 2.3894941806793213 + }, + { + "combined_loss": 0.6234292387962341, + "completion_length": 601.4375, + "epoch": 0.10368956743002544, + "grad_norm": 1.537564992904663, + "kl": 0.0, + "learning_rate": 9.479948189490164e-07, + "loss": 0.6234, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.5581127405166626, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.078097343444824, + "speech_entropy": 2.1220028400421143, + "speech_kl": 0.0, + "step": 326, + "text_entropy": 1.139967918395996, + "text_kl": 0.0, + "total_entropy": 1.9373573064804077 + }, + { + "combined_loss": 0.6593961715698242, + "completion_length": 401.125, + "epoch": 0.10400763358778627, + "grad_norm": 2.1285951137542725, + "kl": 0.0, + "learning_rate": 9.476611184234627e-07, + "loss": 0.6594, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 1.019437551498413, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.1979873180389404, + "speech_entropy": 2.558565616607666, + "speech_kl": 0.0, + "step": 327, + "text_entropy": 1.2850337028503418, + "text_kl": 0.0, + "total_entropy": 2.329627513885498 + }, + { + "combined_loss": 0.7995979189872742, + "completion_length": 416.5625, + "epoch": 0.10432569974554708, + "grad_norm": 2.2032155990600586, + "kl": 0.0, + "learning_rate": 9.473264167865171e-07, + "loss": 0.7996, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 0.0, + "sft_loss": 2.6653263568878174, + "speech_entropy": 2.289992332458496, + "speech_kl": 0.0, + "step": 328, + "text_entropy": 1.8079229593276978, + "text_kl": 0.0, + "total_entropy": 2.2008442878723145 + }, + { + "combined_loss": 0.7375897169113159, + "completion_length": 515.0625, + "epoch": 0.10464376590330789, + "grad_norm": 1.9254885911941528, + "kl": 0.0, + "learning_rate": 9.469907148807904e-07, + "loss": 0.7376, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.8515443801879883, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.458632469177246, + "speech_entropy": 2.328657627105713, + "speech_kl": 0.0, + "step": 329, + "text_entropy": 1.6102688312530518, + "text_kl": 0.0, + "total_entropy": 2.207139492034912 + }, + { + "combined_loss": 0.6494673490524292, + "completion_length": 605.5, + "epoch": 0.1049618320610687, + "grad_norm": 1.5243898630142212, + "kl": 0.0, + "learning_rate": 9.466540135514118e-07, + "loss": 0.6495, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.5581127405166626, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.164891004562378, + "speech_entropy": 2.216064929962158, + "speech_kl": 0.0, + "step": 330, + "text_entropy": 1.050743579864502, + "text_kl": 0.0, + "total_entropy": 1.9922053813934326 + }, + { + "combined_loss": 0.6644630432128906, + "completion_length": 465.4375, + "epoch": 0.10527989821882952, + "grad_norm": 2.0127413272857666, + "kl": 0.0, + "learning_rate": 9.463163136460267e-07, + "loss": 0.6645, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 1.1372368335723877, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.214876651763916, + "speech_entropy": 2.606668472290039, + "speech_kl": 0.0, + "step": 331, + "text_entropy": 1.4162580966949463, + "text_kl": 0.0, + "total_entropy": 2.3680739402770996 + }, + { + "combined_loss": 0.6894693374633789, + "completion_length": 457.5, + "epoch": 0.10559796437659033, + "grad_norm": 1.5198652744293213, + "kl": 0.0, + "learning_rate": 9.45977616014794e-07, + "loss": 0.6895, + "num_samples": 1.0, + "reward": 4.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 4.8125, + "rl_loss": 0.0, + "sft_loss": 2.2982308864593506, + "speech_entropy": 2.0877485275268555, + "speech_kl": 0.0, + "step": 332, + "text_entropy": 1.2472989559173584, + "text_kl": 0.0, + "total_entropy": 1.9227503538131714 + }, + { + "combined_loss": 0.6039384603500366, + "completion_length": 533.625, + "epoch": 0.10591603053435114, + "grad_norm": 1.8885440826416016, + "kl": 0.0, + "learning_rate": 9.456379215103845e-07, + "loss": 0.6039, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 1.010462999343872, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0131282806396484, + "speech_entropy": 2.623091697692871, + "speech_kl": 0.0, + "step": 333, + "text_entropy": 0.8223081827163696, + "text_kl": 0.0, + "total_entropy": 2.1995439529418945 + }, + { + "combined_loss": 0.7336439490318298, + "completion_length": 343.3125, + "epoch": 0.10623409669211197, + "grad_norm": 1.7900915145874023, + "kl": 0.0, + "learning_rate": 9.452972309879789e-07, + "loss": 0.7336, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.4454798698425293, + "speech_entropy": 2.2133467197418213, + "speech_kl": 0.0, + "step": 334, + "text_entropy": 1.487365484237671, + "text_kl": 0.0, + "total_entropy": 2.088663339614868 + }, + { + "combined_loss": 0.6571398377418518, + "completion_length": 472.6875, + "epoch": 0.10655216284987278, + "grad_norm": 1.6357449293136597, + "kl": 0.0, + "learning_rate": 9.44955545305265e-07, + "loss": 0.6571, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.5000999569892883, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 0.0, + "sft_loss": 2.1904659271240234, + "speech_entropy": 2.30568790435791, + "speech_kl": 0.0, + "step": 335, + "text_entropy": 0.9574769139289856, + "text_kl": 0.0, + "total_entropy": 2.014315128326416 + }, + { + "combined_loss": 0.663806140422821, + "completion_length": 316.8125, + "epoch": 0.10687022900763359, + "grad_norm": 1.8551936149597168, + "kl": 0.0, + "learning_rate": 9.446128653224363e-07, + "loss": 0.6638, + "num_samples": 1.0, + "reward": 2.9375, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 2.9375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2126870155334473, + "speech_entropy": 2.284435749053955, + "speech_kl": 0.0, + "step": 336, + "text_entropy": 1.4010343551635742, + "text_kl": 0.0, + "total_entropy": 2.119978904724121 + }, + { + "combined_loss": 0.696797788143158, + "completion_length": 450.1875, + "epoch": 0.1071882951653944, + "grad_norm": 1.812302589416504, + "kl": 0.0, + "learning_rate": 9.442691919021891e-07, + "loss": 0.6968, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.7887751460075378, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.3226590156555176, + "speech_entropy": 2.48870587348938, + "speech_kl": 0.0, + "step": 337, + "text_entropy": 1.0859538316726685, + "text_kl": 0.0, + "total_entropy": 2.2247252464294434 + }, + { + "combined_loss": 0.6544849276542664, + "completion_length": 362.5, + "epoch": 0.10750636132315522, + "grad_norm": 1.7814265489578247, + "kl": 0.0, + "learning_rate": 9.43924525909721e-07, + "loss": 0.6545, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 0.6831126809120178, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.1816163063049316, + "speech_entropy": 2.2594857215881348, + "speech_kl": 0.0, + "step": 338, + "text_entropy": 1.0299164056777954, + "text_kl": 0.0, + "total_entropy": 2.003577470779419 + }, + { + "combined_loss": 0.5944312214851379, + "completion_length": 279.625, + "epoch": 0.10782442748091603, + "grad_norm": 1.5945285558700562, + "kl": 0.0, + "learning_rate": 9.43578868212728e-07, + "loss": 0.5944, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.4733423590660095, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 1.9814373254776, + "speech_entropy": 2.3196024894714355, + "speech_kl": 0.0, + "step": 339, + "text_entropy": 0.6264474391937256, + "text_kl": 0.0, + "total_entropy": 1.9987720251083374 + }, + { + "combined_loss": 0.7080922722816467, + "completion_length": 357.75, + "epoch": 0.10814249363867684, + "grad_norm": 1.8270853757858276, + "kl": 0.0, + "learning_rate": 9.432322196814032e-07, + "loss": 0.7081, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.360307455062866, + "speech_entropy": 2.377963066101074, + "speech_kl": 0.0, + "step": 340, + "text_entropy": 1.37347412109375, + "text_kl": 0.0, + "total_entropy": 2.1954591274261475 + }, + { + "combined_loss": 0.6495932340621948, + "completion_length": 439.4375, + "epoch": 0.10846055979643766, + "grad_norm": 1.823044776916504, + "kl": 0.0, + "learning_rate": 9.428845811884336e-07, + "loss": 0.6496, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.165310859680176, + "speech_entropy": 2.547529697418213, + "speech_kl": 0.0, + "step": 341, + "text_entropy": 1.3410420417785645, + "text_kl": 0.0, + "total_entropy": 2.3126533031463623 + }, + { + "combined_loss": 0.744182825088501, + "completion_length": 700.0625, + "epoch": 0.10877862595419847, + "grad_norm": 1.4928951263427734, + "kl": 0.0, + "learning_rate": 9.42535953608999e-07, + "loss": 0.7442, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.36445680260658264, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 0.0, + "sft_loss": 2.48060941696167, + "speech_entropy": 2.0912342071533203, + "speech_kl": 0.0, + "step": 342, + "text_entropy": 1.2619953155517578, + "text_kl": 0.0, + "total_entropy": 1.9290344715118408 + }, + { + "combined_loss": 0.6683810353279114, + "completion_length": 419.0, + "epoch": 0.10909669211195928, + "grad_norm": 2.006411075592041, + "kl": 0.0, + "learning_rate": 9.421863378207685e-07, + "loss": 0.6684, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.0154881477355957, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.2279367446899414, + "speech_entropy": 2.1261141300201416, + "speech_kl": 0.0, + "step": 343, + "text_entropy": 1.138145923614502, + "text_kl": 0.0, + "total_entropy": 1.941408395767212 + }, + { + "combined_loss": 0.6885769367218018, + "completion_length": 448.625, + "epoch": 0.10941475826972011, + "grad_norm": 1.6280336380004883, + "kl": 0.0, + "learning_rate": 9.418357347038998e-07, + "loss": 0.6886, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2952563762664795, + "speech_entropy": 2.533572196960449, + "speech_kl": 0.0, + "step": 344, + "text_entropy": 1.452898621559143, + "text_kl": 0.0, + "total_entropy": 2.3209335803985596 + }, + { + "combined_loss": 0.6933377981185913, + "completion_length": 469.25, + "epoch": 0.10973282442748092, + "grad_norm": 1.7831571102142334, + "kl": 0.0, + "learning_rate": 9.414841451410354e-07, + "loss": 0.6933, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.7217878103256226, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.3111257553100586, + "speech_entropy": 2.4832825660705566, + "speech_kl": 0.0, + "step": 345, + "text_entropy": 1.2831058502197266, + "text_kl": 0.0, + "total_entropy": 2.2077646255493164 + }, + { + "combined_loss": 0.6893813610076904, + "completion_length": 457.25, + "epoch": 0.11005089058524173, + "grad_norm": 1.5565513372421265, + "kl": 0.0, + "learning_rate": 9.411315700173023e-07, + "loss": 0.6894, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.2979378700256348, + "speech_entropy": 2.1734261512756348, + "speech_kl": 0.0, + "step": 346, + "text_entropy": 1.4588537216186523, + "text_kl": 0.0, + "total_entropy": 2.0491230487823486 + }, + { + "combined_loss": 0.6448703408241272, + "completion_length": 340.125, + "epoch": 0.11036895674300254, + "grad_norm": 2.0143826007843018, + "kl": 0.0, + "learning_rate": 9.407780102203073e-07, + "loss": 0.6449, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.0313551425933838, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.1495676040649414, + "speech_entropy": 2.1733856201171875, + "speech_kl": 0.0, + "step": 347, + "text_entropy": 1.2910046577453613, + "text_kl": 0.0, + "total_entropy": 2.0091493129730225 + }, + { + "combined_loss": 0.7653839588165283, + "completion_length": 219.375, + "epoch": 0.11068702290076336, + "grad_norm": 2.6158885955810547, + "kl": 0.0, + "learning_rate": 9.40423466640137e-07, + "loss": 0.7654, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.4478486180305481, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.5512795448303223, + "speech_entropy": 2.3109560012817383, + "speech_kl": 0.0, + "step": 348, + "text_entropy": 1.525007963180542, + "text_kl": 0.0, + "total_entropy": 2.177790641784668 + }, + { + "combined_loss": 0.7735346555709839, + "completion_length": 485.0, + "epoch": 0.11100508905852417, + "grad_norm": 1.7478300333023071, + "kl": 0.0, + "learning_rate": 9.400679401693546e-07, + "loss": 0.7735, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.48945680260658264, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.578448534011841, + "speech_entropy": 2.2317914962768555, + "speech_kl": 0.0, + "step": 349, + "text_entropy": 1.4859943389892578, + "text_kl": 0.0, + "total_entropy": 2.102978229522705 + }, + { + "combined_loss": 0.7051453590393066, + "completion_length": 413.4375, + "epoch": 0.11132315521628498, + "grad_norm": 1.8334420919418335, + "kl": 0.0, + "learning_rate": 9.397114317029974e-07, + "loss": 0.7051, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 0.0, + "sft_loss": 2.3504843711853027, + "speech_entropy": 2.3960375785827637, + "speech_kl": 0.0, + "step": 350, + "text_entropy": 1.6391247510910034, + "text_kl": 0.0, + "total_entropy": 2.2544939517974854 + }, + { + "combined_loss": 0.6969923973083496, + "completion_length": 393.25, + "epoch": 0.11164122137404581, + "grad_norm": 1.7989410161972046, + "kl": 0.0, + "learning_rate": 9.393539421385749e-07, + "loss": 0.697, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.323307514190674, + "speech_entropy": 2.3470993041992188, + "speech_kl": 0.0, + "step": 351, + "text_entropy": 1.3637549877166748, + "text_kl": 0.0, + "total_entropy": 2.1490964889526367 + }, + { + "combined_loss": 0.6479834318161011, + "completion_length": 286.0625, + "epoch": 0.11195928753180662, + "grad_norm": 1.3780865669250488, + "kl": 0.0, + "learning_rate": 9.38995472376067e-07, + "loss": 0.648, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 0.0, + "sft_loss": 2.159944534301758, + "speech_entropy": 2.2307074069976807, + "speech_kl": 0.0, + "step": 352, + "text_entropy": 1.2372667789459229, + "text_kl": 0.0, + "total_entropy": 2.0377559661865234 + }, + { + "combined_loss": 0.9208307862281799, + "completion_length": 562.375, + "epoch": 0.11227735368956743, + "grad_norm": 2.392199754714966, + "kl": 0.0, + "learning_rate": 9.386360233179206e-07, + "loss": 0.9208, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.7065354585647583, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 0.0, + "sft_loss": 3.0694358348846436, + "speech_entropy": 2.4065301418304443, + "speech_kl": 0.0, + "step": 353, + "text_entropy": 1.0953893661499023, + "text_kl": 0.0, + "total_entropy": 2.1367077827453613 + }, + { + "combined_loss": 0.6610080599784851, + "completion_length": 534.125, + "epoch": 0.11259541984732824, + "grad_norm": 1.6925069093704224, + "kl": 0.0, + "learning_rate": 9.382755958690485e-07, + "loss": 0.661, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.7500999569892883, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.203360080718994, + "speech_entropy": 2.2898178100585938, + "speech_kl": 0.0, + "step": 354, + "text_entropy": 1.2608642578125, + "text_kl": 0.0, + "total_entropy": 2.0855863094329834 + }, + { + "combined_loss": 0.6016900539398193, + "completion_length": 431.875, + "epoch": 0.11291348600508906, + "grad_norm": 2.1892993450164795, + "kl": 0.0, + "learning_rate": 9.379141909368262e-07, + "loss": 0.6017, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.2394567728042603, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.0056333541870117, + "speech_entropy": 2.3564295768737793, + "speech_kl": 0.0, + "step": 355, + "text_entropy": 1.2530461549758911, + "text_kl": 0.0, + "total_entropy": 2.141939640045166 + }, + { + "combined_loss": 0.6209220290184021, + "completion_length": 342.8125, + "epoch": 0.11323155216284987, + "grad_norm": 2.095818519592285, + "kl": 0.0, + "learning_rate": 9.375518094310902e-07, + "loss": 0.6209, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.7288135886192322, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.069740056991577, + "speech_entropy": 2.1869020462036133, + "speech_kl": 0.0, + "step": 356, + "text_entropy": 1.4376100301742554, + "text_kl": 0.0, + "total_entropy": 2.056838035583496 + }, + { + "combined_loss": 0.6652117967605591, + "completion_length": 436.75, + "epoch": 0.11354961832061068, + "grad_norm": 1.6426494121551514, + "kl": 0.0, + "learning_rate": 9.371884522641357e-07, + "loss": 0.6652, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.4733423590660095, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.217372417449951, + "speech_entropy": 2.4225001335144043, + "speech_kl": 0.0, + "step": 357, + "text_entropy": 1.0462160110473633, + "text_kl": 0.0, + "total_entropy": 2.1539487838745117 + }, + { + "combined_loss": 0.6718185544013977, + "completion_length": 358.9375, + "epoch": 0.1138676844783715, + "grad_norm": 2.442697048187256, + "kl": 0.0, + "learning_rate": 9.368241203507136e-07, + "loss": 0.6718, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.6831126809120178, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2393951416015625, + "speech_entropy": 2.331137180328369, + "speech_kl": 0.0, + "step": 358, + "text_entropy": 1.2198386192321777, + "text_kl": 0.0, + "total_entropy": 2.1104815006256104 + }, + { + "combined_loss": 0.6823822259902954, + "completion_length": 366.125, + "epoch": 0.11418575063613232, + "grad_norm": 2.0511248111724854, + "kl": 0.0, + "learning_rate": 9.364588146080293e-07, + "loss": 0.6824, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.7042241096496582, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2746074199676514, + "speech_entropy": 2.4417941570281982, + "speech_kl": 0.0, + "step": 359, + "text_entropy": 1.3095173835754395, + "text_kl": 0.0, + "total_entropy": 2.220010757446289 + }, + { + "combined_loss": 0.8011901378631592, + "completion_length": 471.9375, + "epoch": 0.11450381679389313, + "grad_norm": 2.027939558029175, + "kl": 0.0, + "learning_rate": 9.360925359557396e-07, + "loss": 0.8012, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.6444375514984131, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.6706337928771973, + "speech_entropy": 2.198263168334961, + "speech_kl": 0.0, + "step": 360, + "text_entropy": 1.6125402450561523, + "text_kl": 0.0, + "total_entropy": 2.0894393920898438 + }, + { + "combined_loss": 0.6634936332702637, + "completion_length": 622.0625, + "epoch": 0.11482188295165395, + "grad_norm": 1.804551124572754, + "kl": 0.0, + "learning_rate": 9.357252853159505e-07, + "loss": 0.6635, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.2180101871490479, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.2116456031799316, + "speech_entropy": 2.1450726985931396, + "speech_kl": 0.0, + "step": 361, + "text_entropy": 1.2231552600860596, + "text_kl": 0.0, + "total_entropy": 1.972318410873413 + }, + { + "combined_loss": 0.7286593317985535, + "completion_length": 344.0, + "epoch": 0.11513994910941476, + "grad_norm": 2.2605528831481934, + "kl": 0.0, + "learning_rate": 9.35357063613215e-07, + "loss": 0.7287, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.933112621307373, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4288644790649414, + "speech_entropy": 2.370051383972168, + "speech_kl": 0.0, + "step": 362, + "text_entropy": 1.7082545757293701, + "text_kl": 0.0, + "total_entropy": 2.255542039871216 + }, + { + "combined_loss": 0.7230384349822998, + "completion_length": 372.1875, + "epoch": 0.11545801526717557, + "grad_norm": 1.8940509557724, + "kl": 0.0, + "learning_rate": 9.349878717745308e-07, + "loss": 0.723, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.410127878189087, + "speech_entropy": 2.214015245437622, + "speech_kl": 0.0, + "step": 363, + "text_entropy": 1.4911437034606934, + "text_kl": 0.0, + "total_entropy": 2.0867202281951904 + }, + { + "combined_loss": 0.7651809453964233, + "completion_length": 591.6875, + "epoch": 0.11577608142493638, + "grad_norm": 1.642069697380066, + "kl": 0.0, + "learning_rate": 9.34617710729338e-07, + "loss": 0.7652, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.8538135886192322, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.550603151321411, + "speech_entropy": 2.124917507171631, + "speech_kl": 0.0, + "step": 364, + "text_entropy": 1.3163607120513916, + "text_kl": 0.0, + "total_entropy": 1.9680266380310059 + }, + { + "combined_loss": 0.6506673693656921, + "completion_length": 625.3125, + "epoch": 0.1160941475826972, + "grad_norm": 2.7186765670776367, + "kl": 0.0, + "learning_rate": 9.342465814095166e-07, + "loss": 0.6507, + "num_samples": 1.0, + "reward": 2.4375, + "reward_std": 0.5194375514984131, + "rewards/gpt4o_holistic_reward": 2.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.168890953063965, + "speech_entropy": 2.8031604290008545, + "speech_kl": 0.0, + "step": 365, + "text_entropy": 1.175290822982788, + "text_kl": 0.0, + "total_entropy": 2.457470655441284 + }, + { + "combined_loss": 0.6432278156280518, + "completion_length": 494.9375, + "epoch": 0.11641221374045801, + "grad_norm": 1.8091281652450562, + "kl": 0.0, + "learning_rate": 9.338744847493842e-07, + "loss": 0.6432, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.4788135886192322, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.1440927982330322, + "speech_entropy": 2.3679351806640625, + "speech_kl": 0.0, + "step": 366, + "text_entropy": 1.285107135772705, + "text_kl": 0.0, + "total_entropy": 2.1672964096069336 + }, + { + "combined_loss": 0.6938588619232178, + "completion_length": 392.9375, + "epoch": 0.11673027989821882, + "grad_norm": 2.0217089653015137, + "kl": 0.0, + "learning_rate": 9.335014216856936e-07, + "loss": 0.6939, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.3128628730773926, + "speech_entropy": 2.1415584087371826, + "speech_kl": 0.0, + "step": 367, + "text_entropy": 1.1758991479873657, + "text_kl": 0.0, + "total_entropy": 1.9482624530792236 + }, + { + "combined_loss": 0.726897120475769, + "completion_length": 487.875, + "epoch": 0.11704834605597965, + "grad_norm": 1.7421088218688965, + "kl": 0.0, + "learning_rate": 9.331273931576306e-07, + "loss": 0.7269, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.8146764636039734, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.422990322113037, + "speech_entropy": 2.3109121322631836, + "speech_kl": 0.0, + "step": 368, + "text_entropy": 1.0041307210922241, + "text_kl": 0.0, + "total_entropy": 2.0410120487213135 + }, + { + "combined_loss": 0.6220736503601074, + "completion_length": 688.875, + "epoch": 0.11736641221374046, + "grad_norm": 1.5585706233978271, + "kl": 0.0, + "learning_rate": 9.327524001068118e-07, + "loss": 0.6221, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.8644567728042603, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.0735785961151123, + "speech_entropy": 2.154421806335449, + "speech_kl": 0.0, + "step": 369, + "text_entropy": 1.0012127161026, + "text_kl": 0.0, + "total_entropy": 1.9310146570205688 + }, + { + "combined_loss": 0.6993934512138367, + "completion_length": 423.875, + "epoch": 0.11768447837150127, + "grad_norm": 1.9345406293869019, + "kl": 0.0, + "learning_rate": 9.323764434772815e-07, + "loss": 0.6994, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 0.6231511831283569, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3313114643096924, + "speech_entropy": 2.3823764324188232, + "speech_kl": 0.0, + "step": 370, + "text_entropy": 1.6932473182678223, + "text_kl": 0.0, + "total_entropy": 2.245680332183838 + }, + { + "combined_loss": 0.6287088394165039, + "completion_length": 436.6875, + "epoch": 0.1180025445292621, + "grad_norm": 2.2497189044952393, + "kl": 0.0, + "learning_rate": 9.319995242155101e-07, + "loss": 0.6287, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 0.0, + "sft_loss": 2.095696210861206, + "speech_entropy": 2.30611515045166, + "speech_kl": 0.0, + "step": 371, + "text_entropy": 0.9617571234703064, + "text_kl": 0.0, + "total_entropy": 2.0462255477905273 + }, + { + "combined_loss": 0.6533428430557251, + "completion_length": 536.4375, + "epoch": 0.1183206106870229, + "grad_norm": 2.011059045791626, + "kl": 0.0, + "learning_rate": 9.316216432703917e-07, + "loss": 0.6533, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 1.057937741279602, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.177809238433838, + "speech_entropy": 2.581768035888672, + "speech_kl": 0.0, + "step": 372, + "text_entropy": 1.0381070375442505, + "text_kl": 0.0, + "total_entropy": 2.2744696140289307 + }, + { + "combined_loss": 0.6885940432548523, + "completion_length": 485.1875, + "epoch": 0.11863867684478371, + "grad_norm": 2.2359321117401123, + "kl": 0.0, + "learning_rate": 9.312428015932407e-07, + "loss": 0.6886, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 1.226402759552002, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.2953133583068848, + "speech_entropy": 3.0134053230285645, + "speech_kl": 0.0, + "step": 373, + "text_entropy": 1.2640031576156616, + "text_kl": 0.0, + "total_entropy": 2.5999834537506104 + }, + { + "combined_loss": 0.6422200202941895, + "completion_length": 451.625, + "epoch": 0.11895674300254452, + "grad_norm": 1.8345330953598022, + "kl": 0.0, + "learning_rate": 9.308630001377909e-07, + "loss": 0.6422, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 0.9565354585647583, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 0.0, + "sft_loss": 2.140733242034912, + "speech_entropy": 2.1300010681152344, + "speech_kl": 0.0, + "step": 374, + "text_entropy": 1.349212408065796, + "text_kl": 0.0, + "total_entropy": 1.9911762475967407 + }, + { + "combined_loss": 0.6511521339416504, + "completion_length": 440.8125, + "epoch": 0.11927480916030535, + "grad_norm": 2.062229871749878, + "kl": 0.0, + "learning_rate": 9.304822398601919e-07, + "loss": 0.6512, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 0.0, + "sft_loss": 2.1705071926116943, + "speech_entropy": 2.244220495223999, + "speech_kl": 0.0, + "step": 375, + "text_entropy": 1.1989765167236328, + "text_kl": 0.0, + "total_entropy": 2.0404000282287598 + }, + { + "combined_loss": 0.7069913148880005, + "completion_length": 472.375, + "epoch": 0.11959287531806616, + "grad_norm": 2.3955235481262207, + "kl": 0.0, + "learning_rate": 9.301005217190072e-07, + "loss": 0.707, + "num_samples": 1.0, + "reward": 2.625, + "reward_std": 1.0000998973846436, + "rewards/gpt4o_holistic_reward": 2.625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.356637477874756, + "speech_entropy": 3.064028263092041, + "speech_kl": 0.0, + "step": 376, + "text_entropy": 1.921829104423523, + "text_kl": 0.0, + "total_entropy": 2.78952693939209 + }, + { + "combined_loss": 0.648313045501709, + "completion_length": 499.5625, + "epoch": 0.11991094147582697, + "grad_norm": 2.091409206390381, + "kl": 0.0, + "learning_rate": 9.297178466752118e-07, + "loss": 0.6483, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.0774502754211426, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.161043405532837, + "speech_entropy": 3.0961804389953613, + "speech_kl": 0.0, + "step": 377, + "text_entropy": 1.5107793807983398, + "text_kl": 0.0, + "total_entropy": 2.7813608646392822 + }, + { + "combined_loss": 0.713459849357605, + "completion_length": 469.625, + "epoch": 0.12022900763358779, + "grad_norm": 2.0358638763427734, + "kl": 0.0, + "learning_rate": 9.293342156921896e-07, + "loss": 0.7135, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 0.0, + "sft_loss": 2.378199338912964, + "speech_entropy": 2.2007017135620117, + "speech_kl": 0.0, + "step": 378, + "text_entropy": 1.2517070770263672, + "text_kl": 0.0, + "total_entropy": 2.017704963684082 + }, + { + "combined_loss": 0.6941728591918945, + "completion_length": 430.8125, + "epoch": 0.1205470737913486, + "grad_norm": 1.5506932735443115, + "kl": 0.0, + "learning_rate": 9.289496297357313e-07, + "loss": 0.6942, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.3139095306396484, + "speech_entropy": 2.1886677742004395, + "speech_kl": 0.0, + "step": 379, + "text_entropy": 1.5006790161132812, + "text_kl": 0.0, + "total_entropy": 2.0677237510681152 + }, + { + "combined_loss": 0.7921469211578369, + "completion_length": 444.0, + "epoch": 0.12086513994910941, + "grad_norm": 1.8955799341201782, + "kl": 0.0, + "learning_rate": 9.285640897740315e-07, + "loss": 0.7921, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.4331127107143402, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 0.0, + "sft_loss": 2.6404895782470703, + "speech_entropy": 2.0556373596191406, + "speech_kl": 0.0, + "step": 380, + "text_entropy": 1.4717328548431396, + "text_kl": 0.0, + "total_entropy": 1.9481353759765625 + }, + { + "combined_loss": 0.7151652574539185, + "completion_length": 566.5, + "epoch": 0.12118320610687022, + "grad_norm": 1.9992356300354004, + "kl": 0.0, + "learning_rate": 9.281775967776865e-07, + "loss": 0.7152, + "num_samples": 1.0, + "reward": 4.5, + "reward_std": 0.864456832408905, + "rewards/gpt4o_holistic_reward": 4.5, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.3838841915130615, + "speech_entropy": 2.1918318271636963, + "speech_kl": 0.0, + "step": 381, + "text_entropy": 1.32926344871521, + "text_kl": 0.0, + "total_entropy": 2.039583683013916 + }, + { + "combined_loss": 0.6602721214294434, + "completion_length": 361.8125, + "epoch": 0.12150127226463105, + "grad_norm": 2.4221584796905518, + "kl": 0.0, + "learning_rate": 9.277901517196921e-07, + "loss": 0.6603, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.704224169254303, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2009072303771973, + "speech_entropy": 3.4160265922546387, + "speech_kl": 0.0, + "step": 382, + "text_entropy": 1.843759536743164, + "text_kl": 0.0, + "total_entropy": 3.1454572677612305 + }, + { + "combined_loss": 0.6596149206161499, + "completion_length": 469.5625, + "epoch": 0.12181933842239186, + "grad_norm": 2.0681304931640625, + "kl": 0.0, + "learning_rate": 9.274017555754407e-07, + "loss": 0.6596, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.8676799535751343, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.198716402053833, + "speech_entropy": 2.4791064262390137, + "speech_kl": 0.0, + "step": 383, + "text_entropy": 1.5029938220977783, + "text_kl": 0.0, + "total_entropy": 2.3004837036132812 + }, + { + "combined_loss": 0.6301867961883545, + "completion_length": 606.3125, + "epoch": 0.12213740458015267, + "grad_norm": 2.0828793048858643, + "kl": 0.0, + "learning_rate": 9.270124093227192e-07, + "loss": 0.6302, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.8483423590660095, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1006226539611816, + "speech_entropy": 2.148000717163086, + "speech_kl": 0.0, + "step": 384, + "text_entropy": 0.49018028378486633, + "text_kl": 0.0, + "total_entropy": 1.8085635900497437 + }, + { + "combined_loss": 0.589484453201294, + "completion_length": 417.5, + "epoch": 0.12245547073791349, + "grad_norm": 2.000455856323242, + "kl": 0.0, + "learning_rate": 9.266221139417064e-07, + "loss": 0.5895, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 1.9649479389190674, + "speech_entropy": 2.3916573524475098, + "speech_kl": 0.0, + "step": 385, + "text_entropy": 1.274303913116455, + "text_kl": 0.0, + "total_entropy": 2.180541515350342 + }, + { + "combined_loss": 0.6605916023254395, + "completion_length": 463.625, + "epoch": 0.1227735368956743, + "grad_norm": 2.3290114402770996, + "kl": 0.0, + "learning_rate": 9.262308704149701e-07, + "loss": 0.6606, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.6176798939704895, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.201972007751465, + "speech_entropy": 3.0757741928100586, + "speech_kl": 0.0, + "step": 386, + "text_entropy": 1.367950439453125, + "text_kl": 0.0, + "total_entropy": 2.818263530731201 + }, + { + "combined_loss": 0.6665786504745483, + "completion_length": 479.8125, + "epoch": 0.12309160305343511, + "grad_norm": 3.4283993244171143, + "kl": 0.0, + "learning_rate": 9.258386797274658e-07, + "loss": 0.6666, + "num_samples": 1.0, + "reward": 2.8125, + "reward_std": 1.2235616445541382, + "rewards/gpt4o_holistic_reward": 2.8125, + "rl_loss": -5.587935447692871e-09, + "sft_loss": 2.221928596496582, + "speech_entropy": 3.2466955184936523, + "speech_kl": 0.0, + "step": 387, + "text_entropy": 1.366699457168579, + "text_kl": 0.0, + "total_entropy": 2.97598934173584 + }, + { + "combined_loss": 0.6852359771728516, + "completion_length": 439.5, + "epoch": 0.12340966921119594, + "grad_norm": 2.255603551864624, + "kl": 0.0, + "learning_rate": 9.254455428665329e-07, + "loss": 0.6852, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.8751000165939331, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2841198444366455, + "speech_entropy": 2.7042312622070312, + "speech_kl": 0.0, + "step": 388, + "text_entropy": 1.335016131401062, + "text_kl": 0.0, + "total_entropy": 2.444276809692383 + }, + { + "combined_loss": 0.6944676041603088, + "completion_length": 545.0, + "epoch": 0.12372773536895675, + "grad_norm": 3.5577309131622314, + "kl": 0.0, + "learning_rate": 9.250514608218928e-07, + "loss": 0.6945, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.314892053604126, + "speech_entropy": 2.9382314682006836, + "speech_kl": 0.0, + "step": 389, + "text_entropy": 1.2093441486358643, + "text_kl": 0.0, + "total_entropy": 2.6713953018188477 + }, + { + "combined_loss": 0.6847076416015625, + "completion_length": 318.875, + "epoch": 0.12404580152671756, + "grad_norm": 2.288792610168457, + "kl": 0.0, + "learning_rate": 9.24656434585647e-07, + "loss": 0.6847, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.7042241096496582, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2823588848114014, + "speech_entropy": 2.4487361907958984, + "speech_kl": 0.0, + "step": 390, + "text_entropy": 1.1644865274429321, + "text_kl": 0.0, + "total_entropy": 2.222623348236084 + }, + { + "combined_loss": 0.6812296509742737, + "completion_length": 496.5, + "epoch": 0.12436386768447837, + "grad_norm": 2.5237197875976562, + "kl": 0.0, + "learning_rate": 9.242604651522735e-07, + "loss": 0.6812, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.2178384065628052, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2707653045654297, + "speech_entropy": 3.798156261444092, + "speech_kl": 0.0, + "step": 391, + "text_entropy": 1.453238606452942, + "text_kl": 0.0, + "total_entropy": 3.4767727851867676 + }, + { + "combined_loss": 0.6517486572265625, + "completion_length": 350.5, + "epoch": 0.12468193384223919, + "grad_norm": 1.931217908859253, + "kl": 0.0, + "learning_rate": 9.238635535186246e-07, + "loss": 0.6517, + "num_samples": 1.0, + "reward": 1.875, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 1.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1724953651428223, + "speech_entropy": 2.189283847808838, + "speech_kl": 0.0, + "step": 392, + "text_entropy": 1.1253941059112549, + "text_kl": 0.0, + "total_entropy": 1.9858797788619995 + }, + { + "combined_loss": 0.693282961845398, + "completion_length": 286.8125, + "epoch": 0.125, + "grad_norm": 2.019578218460083, + "kl": 0.0, + "learning_rate": 9.234657006839249e-07, + "loss": 0.6933, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.5000999569892883, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.310943126678467, + "speech_entropy": 2.226673126220703, + "speech_kl": 0.0, + "step": 393, + "text_entropy": 1.3821954727172852, + "text_kl": 0.0, + "total_entropy": 2.0710020065307617 + }, + { + "combined_loss": 0.6477099657058716, + "completion_length": 431.5625, + "epoch": 0.1253180661577608, + "grad_norm": 3.454335927963257, + "kl": 0.0, + "learning_rate": 9.230669076497687e-07, + "loss": 0.6477, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.501086711883545, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1590328216552734, + "speech_entropy": 5.795164108276367, + "speech_kl": 0.0, + "step": 394, + "text_entropy": 1.2117631435394287, + "text_kl": 0.0, + "total_entropy": 5.437085151672363 + }, + { + "combined_loss": 0.6723122596740723, + "completion_length": 557.5625, + "epoch": 0.12563613231552162, + "grad_norm": 1.878871202468872, + "kl": 0.0, + "learning_rate": 9.226671754201167e-07, + "loss": 0.6723, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.2410407066345215, + "speech_entropy": 3.8088769912719727, + "speech_kl": 0.0, + "step": 395, + "text_entropy": 1.0774390697479248, + "text_kl": 0.0, + "total_entropy": 3.4700815677642822 + }, + { + "combined_loss": 0.7662006616592407, + "completion_length": 397.5625, + "epoch": 0.12595419847328243, + "grad_norm": 2.458489418029785, + "kl": 0.0, + "learning_rate": 9.222665050012947e-07, + "loss": 0.7662, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.7654881477355957, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.554001808166504, + "speech_entropy": 2.176966667175293, + "speech_kl": 0.0, + "step": 396, + "text_entropy": 1.7679321765899658, + "text_kl": 0.0, + "total_entropy": 2.1059505939483643 + }, + { + "combined_loss": 0.5795345902442932, + "completion_length": 308.3125, + "epoch": 0.12627226463104327, + "grad_norm": 2.792732000350952, + "kl": 0.0, + "learning_rate": 9.218648974019896e-07, + "loss": 0.5795, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.614456832408905, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 1.9317818880081177, + "speech_entropy": 2.6999130249023438, + "speech_kl": 0.0, + "step": 397, + "text_entropy": 1.0977978706359863, + "text_kl": 0.0, + "total_entropy": 2.432361602783203 + }, + { + "combined_loss": 0.8004469275474548, + "completion_length": 436.625, + "epoch": 0.12659033078880408, + "grad_norm": 4.002890586853027, + "kl": 0.0, + "learning_rate": 9.214623536332482e-07, + "loss": 0.8004, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 0.9788135290145874, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.668156385421753, + "speech_entropy": 5.438493728637695, + "speech_kl": 0.0, + "step": 398, + "text_entropy": 1.7521681785583496, + "text_kl": 0.0, + "total_entropy": 5.220246315002441 + }, + { + "combined_loss": 0.7312483787536621, + "completion_length": 681.8125, + "epoch": 0.1269083969465649, + "grad_norm": 2.7238285541534424, + "kl": 0.0, + "learning_rate": 9.21058874708474e-07, + "loss": 0.7312, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.9463939070701599, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.4374947547912598, + "speech_entropy": 3.460540294647217, + "speech_kl": 0.0, + "step": 399, + "text_entropy": 1.679933786392212, + "text_kl": 0.0, + "total_entropy": 3.2443912029266357 + }, + { + "combined_loss": 0.7706436514854431, + "completion_length": 449.875, + "epoch": 0.1272264631043257, + "grad_norm": 1.9544907808303833, + "kl": 0.0, + "learning_rate": 9.206544616434248e-07, + "loss": 0.7706, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 1.0308762788772583, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.568812131881714, + "speech_entropy": 2.1668813228607178, + "speech_kl": 0.0, + "step": 400, + "text_entropy": 1.5073027610778809, + "text_kl": 0.0, + "total_entropy": 2.0442421436309814 + }, + { + "combined_loss": 0.6995494365692139, + "completion_length": 447.75, + "epoch": 0.1275445292620865, + "grad_norm": 1.9926646947860718, + "kl": 0.0, + "learning_rate": 9.202491154562097e-07, + "loss": 0.6995, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.331831455230713, + "speech_entropy": 3.2041335105895996, + "speech_kl": 0.0, + "step": 401, + "text_entropy": 1.542539119720459, + "text_kl": 0.0, + "total_entropy": 3.0047965049743652 + }, + { + "combined_loss": 0.674731433391571, + "completion_length": 414.75, + "epoch": 0.12786259541984732, + "grad_norm": 1.9648370742797852, + "kl": 0.0, + "learning_rate": 9.198428371672874e-07, + "loss": 0.6747, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.6682298183441162, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.2491049766540527, + "speech_entropy": 2.0483105182647705, + "speech_kl": 0.0, + "step": 402, + "text_entropy": 1.0541050434112549, + "text_kl": 0.0, + "total_entropy": 1.8693532943725586 + }, + { + "combined_loss": 0.7094697952270508, + "completion_length": 490.75, + "epoch": 0.12818066157760813, + "grad_norm": 3.759694814682007, + "kl": 0.0, + "learning_rate": 9.194356277994632e-07, + "loss": 0.7095, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.5713939070701599, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.364899158477783, + "speech_entropy": 2.0986576080322266, + "speech_kl": 0.0, + "step": 403, + "text_entropy": 1.6229678392410278, + "text_kl": 0.0, + "total_entropy": 2.018561363220215 + }, + { + "combined_loss": 0.6625657081604004, + "completion_length": 677.5, + "epoch": 0.12849872773536897, + "grad_norm": 6.927186012268066, + "kl": 0.0, + "learning_rate": 9.19027488377886e-07, + "loss": 0.6626, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 1.3692017793655396, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.208552122116089, + "speech_entropy": 3.995192527770996, + "speech_kl": 0.0, + "step": 404, + "text_entropy": 1.7138077020645142, + "text_kl": 0.0, + "total_entropy": 3.6808578968048096 + }, + { + "combined_loss": 0.6429011225700378, + "completion_length": 530.25, + "epoch": 0.12881679389312978, + "grad_norm": 5.672791004180908, + "kl": 0.0, + "learning_rate": 9.186184199300463e-07, + "loss": 0.6429, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 1.3904881477355957, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": 0.0, + "sft_loss": 2.1430037021636963, + "speech_entropy": 4.908637046813965, + "speech_kl": 0.0, + "step": 405, + "text_entropy": 1.521721363067627, + "text_kl": 0.0, + "total_entropy": 4.66417932510376 + }, + { + "combined_loss": 0.6549615859985352, + "completion_length": 477.375, + "epoch": 0.1291348600508906, + "grad_norm": 2.929534912109375, + "kl": 0.0, + "learning_rate": 9.182084234857735e-07, + "loss": 0.655, + "num_samples": 1.0, + "reward": 2.3125, + "reward_std": 0.8430101871490479, + "rewards/gpt4o_holistic_reward": 2.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.1832053661346436, + "speech_entropy": 2.1303212642669678, + "speech_kl": 0.0, + "step": 406, + "text_entropy": 1.1068902015686035, + "text_kl": 0.0, + "total_entropy": 1.9402127265930176 + }, + { + "combined_loss": 0.6791283488273621, + "completion_length": 461.6875, + "epoch": 0.1294529262086514, + "grad_norm": 2.371797800064087, + "kl": 0.0, + "learning_rate": 9.17797500077233e-07, + "loss": 0.6791, + "num_samples": 1.0, + "reward": 4.4375, + "reward_std": 0.8538135886192322, + "rewards/gpt4o_holistic_reward": 4.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.263761043548584, + "speech_entropy": 3.0191469192504883, + "speech_kl": 0.0, + "step": 407, + "text_entropy": 2.0163426399230957, + "text_kl": 0.0, + "total_entropy": 2.867436408996582 + }, + { + "combined_loss": 0.683641791343689, + "completion_length": 407.125, + "epoch": 0.1297709923664122, + "grad_norm": 3.123765468597412, + "kl": 0.0, + "learning_rate": 9.173856507389244e-07, + "loss": 0.6836, + "num_samples": 1.0, + "reward": 3.0, + "reward_std": 1.3274502754211426, + "rewards/gpt4o_holistic_reward": 3.0, + "rl_loss": 0.0, + "sft_loss": 2.278805732727051, + "speech_entropy": 4.666620254516602, + "speech_kl": 0.0, + "step": 408, + "text_entropy": 4.294477462768555, + "text_kl": 0.0, + "total_entropy": 4.622071266174316 + }, + { + "combined_loss": 0.7313430309295654, + "completion_length": 254.5625, + "epoch": 0.13008905852417302, + "grad_norm": 2.3901264667510986, + "kl": 0.0, + "learning_rate": 9.169728765076774e-07, + "loss": 0.7313, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.437809944152832, + "speech_entropy": 2.1548728942871094, + "speech_kl": 0.0, + "step": 409, + "text_entropy": 1.255692958831787, + "text_kl": 0.0, + "total_entropy": 2.0065855979919434 + }, + { + "combined_loss": 0.6742551326751709, + "completion_length": 608.0, + "epoch": 0.13040712468193386, + "grad_norm": 2.8765196800231934, + "kl": 0.0, + "learning_rate": 9.165591784226511e-07, + "loss": 0.6743, + "num_samples": 1.0, + "reward": 2.0625, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 2.0625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2475171089172363, + "speech_entropy": 2.9422497749328613, + "speech_kl": 0.0, + "step": 410, + "text_entropy": 2.1728014945983887, + "text_kl": 0.0, + "total_entropy": 2.778034210205078 + }, + { + "combined_loss": 0.6012893319129944, + "completion_length": 402.5, + "epoch": 0.13072519083969467, + "grad_norm": 4.226046085357666, + "kl": 0.0, + "learning_rate": 9.161445575253295e-07, + "loss": 0.6013, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.9331126809120178, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 0.0, + "sft_loss": 2.0042977333068848, + "speech_entropy": 3.145230293273926, + "speech_kl": 0.0, + "step": 411, + "text_entropy": 2.9247562885284424, + "text_kl": 0.0, + "total_entropy": 3.075840473175049 + }, + { + "combined_loss": 0.6655627489089966, + "completion_length": 465.5625, + "epoch": 0.13104325699745548, + "grad_norm": 4.012775421142578, + "kl": 0.0, + "learning_rate": 9.157290148595206e-07, + "loss": 0.6656, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.2185425758361816, + "speech_entropy": 2.861677646636963, + "speech_kl": 0.0, + "step": 412, + "text_entropy": 2.6879758834838867, + "text_kl": 0.0, + "total_entropy": 2.890303134918213 + }, + { + "combined_loss": 0.7086147665977478, + "completion_length": 562.5625, + "epoch": 0.1313613231552163, + "grad_norm": 4.615509510040283, + "kl": 0.0, + "learning_rate": 9.153125514713523e-07, + "loss": 0.7086, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 1.1756925582885742, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.362049102783203, + "speech_entropy": 3.222433090209961, + "speech_kl": 0.0, + "step": 413, + "text_entropy": 2.680953025817871, + "text_kl": 0.0, + "total_entropy": 3.1390838623046875 + }, + { + "combined_loss": 0.7332242131233215, + "completion_length": 579.3125, + "epoch": 0.1316793893129771, + "grad_norm": 8.893054962158203, + "kl": 0.0, + "learning_rate": 9.148951684092709e-07, + "loss": 0.7332, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 1.2498197555541992, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 0.0, + "sft_loss": 2.4440808296203613, + "speech_entropy": 3.2557754516601562, + "speech_kl": 0.0, + "step": 414, + "text_entropy": 3.8602097034454346, + "text_kl": 0.0, + "total_entropy": 3.7247118949890137 + }, + { + "combined_loss": 0.6186578273773193, + "completion_length": 346.75, + "epoch": 0.1319974554707379, + "grad_norm": 2.8829197883605957, + "kl": 0.0, + "learning_rate": 9.144768667240375e-07, + "loss": 0.6187, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.4733423590660095, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.062192678451538, + "speech_entropy": 2.5403671264648438, + "speech_kl": 0.0, + "step": 415, + "text_entropy": 2.09316086769104, + "text_kl": 0.0, + "total_entropy": 2.482808828353882 + }, + { + "combined_loss": 0.6629255414009094, + "completion_length": 498.1875, + "epoch": 0.13231552162849872, + "grad_norm": 1.7470639944076538, + "kl": 0.0, + "learning_rate": 9.140576474687263e-07, + "loss": 0.6629, + "num_samples": 1.0, + "reward": 4.625, + "reward_std": 0.7501000165939331, + "rewards/gpt4o_holistic_reward": 4.625, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.209751605987549, + "speech_entropy": 2.040069103240967, + "speech_kl": 0.0, + "step": 416, + "text_entropy": 1.1820553541183472, + "text_kl": 0.0, + "total_entropy": 1.8843843936920166 + }, + { + "combined_loss": 0.7431026101112366, + "completion_length": 490.5, + "epoch": 0.13263358778625955, + "grad_norm": 1.9392704963684082, + "kl": 0.0, + "learning_rate": 9.136375116987211e-07, + "loss": 0.7431, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.7394567728042603, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.477008581161499, + "speech_entropy": 2.424851894378662, + "speech_kl": 0.0, + "step": 417, + "text_entropy": 1.5257370471954346, + "text_kl": 0.0, + "total_entropy": 2.2481653690338135 + }, + { + "combined_loss": 0.6801667809486389, + "completion_length": 366.6875, + "epoch": 0.13295165394402036, + "grad_norm": 2.0551204681396484, + "kl": 0.0, + "learning_rate": 9.132164604717135e-07, + "loss": 0.6802, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2672226428985596, + "speech_entropy": 2.0900187492370605, + "speech_kl": 0.0, + "step": 418, + "text_entropy": 1.2720967531204224, + "text_kl": 0.0, + "total_entropy": 1.9535316228866577 + }, + { + "combined_loss": 0.7415467500686646, + "completion_length": 447.4375, + "epoch": 0.13326972010178118, + "grad_norm": 3.4998650550842285, + "kl": 0.0, + "learning_rate": 9.127944948476993e-07, + "loss": 0.7415, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 1.3322067260742188, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.471822500228882, + "speech_entropy": 2.6262001991271973, + "speech_kl": 0.0, + "step": 419, + "text_entropy": 2.3739140033721924, + "text_kl": 0.0, + "total_entropy": 2.5941972732543945 + }, + { + "combined_loss": 0.6638205051422119, + "completion_length": 492.5, + "epoch": 0.13358778625954199, + "grad_norm": 2.7289879322052, + "kl": 0.0, + "learning_rate": 9.123716158889764e-07, + "loss": 0.6638, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.5000999569892883, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.212735176086426, + "speech_entropy": 2.982739210128784, + "speech_kl": 0.0, + "step": 420, + "text_entropy": 2.626387119293213, + "text_kl": 0.0, + "total_entropy": 2.923607110977173 + }, + { + "combined_loss": 0.801853358745575, + "completion_length": 382.75, + "epoch": 0.1339058524173028, + "grad_norm": 5.198933124542236, + "kl": 0.0, + "learning_rate": 9.11947824660142e-07, + "loss": 0.8019, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.41377514600753784, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.672844409942627, + "speech_entropy": 2.06710147857666, + "speech_kl": 0.0, + "step": 421, + "text_entropy": 1.2483782768249512, + "text_kl": 0.0, + "total_entropy": 1.9093949794769287 + }, + { + "combined_loss": 0.6525322198867798, + "completion_length": 461.3125, + "epoch": 0.1342239185750636, + "grad_norm": 2.1027040481567383, + "kl": 0.0, + "learning_rate": 9.115231222280901e-07, + "loss": 0.6525, + "num_samples": 1.0, + "reward": 3.0625, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.0625, + "rl_loss": 0.0, + "sft_loss": 2.175107479095459, + "speech_entropy": 2.58778715133667, + "speech_kl": 0.0, + "step": 422, + "text_entropy": 1.6190730333328247, + "text_kl": 0.0, + "total_entropy": 2.423537492752075 + }, + { + "combined_loss": 0.6848458051681519, + "completion_length": 399.25, + "epoch": 0.13454198473282442, + "grad_norm": 3.9089579582214355, + "kl": 0.0, + "learning_rate": 9.110975096620087e-07, + "loss": 0.6848, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 1.2500998973846436, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.2828192710876465, + "speech_entropy": 4.557295799255371, + "speech_kl": 0.0, + "step": 423, + "text_entropy": 4.6551923751831055, + "text_kl": 0.0, + "total_entropy": 4.5974225997924805 + }, + { + "combined_loss": 0.6781374216079712, + "completion_length": 464.25, + "epoch": 0.13486005089058525, + "grad_norm": 2.1914126873016357, + "kl": 0.0, + "learning_rate": 9.106709880333768e-07, + "loss": 0.6781, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.9031319618225098, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": -1.862645149230957e-08, + "sft_loss": 2.260457992553711, + "speech_entropy": 2.97335147857666, + "speech_kl": 0.0, + "step": 424, + "text_entropy": 3.013314723968506, + "text_kl": 0.0, + "total_entropy": 3.1752607822418213 + }, + { + "combined_loss": 0.671977698802948, + "completion_length": 338.625, + "epoch": 0.13517811704834606, + "grad_norm": 3.089296579360962, + "kl": 0.0, + "learning_rate": 9.102435584159621e-07, + "loss": 0.672, + "num_samples": 1.0, + "reward": 4.25, + "reward_std": 1.077450156211853, + "rewards/gpt4o_holistic_reward": 4.25, + "rl_loss": 0.0, + "sft_loss": 2.2399253845214844, + "speech_entropy": 3.009519100189209, + "speech_kl": 0.0, + "step": 425, + "text_entropy": 3.133180618286133, + "text_kl": 0.0, + "total_entropy": 3.1568689346313477 + }, + { + "combined_loss": 0.6367964148521423, + "completion_length": 485.375, + "epoch": 0.13549618320610687, + "grad_norm": 2.3259880542755127, + "kl": 0.0, + "learning_rate": 9.098152218858182e-07, + "loss": 0.6368, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 0.36445680260658264, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.122654676437378, + "speech_entropy": 2.219896078109741, + "speech_kl": 0.0, + "step": 426, + "text_entropy": 2.419771671295166, + "text_kl": 0.0, + "total_entropy": 2.319455862045288 + }, + { + "combined_loss": 0.7209469079971313, + "completion_length": 481.4375, + "epoch": 0.13581424936386768, + "grad_norm": 3.2517311573028564, + "kl": 0.0, + "learning_rate": 9.093859795212817e-07, + "loss": 0.7209, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.9894567728042603, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 0.0, + "sft_loss": 2.403156280517578, + "speech_entropy": 2.589256525039673, + "speech_kl": 0.0, + "step": 427, + "text_entropy": 3.033482074737549, + "text_kl": 0.0, + "total_entropy": 2.8865761756896973 + }, + { + "combined_loss": 0.6140339970588684, + "completion_length": 439.125, + "epoch": 0.1361323155216285, + "grad_norm": 2.3876121044158936, + "kl": 0.0, + "learning_rate": 9.089558324029699e-07, + "loss": 0.614, + "num_samples": 1.0, + "reward": 2.75, + "reward_std": 1.2350690364837646, + "rewards/gpt4o_holistic_reward": 2.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.0467801094055176, + "speech_entropy": 2.7281534671783447, + "speech_kl": 0.0, + "step": 428, + "text_entropy": 2.7204747200012207, + "text_kl": 0.0, + "total_entropy": 2.784191846847534 + }, + { + "combined_loss": 0.6906248331069946, + "completion_length": 602.125, + "epoch": 0.1364503816793893, + "grad_norm": 1.9201291799545288, + "kl": 0.0, + "learning_rate": 9.085247816137775e-07, + "loss": 0.6906, + "num_samples": 1.0, + "reward": 2.6875, + "reward_std": 0.45901402831077576, + "rewards/gpt4o_holistic_reward": 2.6875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.3020825386047363, + "speech_entropy": 2.1891660690307617, + "speech_kl": 0.0, + "step": 429, + "text_entropy": 1.7951006889343262, + "text_kl": 0.0, + "total_entropy": 2.125535488128662 + }, + { + "combined_loss": 0.8007920980453491, + "completion_length": 364.9375, + "epoch": 0.13676844783715011, + "grad_norm": 3.1121604442596436, + "kl": 0.0, + "learning_rate": 9.080928282388745e-07, + "loss": 0.8008, + "num_samples": 1.0, + "reward": 3.5, + "reward_std": 0.8024665117263794, + "rewards/gpt4o_holistic_reward": 3.5, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.669306755065918, + "speech_entropy": 3.015037775039673, + "speech_kl": 0.0, + "step": 430, + "text_entropy": 3.2195496559143066, + "text_kl": 0.0, + "total_entropy": 3.1852550506591797 + }, + { + "combined_loss": 0.627656102180481, + "completion_length": 145.8125, + "epoch": 0.13708651399491095, + "grad_norm": 1.940796971321106, + "kl": 0.0, + "learning_rate": 9.076599733657027e-07, + "loss": 0.6277, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 0.0, + "sft_loss": 2.09218692779541, + "speech_entropy": 2.180631637573242, + "speech_kl": 0.0, + "step": 431, + "text_entropy": 0.7923494577407837, + "text_kl": 0.0, + "total_entropy": 1.9149651527404785 + }, + { + "combined_loss": 0.6692330241203308, + "completion_length": 400.875, + "epoch": 0.13740458015267176, + "grad_norm": 2.384009599685669, + "kl": 0.0, + "learning_rate": 9.072262180839741e-07, + "loss": 0.6692, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.230776786804199, + "speech_entropy": 2.6541309356689453, + "speech_kl": 0.0, + "step": 432, + "text_entropy": 2.617983102798462, + "text_kl": 0.0, + "total_entropy": 2.7352967262268066 + }, + { + "combined_loss": 0.619999885559082, + "completion_length": 364.625, + "epoch": 0.13772264631043257, + "grad_norm": 1.7166526317596436, + "kl": 0.0, + "learning_rate": 9.06791563485667e-07, + "loss": 0.62, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.0666658878326416, + "speech_entropy": 2.0609240531921387, + "speech_kl": 0.0, + "step": 433, + "text_entropy": 1.1358726024627686, + "text_kl": 0.0, + "total_entropy": 1.8963937759399414 + }, + { + "combined_loss": 0.6619538068771362, + "completion_length": 455.8125, + "epoch": 0.13804071246819338, + "grad_norm": 2.4818007946014404, + "kl": 0.0, + "learning_rate": 9.063560106650238e-07, + "loss": 0.662, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 1.363730549812317, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -1.862645149230957e-09, + "sft_loss": 2.206512451171875, + "speech_entropy": 3.38385272026062, + "speech_kl": 0.0, + "step": 434, + "text_entropy": 3.1482739448547363, + "text_kl": 0.0, + "total_entropy": 3.4985315799713135 + }, + { + "combined_loss": 0.6583099365234375, + "completion_length": 388.0, + "epoch": 0.1383587786259542, + "grad_norm": 1.6376653909683228, + "kl": 0.0, + "learning_rate": 9.059195607185481e-07, + "loss": 0.6583, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.194366455078125, + "speech_entropy": 2.094740152359009, + "speech_kl": 0.0, + "step": 435, + "text_entropy": 1.0836068391799927, + "text_kl": 0.0, + "total_entropy": 1.9306923151016235 + }, + { + "combined_loss": 0.6782636642456055, + "completion_length": 487.8125, + "epoch": 0.138676844783715, + "grad_norm": 2.002826452255249, + "kl": 0.0, + "learning_rate": 9.054822147450022e-07, + "loss": 0.6783, + "num_samples": 1.0, + "reward": 4.375, + "reward_std": 0.8904882073402405, + "rewards/gpt4o_holistic_reward": 4.375, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.260878562927246, + "speech_entropy": 1.9968137741088867, + "speech_kl": 0.0, + "step": 436, + "text_entropy": 1.1932947635650635, + "text_kl": 0.0, + "total_entropy": 1.8503947257995605 + }, + { + "combined_loss": 0.7755292654037476, + "completion_length": 452.0625, + "epoch": 0.1389949109414758, + "grad_norm": 1.9055498838424683, + "kl": 0.0, + "learning_rate": 9.050439738454042e-07, + "loss": 0.7755, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 1.058112621307373, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.585097551345825, + "speech_entropy": 1.9971908330917358, + "speech_kl": 0.0, + "step": 437, + "text_entropy": 1.1110544204711914, + "text_kl": 0.0, + "total_entropy": 1.8316848278045654 + }, + { + "combined_loss": 0.6731459498405457, + "completion_length": 323.3125, + "epoch": 0.13931297709923665, + "grad_norm": 2.3526909351348877, + "kl": 0.0, + "learning_rate": 9.046048391230247e-07, + "loss": 0.6731, + "num_samples": 1.0, + "reward": 4.6875, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 4.6875, + "rl_loss": 2.2351741790771484e-08, + "sft_loss": 2.2438197135925293, + "speech_entropy": 2.0683212280273438, + "speech_kl": 0.0, + "step": 438, + "text_entropy": 1.1782288551330566, + "text_kl": 0.0, + "total_entropy": 1.9051158428192139 + }, + { + "combined_loss": 0.7724200487136841, + "completion_length": 532.0, + "epoch": 0.13963104325699746, + "grad_norm": 1.7427912950515747, + "kl": 0.0, + "learning_rate": 9.041648116833853e-07, + "loss": 0.7724, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.20422415435314178, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 0.0, + "sft_loss": 2.5747334957122803, + "speech_entropy": 2.0349836349487305, + "speech_kl": 0.0, + "step": 439, + "text_entropy": 1.427042841911316, + "text_kl": 0.0, + "total_entropy": 1.928051233291626 + }, + { + "combined_loss": 0.5910226106643677, + "completion_length": 380.6875, + "epoch": 0.13994910941475827, + "grad_norm": 1.7229056358337402, + "kl": 0.0, + "learning_rate": 9.037238926342543e-07, + "loss": 0.591, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 1.970075249671936, + "speech_entropy": 1.9323487281799316, + "speech_kl": 0.0, + "step": 440, + "text_entropy": 0.8262702226638794, + "text_kl": 0.0, + "total_entropy": 1.7402493953704834 + }, + { + "combined_loss": 0.6713898777961731, + "completion_length": 546.75, + "epoch": 0.14026717557251908, + "grad_norm": 1.5187904834747314, + "kl": 0.0, + "learning_rate": 9.032820830856449e-07, + "loss": 0.6714, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.2379660606384277, + "speech_entropy": 2.4945101737976074, + "speech_kl": 0.0, + "step": 441, + "text_entropy": 2.5544350147247314, + "text_kl": 0.0, + "total_entropy": 2.6159682273864746 + }, + { + "combined_loss": 0.6225212812423706, + "completion_length": 336.75, + "epoch": 0.1405852417302799, + "grad_norm": 1.7801567316055298, + "kl": 0.0, + "learning_rate": 9.028393841498121e-07, + "loss": 0.6225, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.075070858001709, + "speech_entropy": 2.058767080307007, + "speech_kl": 0.0, + "step": 442, + "text_entropy": 0.9935916662216187, + "text_kl": 0.0, + "total_entropy": 1.8773467540740967 + }, + { + "combined_loss": 0.6718326210975647, + "completion_length": 393.75, + "epoch": 0.1409033078880407, + "grad_norm": 1.7748103141784668, + "kl": 0.0, + "learning_rate": 9.023957969412499e-07, + "loss": 0.6718, + "num_samples": 1.0, + "reward": 2.375, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 2.375, + "rl_loss": 0.0, + "sft_loss": 2.2394418716430664, + "speech_entropy": 2.0696630477905273, + "speech_kl": 0.0, + "step": 443, + "text_entropy": 1.2485952377319336, + "text_kl": 0.0, + "total_entropy": 1.9180147647857666 + }, + { + "combined_loss": 0.6381258964538574, + "completion_length": 492.1875, + "epoch": 0.14122137404580154, + "grad_norm": 3.0544703006744385, + "kl": 0.0, + "learning_rate": 9.019513225766888e-07, + "loss": 0.6381, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 0.8483423590660095, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 0.0, + "sft_loss": 2.1270861625671387, + "speech_entropy": 2.6119399070739746, + "speech_kl": 0.0, + "step": 444, + "text_entropy": 1.2080113887786865, + "text_kl": 0.0, + "total_entropy": 2.211930990219116 + }, + { + "combined_loss": 0.6671527624130249, + "completion_length": 339.375, + "epoch": 0.14153944020356235, + "grad_norm": 3.0724539756774902, + "kl": 0.0, + "learning_rate": 9.01505962175092e-07, + "loss": 0.6672, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 1.125100016593933, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.2238426208496094, + "speech_entropy": 2.0776891708374023, + "speech_kl": 0.0, + "step": 445, + "text_entropy": 1.0866782665252686, + "text_kl": 0.0, + "total_entropy": 1.9007325172424316 + }, + { + "combined_loss": 0.6805820465087891, + "completion_length": 565.625, + "epoch": 0.14185750636132316, + "grad_norm": 2.9971096515655518, + "kl": 0.0, + "learning_rate": 9.010597168576542e-07, + "loss": 0.6806, + "num_samples": 1.0, + "reward": 3.125, + "reward_std": 0.14443756639957428, + "rewards/gpt4o_holistic_reward": 3.125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.268606662750244, + "speech_entropy": 3.4692184925079346, + "speech_kl": 0.0, + "step": 446, + "text_entropy": 4.159717082977295, + "text_kl": 0.0, + "total_entropy": 3.8382232189178467 + }, + { + "combined_loss": 0.6768923401832581, + "completion_length": 378.5, + "epoch": 0.14217557251908397, + "grad_norm": 1.5186785459518433, + "kl": 0.0, + "learning_rate": 9.006125877477975e-07, + "loss": 0.6769, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.256307601928711, + "speech_entropy": 2.0326013565063477, + "speech_kl": 0.0, + "step": 447, + "text_entropy": 1.1156654357910156, + "text_kl": 0.0, + "total_entropy": 1.8572213649749756 + }, + { + "combined_loss": 0.6424182653427124, + "completion_length": 378.5625, + "epoch": 0.14249363867684478, + "grad_norm": 1.8854279518127441, + "kl": 0.0, + "learning_rate": 9.001645759711687e-07, + "loss": 0.6424, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.6008730530738831, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.1413941383361816, + "speech_entropy": 2.0020644664764404, + "speech_kl": 0.0, + "step": 448, + "text_entropy": 0.8331085443496704, + "text_kl": 0.0, + "total_entropy": 1.8029673099517822 + }, + { + "combined_loss": 0.6341080665588379, + "completion_length": 501.1875, + "epoch": 0.1428117048346056, + "grad_norm": 1.8343334197998047, + "kl": 0.0, + "learning_rate": 8.997156826556369e-07, + "loss": 0.6341, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.8600690364837646, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": -1.862645149230957e-09, + "sft_loss": 2.1136932373046875, + "speech_entropy": 2.640713691711426, + "speech_kl": 0.0, + "step": 449, + "text_entropy": 2.6889724731445312, + "text_kl": 0.0, + "total_entropy": 2.807018280029297 + }, + { + "combined_loss": 0.6706026792526245, + "completion_length": 411.875, + "epoch": 0.1431297709923664, + "grad_norm": 2.0064008235931396, + "kl": 0.0, + "learning_rate": 8.992659089312905e-07, + "loss": 0.6706, + "num_samples": 1.0, + "reward": 4.5625, + "reward_std": 0.6251000165939331, + "rewards/gpt4o_holistic_reward": 4.5625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.235342025756836, + "speech_entropy": 2.0443549156188965, + "speech_kl": 0.0, + "step": 450, + "text_entropy": 1.1791572570800781, + "text_kl": 0.0, + "total_entropy": 1.8973615169525146 + }, + { + "combined_loss": 0.7649298906326294, + "completion_length": 356.4375, + "epoch": 0.14344783715012724, + "grad_norm": 2.9334285259246826, + "kl": 0.0, + "learning_rate": 8.988152559304345e-07, + "loss": 0.7649, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.4256925582885742, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 0.0, + "sft_loss": 2.5497660636901855, + "speech_entropy": 2.2932775020599365, + "speech_kl": 0.0, + "step": 451, + "text_entropy": 1.6037144660949707, + "text_kl": 0.0, + "total_entropy": 2.1606814861297607 + }, + { + "combined_loss": 0.8326526880264282, + "completion_length": 575.9375, + "epoch": 0.14376590330788805, + "grad_norm": 2.880450963973999, + "kl": 0.0, + "learning_rate": 8.983637247875872e-07, + "loss": 0.8327, + "num_samples": 1.0, + "reward": 4.125, + "reward_std": 1.1683900356292725, + "rewards/gpt4o_holistic_reward": 4.125, + "rl_loss": 5.587935447692871e-09, + "sft_loss": 2.7755088806152344, + "speech_entropy": 2.5213093757629395, + "speech_kl": 0.0, + "step": 452, + "text_entropy": 1.762268304824829, + "text_kl": 0.0, + "total_entropy": 2.381089448928833 + }, + { + "combined_loss": 0.7429494261741638, + "completion_length": 398.0, + "epoch": 0.14408396946564886, + "grad_norm": 2.2405805587768555, + "kl": 0.0, + "learning_rate": 8.979113166394775e-07, + "loss": 0.7429, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5000999569892883, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.4764981269836426, + "speech_entropy": 3.2521719932556152, + "speech_kl": 0.0, + "step": 453, + "text_entropy": 3.165997266769409, + "text_kl": 0.0, + "total_entropy": 3.449254035949707 + }, + { + "combined_loss": 0.5944318175315857, + "completion_length": 424.1875, + "epoch": 0.14440203562340967, + "grad_norm": 1.6569174528121948, + "kl": 0.0, + "learning_rate": 8.974580326250424e-07, + "loss": 0.5944, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 1.9814393520355225, + "speech_entropy": 1.9895391464233398, + "speech_kl": 0.0, + "step": 454, + "text_entropy": 0.9808429479598999, + "text_kl": 0.0, + "total_entropy": 1.808826208114624 + }, + { + "combined_loss": 0.6611145734786987, + "completion_length": 466.625, + "epoch": 0.14472010178117048, + "grad_norm": 1.9934300184249878, + "kl": 0.0, + "learning_rate": 8.970038738854244e-07, + "loss": 0.6611, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.5194375514984131, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 0.0, + "sft_loss": 2.2037153244018555, + "speech_entropy": 2.034078359603882, + "speech_kl": 0.0, + "step": 455, + "text_entropy": 1.189087986946106, + "text_kl": 0.0, + "total_entropy": 1.8778494596481323 + }, + { + "combined_loss": 0.6767491698265076, + "completion_length": 517.75, + "epoch": 0.1450381679389313, + "grad_norm": 1.939832091331482, + "kl": 0.0, + "learning_rate": 8.965488415639671e-07, + "loss": 0.6767, + "num_samples": 1.0, + "reward": 2.875, + "reward_std": 0.7217878103256226, + "rewards/gpt4o_holistic_reward": 2.875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.2558302879333496, + "speech_entropy": 2.4079430103302, + "speech_kl": 0.0, + "step": 456, + "text_entropy": 2.5751752853393555, + "text_kl": 0.0, + "total_entropy": 2.6069483757019043 + }, + { + "combined_loss": 0.6979454159736633, + "completion_length": 484.125, + "epoch": 0.1453562340966921, + "grad_norm": 1.6610511541366577, + "kl": 0.0, + "learning_rate": 8.960929368062138e-07, + "loss": 0.6979, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 0.5774502754211426, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 0.0, + "sft_loss": 2.3264846801757812, + "speech_entropy": 2.0884976387023926, + "speech_kl": 0.0, + "step": 457, + "text_entropy": 1.078056812286377, + "text_kl": 0.0, + "total_entropy": 1.8959829807281494 + }, + { + "combined_loss": 0.6162172555923462, + "completion_length": 581.4375, + "epoch": 0.14567430025445294, + "grad_norm": 1.4069401025772095, + "kl": 0.0, + "learning_rate": 8.956361607599043e-07, + "loss": 0.6162, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.4331127107143402, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 0.0, + "sft_loss": 2.0540573596954346, + "speech_entropy": 1.9977447986602783, + "speech_kl": 0.0, + "step": 458, + "text_entropy": 0.7805925607681274, + "text_kl": 0.0, + "total_entropy": 1.7389767169952393 + }, + { + "combined_loss": 0.677904486656189, + "completion_length": 249.4375, + "epoch": 0.14599236641221375, + "grad_norm": 2.1168832778930664, + "kl": 0.0, + "learning_rate": 8.951785145749719e-07, + "loss": 0.6779, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.7090140581130981, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.259681463241577, + "speech_entropy": 2.0867042541503906, + "speech_kl": 0.0, + "step": 459, + "text_entropy": 0.5684058666229248, + "text_kl": 0.0, + "total_entropy": 1.8018689155578613 + }, + { + "combined_loss": 0.6343377828598022, + "completion_length": 347.4375, + "epoch": 0.14631043256997456, + "grad_norm": 2.351423501968384, + "kl": 0.0, + "learning_rate": 8.9471999940354e-07, + "loss": 0.6343, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 1.1404881477355957, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 0.0, + "sft_loss": 2.1144590377807617, + "speech_entropy": 2.272289752960205, + "speech_kl": 0.0, + "step": 460, + "text_entropy": 1.7216747999191284, + "text_kl": 0.0, + "total_entropy": 2.1894326210021973 + }, + { + "combined_loss": 0.7236204147338867, + "completion_length": 413.1875, + "epoch": 0.14662849872773537, + "grad_norm": 2.023482322692871, + "kl": 0.0, + "learning_rate": 8.942606163999204e-07, + "loss": 0.7236, + "num_samples": 1.0, + "reward": 3.6875, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 3.6875, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.4120678901672363, + "speech_entropy": 2.0738351345062256, + "speech_kl": 0.0, + "step": 461, + "text_entropy": 1.424910306930542, + "text_kl": 0.0, + "total_entropy": 1.9624372720718384 + }, + { + "combined_loss": 0.6593168377876282, + "completion_length": 440.625, + "epoch": 0.14694656488549618, + "grad_norm": 2.023776054382324, + "kl": 0.0, + "learning_rate": 8.93800366720609e-07, + "loss": 0.6593, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.651972770690918, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.197722911834717, + "speech_entropy": 2.1023144721984863, + "speech_kl": 0.0, + "step": 462, + "text_entropy": 1.0320696830749512, + "text_kl": 0.0, + "total_entropy": 1.9018843173980713 + }, + { + "combined_loss": 0.6318823099136353, + "completion_length": 373.1875, + "epoch": 0.147264631043257, + "grad_norm": 2.802096128463745, + "kl": 0.0, + "learning_rate": 8.933392515242838e-07, + "loss": 0.6319, + "num_samples": 1.0, + "reward": 3.375, + "reward_std": 0.8731511235237122, + "rewards/gpt4o_holistic_reward": 3.375, + "rl_loss": 0.0, + "sft_loss": 2.106274366378784, + "speech_entropy": 3.7126779556274414, + "speech_kl": 0.0, + "step": 463, + "text_entropy": 3.271230697631836, + "text_kl": 0.0, + "total_entropy": 3.7780895233154297 + }, + { + "combined_loss": 0.6602550745010376, + "completion_length": 463.25, + "epoch": 0.1475826972010178, + "grad_norm": 2.1378464698791504, + "kl": 0.0, + "learning_rate": 8.928772719718018e-07, + "loss": 0.6603, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.6477051377296448, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.200850248336792, + "speech_entropy": 1.9239245653152466, + "speech_kl": 0.0, + "step": 464, + "text_entropy": 1.3505656719207764, + "text_kl": 0.0, + "total_entropy": 1.8138980865478516 + }, + { + "combined_loss": 0.6675082445144653, + "completion_length": 347.75, + "epoch": 0.14790076335877864, + "grad_norm": 2.2810842990875244, + "kl": 0.0, + "learning_rate": 8.924144292261962e-07, + "loss": 0.6675, + "num_samples": 1.0, + "reward": 3.1875, + "reward_std": 0.8750999569892883, + "rewards/gpt4o_holistic_reward": 3.1875, + "rl_loss": -2.2351741790771484e-08, + "sft_loss": 2.225027561187744, + "speech_entropy": 2.1317272186279297, + "speech_kl": 0.0, + "step": 465, + "text_entropy": 1.1749463081359863, + "text_kl": 0.0, + "total_entropy": 1.9694660902023315 + }, + { + "combined_loss": 0.7008163928985596, + "completion_length": 294.5625, + "epoch": 0.14821882951653945, + "grad_norm": 2.6175572872161865, + "kl": 0.0, + "learning_rate": 8.919507244526726e-07, + "loss": 0.7008, + "num_samples": 1.0, + "reward": 4.875, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 4.875, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.336054563522339, + "speech_entropy": 2.284971237182617, + "speech_kl": 0.0, + "step": 466, + "text_entropy": 1.1186554431915283, + "text_kl": 0.0, + "total_entropy": 2.061189889907837 + }, + { + "combined_loss": 0.7040484547615051, + "completion_length": 343.25, + "epoch": 0.14853689567430026, + "grad_norm": 2.2278120517730713, + "kl": 0.0, + "learning_rate": 8.914861588186076e-07, + "loss": 0.704, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.6038135886192322, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 0.0, + "sft_loss": 2.346827983856201, + "speech_entropy": 2.906318426132202, + "speech_kl": 0.0, + "step": 467, + "text_entropy": 3.0286753177642822, + "text_kl": 0.0, + "total_entropy": 3.109015464782715 + }, + { + "combined_loss": 0.6465004682540894, + "completion_length": 415.1875, + "epoch": 0.14885496183206107, + "grad_norm": 2.1413097381591797, + "kl": 0.0, + "learning_rate": 8.910207334935446e-07, + "loss": 0.6465, + "num_samples": 1.0, + "reward": 4.0, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 4.0, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.155001640319824, + "speech_entropy": 2.1206655502319336, + "speech_kl": 0.0, + "step": 468, + "text_entropy": 1.0818634033203125, + "text_kl": 0.0, + "total_entropy": 1.9281299114227295 + }, + { + "combined_loss": 0.7584635615348816, + "completion_length": 545.25, + "epoch": 0.14917302798982188, + "grad_norm": 1.5538190603256226, + "kl": 0.0, + "learning_rate": 8.90554449649191e-07, + "loss": 0.7585, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.528211832046509, + "speech_entropy": 2.115117073059082, + "speech_kl": 0.0, + "step": 469, + "text_entropy": 1.4869662523269653, + "text_kl": 0.0, + "total_entropy": 2.0077667236328125 + }, + { + "combined_loss": 0.7136243581771851, + "completion_length": 401.3125, + "epoch": 0.1494910941475827, + "grad_norm": 1.8857200145721436, + "kl": 0.0, + "learning_rate": 8.900873084594161e-07, + "loss": 0.7136, + "num_samples": 1.0, + "reward": 4.8125, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 4.8125, + "rl_loss": -3.725290298461914e-09, + "sft_loss": 2.3787477016448975, + "speech_entropy": 2.175069808959961, + "speech_kl": 0.0, + "step": 470, + "text_entropy": 1.4863579273223877, + "text_kl": 0.0, + "total_entropy": 2.0467123985290527 + }, + { + "combined_loss": 0.7089301347732544, + "completion_length": 378.0625, + "epoch": 0.14980916030534353, + "grad_norm": 2.2557709217071533, + "kl": 0.0, + "learning_rate": 8.896193111002475e-07, + "loss": 0.7089, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.739456832408905, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": 0.0, + "sft_loss": 2.363100528717041, + "speech_entropy": 2.1490824222564697, + "speech_kl": 0.0, + "step": 471, + "text_entropy": 0.9428769946098328, + "text_kl": 0.0, + "total_entropy": 1.9167104959487915 + }, + { + "combined_loss": 0.6535341143608093, + "completion_length": 417.375, + "epoch": 0.15012722646310434, + "grad_norm": 1.727138876914978, + "kl": 0.0, + "learning_rate": 8.891504587498674e-07, + "loss": 0.6535, + "num_samples": 1.0, + "reward": 3.5625, + "reward_std": 0.2694375813007355, + "rewards/gpt4o_holistic_reward": 3.5625, + "rl_loss": 0.0, + "sft_loss": 2.1784467697143555, + "speech_entropy": 2.090775489807129, + "speech_kl": 0.0, + "step": 472, + "text_entropy": 1.3160064220428467, + "text_kl": 0.0, + "total_entropy": 1.9528473615646362 + }, + { + "combined_loss": 0.7152138948440552, + "completion_length": 432.625, + "epoch": 0.15044529262086515, + "grad_norm": 1.9705350399017334, + "kl": 0.0, + "learning_rate": 8.886807525886113e-07, + "loss": 0.7152, + "num_samples": 1.0, + "reward": 4.75, + "reward_std": 0.5001000165939331, + "rewards/gpt4o_holistic_reward": 4.75, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.3840465545654297, + "speech_entropy": 2.1602611541748047, + "speech_kl": 0.0, + "step": 473, + "text_entropy": 1.172673225402832, + "text_kl": 0.0, + "total_entropy": 1.9916683435440063 + }, + { + "combined_loss": 0.6396362781524658, + "completion_length": 311.8125, + "epoch": 0.15076335877862596, + "grad_norm": 1.9259790182113647, + "kl": 0.0, + "learning_rate": 8.882101937989642e-07, + "loss": 0.6396, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.5194376111030579, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.1321208477020264, + "speech_entropy": 2.147437334060669, + "speech_kl": 0.0, + "step": 474, + "text_entropy": 0.9479535818099976, + "text_kl": 0.0, + "total_entropy": 1.929750919342041 + }, + { + "combined_loss": 0.6557704210281372, + "completion_length": 485.6875, + "epoch": 0.15108142493638677, + "grad_norm": 2.2060537338256836, + "kl": 0.0, + "learning_rate": 8.87738783565557e-07, + "loss": 0.6558, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.933112621307373, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 0.0, + "sft_loss": 2.185901165008545, + "speech_entropy": 2.608290672302246, + "speech_kl": 0.0, + "step": 475, + "text_entropy": 2.9066762924194336, + "text_kl": 0.0, + "total_entropy": 2.873114585876465 + }, + { + "combined_loss": 0.6064234972000122, + "completion_length": 427.6875, + "epoch": 0.15139949109414758, + "grad_norm": 1.4221584796905518, + "kl": 0.0, + "learning_rate": 8.872665230751643e-07, + "loss": 0.6064, + "num_samples": 1.0, + "reward": 4.8125, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 4.8125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.021411657333374, + "speech_entropy": 1.9495731592178345, + "speech_kl": 0.0, + "step": 476, + "text_entropy": 0.6150962114334106, + "text_kl": 0.0, + "total_entropy": 1.6934847831726074 + }, + { + "combined_loss": 0.6592838168144226, + "completion_length": 535.5, + "epoch": 0.15171755725190839, + "grad_norm": 1.9688835144042969, + "kl": 0.0, + "learning_rate": 8.867934135167016e-07, + "loss": 0.6593, + "num_samples": 1.0, + "reward": 2.5, + "reward_std": 0.8944375514984131, + "rewards/gpt4o_holistic_reward": 2.5, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.197612762451172, + "speech_entropy": 2.0396499633789062, + "speech_kl": 0.0, + "step": 477, + "text_entropy": 1.2544045448303223, + "text_kl": 0.0, + "total_entropy": 1.8939778804779053 + }, + { + "combined_loss": 0.7103521823883057, + "completion_length": 476.3125, + "epoch": 0.15203562340966922, + "grad_norm": 1.6043519973754883, + "kl": 0.0, + "learning_rate": 8.863194560812214e-07, + "loss": 0.7104, + "num_samples": 1.0, + "reward": 5.0, + "reward_std": 9.999999747378752e-05, + "rewards/gpt4o_holistic_reward": 5.0, + "rl_loss": 0.0, + "sft_loss": 2.367840528488159, + "speech_entropy": 2.1345765590667725, + "speech_kl": 0.0, + "step": 478, + "text_entropy": 1.4203035831451416, + "text_kl": 0.0, + "total_entropy": 2.0092082023620605 + }, + { + "combined_loss": 0.6551531553268433, + "completion_length": 424.875, + "epoch": 0.15235368956743003, + "grad_norm": 1.9476300477981567, + "kl": 0.0, + "learning_rate": 8.858446519619112e-07, + "loss": 0.6552, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.9063550233840942, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.1838438510894775, + "speech_entropy": 2.67842435836792, + "speech_kl": 0.0, + "step": 479, + "text_entropy": 2.9070305824279785, + "text_kl": 0.0, + "total_entropy": 2.8740692138671875 + }, + { + "combined_loss": 0.6260417699813843, + "completion_length": 583.6875, + "epoch": 0.15267175572519084, + "grad_norm": 1.565351128578186, + "kl": 0.0, + "learning_rate": 8.853690023540895e-07, + "loss": 0.626, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -1.4901161193847656e-08, + "sft_loss": 2.086805820465088, + "speech_entropy": 1.9153913259506226, + "speech_kl": 0.0, + "step": 480, + "text_entropy": 0.715573251247406, + "text_kl": 0.0, + "total_entropy": 1.687865972518921 + }, + { + "combined_loss": 0.6601178050041199, + "completion_length": 385.25, + "epoch": 0.15298982188295165, + "grad_norm": 2.2061314582824707, + "kl": 0.0, + "learning_rate": 8.84892508455204e-07, + "loss": 0.6601, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.1313834190368652, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.200392484664917, + "speech_entropy": 2.007784366607666, + "speech_kl": 0.0, + "step": 481, + "text_entropy": 1.371636986732483, + "text_kl": 0.0, + "total_entropy": 1.8983948230743408 + }, + { + "combined_loss": 0.6691581010818481, + "completion_length": 463.3125, + "epoch": 0.15330788804071246, + "grad_norm": 1.7773209810256958, + "kl": 0.0, + "learning_rate": 8.844151714648274e-07, + "loss": 0.6692, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 0.853813648223877, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.230526924133301, + "speech_entropy": 2.0789544582366943, + "speech_kl": 0.0, + "step": 482, + "text_entropy": 1.1371136903762817, + "text_kl": 0.0, + "total_entropy": 1.8991715908050537 + }, + { + "combined_loss": 0.6369400024414062, + "completion_length": 410.6875, + "epoch": 0.15362595419847327, + "grad_norm": 1.8571422100067139, + "kl": 0.0, + "learning_rate": 8.839369925846548e-07, + "loss": 0.6369, + "num_samples": 1.0, + "reward": 2.8125, + "reward_std": 0.48945680260658264, + "rewards/gpt4o_holistic_reward": 2.8125, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1231331825256348, + "speech_entropy": 2.2014362812042236, + "speech_kl": 0.0, + "step": 483, + "text_entropy": 1.6307443380355835, + "text_kl": 0.0, + "total_entropy": 2.1009421348571777 + }, + { + "combined_loss": 0.7150350213050842, + "completion_length": 738.4375, + "epoch": 0.15394402035623408, + "grad_norm": 1.8518329858779907, + "kl": 0.0, + "learning_rate": 8.834579730185012e-07, + "loss": 0.715, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.6935809850692749, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.3834500312805176, + "speech_entropy": 2.07857608795166, + "speech_kl": 0.0, + "step": 484, + "text_entropy": 1.5495516061782837, + "text_kl": 0.0, + "total_entropy": 1.9772846698760986 + }, + { + "combined_loss": 0.7554680109024048, + "completion_length": 434.4375, + "epoch": 0.15426208651399492, + "grad_norm": 1.8048903942108154, + "kl": 0.0, + "learning_rate": 8.829781139722978e-07, + "loss": 0.7555, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.5182266235351562, + "speech_entropy": 2.2054073810577393, + "speech_kl": 0.0, + "step": 485, + "text_entropy": 1.5983672142028809, + "text_kl": 0.0, + "total_entropy": 2.0975542068481445 + }, + { + "combined_loss": 0.7759544253349304, + "completion_length": 575.25, + "epoch": 0.15458015267175573, + "grad_norm": 1.7967489957809448, + "kl": 0.0, + "learning_rate": 8.824974166540889e-07, + "loss": 0.776, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.3944375813007355, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": 1.1175870895385742e-08, + "sft_loss": 2.586514711380005, + "speech_entropy": 2.0900139808654785, + "speech_kl": 0.0, + "step": 486, + "text_entropy": 1.4879392385482788, + "text_kl": 0.0, + "total_entropy": 1.9754605293273926 + }, + { + "combined_loss": 0.7049446105957031, + "completion_length": 516.3125, + "epoch": 0.15489821882951654, + "grad_norm": 1.6339975595474243, + "kl": 0.0, + "learning_rate": 8.820158822740297e-07, + "loss": 0.7049, + "num_samples": 1.0, + "reward": 3.25, + "reward_std": 0.2501000165939331, + "rewards/gpt4o_holistic_reward": 3.25, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.3498153686523438, + "speech_entropy": 2.0781824588775635, + "speech_kl": 0.0, + "step": 487, + "text_entropy": 1.1251006126403809, + "text_kl": 0.0, + "total_entropy": 1.9013476371765137 + }, + { + "combined_loss": 0.7730693817138672, + "completion_length": 473.25, + "epoch": 0.15521628498727735, + "grad_norm": 1.7758270502090454, + "kl": 0.0, + "learning_rate": 8.81533512044382e-07, + "loss": 0.7731, + "num_samples": 1.0, + "reward": 3.875, + "reward_std": 1.0387752056121826, + "rewards/gpt4o_holistic_reward": 3.875, + "rl_loss": 0.0, + "sft_loss": 2.5768978595733643, + "speech_entropy": 2.2842307090759277, + "speech_kl": 0.0, + "step": 488, + "text_entropy": 1.3962032794952393, + "text_kl": 0.0, + "total_entropy": 2.119879961013794 + }, + { + "combined_loss": 0.6924104690551758, + "completion_length": 437.25, + "epoch": 0.15553435114503816, + "grad_norm": 2.08705735206604, + "kl": 0.0, + "learning_rate": 8.810503071795131e-07, + "loss": 0.6924, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.9478486180305481, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 0.0, + "sft_loss": 2.308034896850586, + "speech_entropy": 2.0420119762420654, + "speech_kl": 0.0, + "step": 489, + "text_entropy": 1.393028736114502, + "text_kl": 0.0, + "total_entropy": 1.928663730621338 + }, + { + "combined_loss": 0.7534229159355164, + "completion_length": 616.9375, + "epoch": 0.15585241730279897, + "grad_norm": 1.6156688928604126, + "kl": 0.0, + "learning_rate": 8.805662688958898e-07, + "loss": 0.7534, + "num_samples": 1.0, + "reward": 4.3125, + "reward_std": 0.6978486180305481, + "rewards/gpt4o_holistic_reward": 4.3125, + "rl_loss": -9.313225746154785e-09, + "sft_loss": 2.5114095211029053, + "speech_entropy": 2.1210994720458984, + "speech_kl": 0.0, + "step": 490, + "text_entropy": 1.2488669157028198, + "text_kl": 0.0, + "total_entropy": 1.964477300643921 + }, + { + "combined_loss": 0.648138165473938, + "completion_length": 361.8125, + "epoch": 0.15617048346055978, + "grad_norm": 1.8364536762237549, + "kl": 0.0, + "learning_rate": 8.800813984120786e-07, + "loss": 0.6481, + "num_samples": 1.0, + "reward": 4.0625, + "reward_std": 0.3751000165939331, + "rewards/gpt4o_holistic_reward": 4.0625, + "rl_loss": 3.725290298461914e-09, + "sft_loss": 2.1604604721069336, + "speech_entropy": 2.1113858222961426, + "speech_kl": 0.0, + "step": 491, + "text_entropy": 1.0654577016830444, + "text_kl": 0.0, + "total_entropy": 1.9055767059326172 + }, + { + "combined_loss": 0.7758172750473022, + "completion_length": 603.0625, + "epoch": 0.15648854961832062, + "grad_norm": 1.6238081455230713, + "kl": 0.0, + "learning_rate": 8.795956969487398e-07, + "loss": 0.7758, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.8081127405166626, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 1.862645149230957e-08, + "sft_loss": 2.586057662963867, + "speech_entropy": 2.1472387313842773, + "speech_kl": 0.0, + "step": 492, + "text_entropy": 1.605583667755127, + "text_kl": 0.0, + "total_entropy": 2.047147035598755 + }, + { + "combined_loss": 0.6960165500640869, + "completion_length": 649.0625, + "epoch": 0.15680661577608143, + "grad_norm": 6.360669136047363, + "kl": 0.0, + "learning_rate": 8.791091657286267e-07, + "loss": 0.696, + "num_samples": 1.0, + "reward": 3.3125, + "reward_std": 1.0713938474655151, + "rewards/gpt4o_holistic_reward": 3.3125, + "rl_loss": 1.862645149230957e-09, + "sft_loss": 2.3200550079345703, + "speech_entropy": 3.068434715270996, + "speech_kl": 0.0, + "step": 493, + "text_entropy": 2.149477005004883, + "text_kl": 0.0, + "total_entropy": 2.953958034515381 + }, + { + "combined_loss": 0.5907741785049438, + "completion_length": 397.0625, + "epoch": 0.15712468193384224, + "grad_norm": 1.611266016960144, + "kl": 0.0, + "learning_rate": 8.786218059765809e-07, + "loss": 0.5908, + "num_samples": 1.0, + "reward": 3.9375, + "reward_std": 0.6250999569892883, + "rewards/gpt4o_holistic_reward": 3.9375, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 1.9692471027374268, + "speech_entropy": 1.9849560260772705, + "speech_kl": 0.0, + "step": 494, + "text_entropy": 0.6464660167694092, + "text_kl": 0.0, + "total_entropy": 1.7163063287734985 + }, + { + "combined_loss": 0.6750938892364502, + "completion_length": 525.0, + "epoch": 0.15744274809160305, + "grad_norm": 2.001243829727173, + "kl": 0.0, + "learning_rate": 8.781336189195296e-07, + "loss": 0.6751, + "num_samples": 1.0, + "reward": 3.75, + "reward_std": 1.0048449039459229, + "rewards/gpt4o_holistic_reward": 3.75, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.2503128051757812, + "speech_entropy": 2.9428701400756836, + "speech_kl": 0.0, + "step": 495, + "text_entropy": 3.2061309814453125, + "text_kl": 0.0, + "total_entropy": 3.2631325721740723 + }, + { + "combined_loss": 0.691491961479187, + "completion_length": 531.9375, + "epoch": 0.15776081424936386, + "grad_norm": 1.7672028541564941, + "kl": 0.0, + "learning_rate": 8.776446057864838e-07, + "loss": 0.6915, + "num_samples": 1.0, + "reward": 3.625, + "reward_std": 0.8274502754211426, + "rewards/gpt4o_holistic_reward": 3.625, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.3049731254577637, + "speech_entropy": 2.1160459518432617, + "speech_kl": 0.0, + "step": 496, + "text_entropy": 1.1704938411712646, + "text_kl": 0.0, + "total_entropy": 1.9302992820739746 + }, + { + "combined_loss": 0.6495949625968933, + "completion_length": 476.375, + "epoch": 0.15807888040712467, + "grad_norm": 2.018303632736206, + "kl": 0.0, + "learning_rate": 8.77154767808533e-07, + "loss": 0.6496, + "num_samples": 1.0, + "reward": 3.8125, + "reward_std": 0.6637751460075378, + "rewards/gpt4o_holistic_reward": 3.8125, + "rl_loss": -1.1175870895385742e-08, + "sft_loss": 2.165316581726074, + "speech_entropy": 2.1160874366760254, + "speech_kl": 0.0, + "step": 497, + "text_entropy": 1.245253086090088, + "text_kl": 0.0, + "total_entropy": 1.9460238218307495 + }, + { + "combined_loss": 0.6617956161499023, + "completion_length": 528.8125, + "epoch": 0.15839694656488548, + "grad_norm": 1.7395589351654053, + "kl": 0.0, + "learning_rate": 8.766641062188442e-07, + "loss": 0.6618, + "num_samples": 1.0, + "reward": 4.1875, + "reward_std": 0.1251000016927719, + "rewards/gpt4o_holistic_reward": 4.1875, + "rl_loss": 1.4901161193847656e-08, + "sft_loss": 2.2059853076934814, + "speech_entropy": 2.0821304321289062, + "speech_kl": 0.0, + "step": 498, + "text_entropy": 1.037213921546936, + "text_kl": 0.0, + "total_entropy": 1.8841687440872192 + }, + { + "combined_loss": 0.6126347184181213, + "completion_length": 466.5, + "epoch": 0.15871501272264632, + "grad_norm": 1.8486003875732422, + "kl": 0.0, + "learning_rate": 8.761726222526569e-07, + "loss": 0.6126, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 1.2024502754211426, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": 7.450580596923828e-09, + "sft_loss": 2.0421156883239746, + "speech_entropy": 2.067373752593994, + "speech_kl": 0.0, + "step": 499, + "text_entropy": 0.8997583389282227, + "text_kl": 0.0, + "total_entropy": 1.8452403545379639 + }, + { + "combined_loss": 0.7227458953857422, + "completion_length": 520.3125, + "epoch": 0.15903307888040713, + "grad_norm": 1.7850871086120605, + "kl": 0.0, + "learning_rate": 8.756803171472816e-07, + "loss": 0.7227, + "num_samples": 1.0, + "reward": 3.4375, + "reward_std": 0.48945680260658264, + "rewards/gpt4o_holistic_reward": 3.4375, + "rl_loss": -7.450580596923828e-09, + "sft_loss": 2.4091529846191406, + "speech_entropy": 2.1188344955444336, + "speech_kl": 0.0, + "step": 500, + "text_entropy": 1.2516241073608398, + "text_kl": 0.0, + "total_entropy": 1.9562331438064575 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}