{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3629489603024574, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 44.075, "epoch": 0.0023629489603024575, "format_reward": -1.75, "grad_norm": 0.179437518119812, "image_reward": 0.292385521862242, "kl": 0.0005639283277560026, "learning_rate": 5e-06, "loss": -0.0818, "reward": -1.718647839128971, "reward_std": 2.0869705460965635, "rewards/reward_func": -1.718647839128971, "step": 10, "toxic_reward": 3.753792663415273 }, { "clip_ratio": 0.0, "completion_length": 42.35, "epoch": 0.004725897920604915, "format_reward": -1.75, "grad_norm": 0.40540918707847595, "image_reward": 0.28610331267118455, "kl": 0.0006540146190673113, "learning_rate": 5e-06, "loss": 0.0547, "reward": -0.9438592553138733, "reward_std": 3.9592867106199265, "rewards/reward_func": -0.9438592553138733, "step": 20, "toxic_reward": 3.622282150387764 }, { "clip_ratio": 0.0, "completion_length": 41.35, "epoch": 0.007088846880907372, "format_reward": -2.5, "grad_norm": 0.3070058524608612, "image_reward": 0.29500325620174406, "kl": 0.0006122831255197525, "learning_rate": 5e-06, "loss": 0.0196, "reward": -2.2396623373031614, "reward_std": 4.928562045097351, "rewards/reward_func": -2.2396623373031614, "step": 30, "toxic_reward": 3.3049886375665665 }, { "clip_ratio": 0.0, "completion_length": 53.475, "epoch": 0.00945179584120983, "format_reward": -2.25, "grad_norm": 0.30812421441078186, "image_reward": 0.2784423798322678, "kl": 0.0007215021323645487, "learning_rate": 5e-06, "loss": -0.0374, "reward": -1.9919262409210206, "reward_std": 3.2468371063470842, "rewards/reward_func": -1.9919262409210206, "step": 40, "toxic_reward": 4.284190082550049 }, { "clip_ratio": 0.0, "completion_length": 42.0, "epoch": 0.011814744801512287, "format_reward": -2.25, "grad_norm": 0.7593560814857483, "image_reward": 0.3014272041618824, "kl": 0.0006584389615454711, "learning_rate": 5e-06, "loss": 0.2018, "reward": -2.2726588547229767, "reward_std": 3.8129764549434184, "rewards/reward_func": -2.2726588547229767, "step": 50, "toxic_reward": 3.694073647260666 }, { "clip_ratio": 0.0, "completion_length": 51.475, "epoch": 0.014177693761814745, "format_reward": -3.75, "grad_norm": 0.46164554357528687, "image_reward": 0.24024454802274703, "kl": 0.000736865375074558, "learning_rate": 5e-06, "loss": 0.1005, "reward": -3.6363322257995607, "reward_std": 5.34180793762207, "rewards/reward_func": -3.6363322257995607, "step": 60, "toxic_reward": 4.516654038429261 }, { "clip_ratio": 0.0, "completion_length": 40.275, "epoch": 0.0165406427221172, "format_reward": -1.75, "grad_norm": 0.34076398611068726, "image_reward": 0.2581237778067589, "kl": 0.0006409274850739166, "learning_rate": 5e-06, "loss": -0.048, "reward": -1.306644481420517, "reward_std": 3.5849914638325573, "rewards/reward_func": -1.306644481420517, "step": 70, "toxic_reward": 4.222395324707032 }, { "clip_ratio": 0.0, "completion_length": 41.05, "epoch": 0.01890359168241966, "format_reward": -1.75, "grad_norm": 0.2942235469818115, "image_reward": 0.2728251129388809, "kl": 0.00077872859837953, "learning_rate": 5e-06, "loss": -0.018, "reward": -1.0979918956756591, "reward_std": 3.421245375275612, "rewards/reward_func": -1.0979918956756591, "step": 80, "toxic_reward": 4.375229549407959 }, { "clip_ratio": 0.0, "completion_length": 46.375, "epoch": 0.021266540642722116, "format_reward": -1.75, "grad_norm": 0.4680553376674652, "image_reward": 0.27020376589563155, "kl": 0.0006814575113821775, "learning_rate": 5e-06, "loss": 0.1758, "reward": -1.8029783844947815, "reward_std": 2.909199387952685, "rewards/reward_func": -1.8029783844947815, "step": 90, "toxic_reward": 3.5180059373378754 }, { "clip_ratio": 0.0, "completion_length": 32.975, "epoch": 0.023629489603024575, "format_reward": -1.75, "grad_norm": 0.5208232998847961, "image_reward": 0.28130289614200593, "kl": 0.0006654941505985334, "learning_rate": 5e-06, "loss": 0.0732, "reward": -1.5811177730560302, "reward_std": 3.0347108453512193, "rewards/reward_func": -1.5811177730560302, "step": 100, "toxic_reward": 3.8031033158302305 }, { "clip_ratio": 0.0, "completion_length": 59.575, "epoch": 0.02599243856332703, "format_reward": -3.5, "grad_norm": 0.5875898003578186, "image_reward": 0.2767374664545059, "kl": 0.0009529282746370882, "learning_rate": 5e-06, "loss": 0.013, "reward": -3.43455148935318, "reward_std": 5.033185955882073, "rewards/reward_func": -3.43455148935318, "step": 110, "toxic_reward": 3.8197044640779496 }, { "clip_ratio": 0.0, "completion_length": 34.825, "epoch": 0.02835538752362949, "format_reward": -2.5, "grad_norm": 0.9147374629974365, "image_reward": 0.298614501953125, "kl": 0.0007633624511072413, "learning_rate": 5e-06, "loss": 0.1077, "reward": -2.407980114221573, "reward_std": 4.146487069129944, "rewards/reward_func": -2.407980114221573, "step": 120, "toxic_reward": 3.8069980409410267 }, { "clip_ratio": 0.0, "completion_length": 40.575, "epoch": 0.030718336483931945, "format_reward": -1.5, "grad_norm": 0.6123144626617432, "image_reward": 0.26710906128088635, "kl": 0.000945484999101609, "learning_rate": 5e-06, "loss": -0.033, "reward": -1.4210234582424164, "reward_std": 2.5487833991646767, "rewards/reward_func": -1.4210234582424164, "step": 130, "toxic_reward": 3.9784648021062217 }, { "clip_ratio": 0.0, "completion_length": 36.325, "epoch": 0.0330812854442344, "format_reward": -1.5, "grad_norm": 0.35265249013900757, "image_reward": 0.2955657958984375, "kl": 0.001620796724455431, "learning_rate": 5e-06, "loss": 0.0518, "reward": -1.1245046585798264, "reward_std": 3.641619694232941, "rewards/reward_func": -1.1245046585798264, "step": 140, "toxic_reward": 3.7418821096420287 }, { "clip_ratio": 0.0, "completion_length": 37.325, "epoch": 0.03544423440453686, "format_reward": -1.75, "grad_norm": 0.6911599040031433, "image_reward": 0.301416015625, "kl": 0.0009025269537232816, "learning_rate": 5e-06, "loss": 0.3208, "reward": -1.707236361503601, "reward_std": 3.211209188401699, "rewards/reward_func": -1.707236361503601, "step": 150, "toxic_reward": 3.413761219382286 }, { "clip_ratio": 0.0, "completion_length": 37.6, "epoch": 0.03780718336483932, "format_reward": -2.5, "grad_norm": 0.6072728037834167, "image_reward": 0.28253965079784393, "kl": 0.0016979283303953708, "learning_rate": 5e-06, "loss": -0.0538, "reward": -1.9519330382347106, "reward_std": 3.5465006709098814, "rewards/reward_func": -1.9519330382347106, "step": 160, "toxic_reward": 4.008814732233684 }, { "clip_ratio": 0.0, "completion_length": 41.7, "epoch": 0.04017013232514178, "format_reward": -2.5, "grad_norm": 0.9174755811691284, "image_reward": 0.2571976251072354, "kl": 0.002261338901007548, "learning_rate": 5e-06, "loss": 0.1112, "reward": -2.123195892572403, "reward_std": 4.526358595490455, "rewards/reward_func": -2.123195892572403, "step": 170, "toxic_reward": 3.4624782469537525 }, { "clip_ratio": 0.0, "completion_length": 42.25, "epoch": 0.04253308128544423, "format_reward": -3.5, "grad_norm": 0.6067785024642944, "image_reward": 0.26939900666475297, "kl": 0.0012992891133762896, "learning_rate": 5e-06, "loss": 0.2051, "reward": -3.432029390335083, "reward_std": 5.464101791381836, "rewards/reward_func": -3.432029390335083, "step": 180, "toxic_reward": 3.7212570786476133 }, { "clip_ratio": 0.0, "completion_length": 47.175, "epoch": 0.04489603024574669, "format_reward": -1.5, "grad_norm": 0.32170766592025757, "image_reward": 0.2965630425347222, "kl": 0.00527564455405809, "learning_rate": 5e-06, "loss": -0.1016, "reward": -1.44393031001091, "reward_std": 3.0585690192878245, "rewards/reward_func": -1.44393031001091, "step": 190, "toxic_reward": 4.017925447887844 }, { "clip_ratio": 0.0, "completion_length": 38.775, "epoch": 0.04725897920604915, "format_reward": -2.0, "grad_norm": 0.5049771070480347, "image_reward": 0.28580220490694047, "kl": 0.003515976545168087, "learning_rate": 5e-06, "loss": 0.0836, "reward": -1.8655982911586761, "reward_std": 3.0476409645751117, "rewards/reward_func": -1.8655982911586761, "step": 200, "toxic_reward": 3.8450406193733215 }, { "clip_ratio": 0.0, "completion_length": 37.05, "epoch": 0.04962192816635161, "format_reward": -2.0, "grad_norm": 0.38584810495376587, "image_reward": 0.2693684895833333, "kl": 0.0034694685833528637, "learning_rate": 5e-06, "loss": 0.2267, "reward": -2.040862238407135, "reward_std": 3.0185029461979864, "rewards/reward_func": -2.040862238407135, "step": 210, "toxic_reward": 4.567277669906616 }, { "clip_ratio": 0.0, "completion_length": 43.325, "epoch": 0.05198487712665406, "format_reward": -2.25, "grad_norm": 0.7845410108566284, "image_reward": 0.27211100459098814, "kl": 0.0027843258751090614, "learning_rate": 5e-06, "loss": -0.0217, "reward": -1.9688808619976044, "reward_std": 4.326950389891863, "rewards/reward_func": -1.9688808619976044, "step": 220, "toxic_reward": 3.998746132850647 }, { "clip_ratio": 0.0, "completion_length": 38.475, "epoch": 0.05434782608695652, "format_reward": -1.5, "grad_norm": 0.28465747833251953, "image_reward": 0.28180135041475296, "kl": 0.0029356992337852715, "learning_rate": 5e-06, "loss": 0.1949, "reward": -1.8664621770381928, "reward_std": 3.3784094207920132, "rewards/reward_func": -1.8664621770381928, "step": 230, "toxic_reward": 3.4729531943798064 }, { "clip_ratio": 0.0, "completion_length": 61.9, "epoch": 0.05671077504725898, "format_reward": -3.25, "grad_norm": 0.42949026823043823, "image_reward": 0.27780679166316985, "kl": 0.00391890910686925, "learning_rate": 5e-06, "loss": 0.0808, "reward": -2.961669445037842, "reward_std": 4.58679872751236, "rewards/reward_func": -2.961669445037842, "step": 240, "toxic_reward": 3.08537415266037 }, { "clip_ratio": 0.0, "completion_length": 43.8, "epoch": 0.05907372400756144, "format_reward": -2.25, "grad_norm": 0.9381951093673706, "image_reward": 0.276055908203125, "kl": 0.017148628836730496, "learning_rate": 5e-06, "loss": -0.2303, "reward": -2.1199170768260958, "reward_std": 3.3072034239768984, "rewards/reward_func": -2.1199170768260958, "step": 250, "toxic_reward": 3.4872416734695433 }, { "clip_ratio": 0.0, "completion_length": 45.7, "epoch": 0.06143667296786389, "format_reward": -0.75, "grad_norm": 1.0572214126586914, "image_reward": 0.26918131560087205, "kl": 0.0040537358960136775, "learning_rate": 5e-06, "loss": -0.1228, "reward": -0.41667274236679075, "reward_std": 1.9968392252922058, "rewards/reward_func": -0.41667274236679075, "step": 260, "toxic_reward": 4.103001546859741 }, { "clip_ratio": 0.0, "completion_length": 31.325, "epoch": 0.06379962192816635, "format_reward": -1.75, "grad_norm": 0.33211401104927063, "image_reward": 0.26322936862707136, "kl": 0.010905979719245807, "learning_rate": 5e-06, "loss": 0.1079, "reward": -1.2869422495365144, "reward_std": 3.4389497309923174, "rewards/reward_func": -1.2869422495365144, "step": 270, "toxic_reward": 4.132273650169372 }, { "clip_ratio": 0.0, "completion_length": 56.725, "epoch": 0.0661625708884688, "format_reward": -1.0, "grad_norm": 1.1315058469772339, "image_reward": 0.28337690565321183, "kl": 0.003939477750100196, "learning_rate": 5e-06, "loss": 0.3379, "reward": -1.081434178352356, "reward_std": 1.8980566158890724, "rewards/reward_func": -1.081434178352356, "step": 280, "toxic_reward": 4.38276841905382 }, { "clip_ratio": 0.0, "completion_length": 31.3, "epoch": 0.06852551984877127, "format_reward": -1.75, "grad_norm": 0.35049131512641907, "image_reward": 0.27892303466796875, "kl": 0.04790264330804348, "learning_rate": 5e-06, "loss": -0.1138, "reward": -1.20261852145195, "reward_std": 3.7090243451297282, "rewards/reward_func": -1.20261852145195, "step": 290, "toxic_reward": 3.994084894657135 }, { "clip_ratio": 0.0, "completion_length": 51.925, "epoch": 0.07088846880907372, "format_reward": -2.0, "grad_norm": 0.5147161483764648, "image_reward": 0.290887451171875, "kl": 0.008263002592138946, "learning_rate": 5e-06, "loss": 0.0859, "reward": -1.9743857204914093, "reward_std": 3.558365413546562, "rewards/reward_func": -1.9743857204914093, "step": 300, "toxic_reward": 3.2862678617239 }, { "clip_ratio": 0.0, "completion_length": 35.4, "epoch": 0.07325141776937619, "format_reward": -0.25, "grad_norm": 0.4911198616027832, "image_reward": 0.27863413393497466, "kl": 0.0026858947356231512, "learning_rate": 5e-06, "loss": -0.1385, "reward": -0.005173623561859131, "reward_std": 1.2043775863945485, "rewards/reward_func": -0.005173623561859131, "step": 310, "toxic_reward": 4.392678713798523 }, { "clip_ratio": 0.0, "completion_length": 49.925, "epoch": 0.07561436672967864, "format_reward": -2.5, "grad_norm": 0.7166872024536133, "image_reward": 0.2722563561466005, "kl": 0.0117031121510081, "learning_rate": 5e-06, "loss": 0.1367, "reward": -2.320690667629242, "reward_std": 3.535179616510868, "rewards/reward_func": -2.320690667629242, "step": 320, "toxic_reward": 3.1776589486334057 }, { "clip_ratio": 0.0, "completion_length": 36.3, "epoch": 0.07797731568998109, "format_reward": -0.75, "grad_norm": 0.2700420618057251, "image_reward": 0.29552409052848816, "kl": 0.017672599526122212, "learning_rate": 5e-06, "loss": -0.1148, "reward": -0.291591414809227, "reward_std": 2.016152049601078, "rewards/reward_func": -0.291591414809227, "step": 330, "toxic_reward": 3.240800154209137 }, { "clip_ratio": 0.0, "completion_length": 40.525, "epoch": 0.08034026465028356, "format_reward": -1.75, "grad_norm": 0.4268760085105896, "image_reward": 0.29228312373161314, "kl": 0.0068239012965932485, "learning_rate": 5e-06, "loss": 0.1467, "reward": -1.9235587894916535, "reward_std": 2.7463727177120747, "rewards/reward_func": -1.9235587894916535, "step": 340, "toxic_reward": 3.535431480407715 }, { "clip_ratio": 0.0, "completion_length": 64.05, "epoch": 0.08270321361058601, "format_reward": -1.5, "grad_norm": 0.6083372235298157, "image_reward": 0.2844095855951309, "kl": 0.014765451126731933, "learning_rate": 5e-06, "loss": -0.0278, "reward": -1.293799924850464, "reward_std": 3.273481422662735, "rewards/reward_func": -1.293799924850464, "step": 350, "toxic_reward": 3.966331052780151 }, { "clip_ratio": 0.0, "completion_length": 50.65, "epoch": 0.08506616257088846, "format_reward": -1.25, "grad_norm": 0.72890704870224, "image_reward": 0.2809214279055595, "kl": 0.013393631461076439, "learning_rate": 5e-06, "loss": 0.0463, "reward": -1.2338840126991273, "reward_std": 2.9229114189743997, "rewards/reward_func": -1.2338840126991273, "step": 360, "toxic_reward": 3.495128685235977 }, { "clip_ratio": 0.0, "completion_length": 40.5, "epoch": 0.08742911153119093, "format_reward": -1.0, "grad_norm": 0.19754794239997864, "image_reward": 0.26820373386144636, "kl": 0.009714199486188591, "learning_rate": 5e-06, "loss": -0.0682, "reward": -0.5900760173797608, "reward_std": 2.301849504513666, "rewards/reward_func": -0.5900760173797608, "step": 370, "toxic_reward": 4.155627632141114 }, { "clip_ratio": 0.0, "completion_length": 43.325, "epoch": 0.08979206049149338, "format_reward": -1.0, "grad_norm": 0.7548431158065796, "image_reward": 0.2744639068841934, "kl": 0.02220306231174618, "learning_rate": 5e-06, "loss": 0.2135, "reward": -1.0681762412190436, "reward_std": 2.1873259781859815, "rewards/reward_func": -1.0681762412190436, "step": 380, "toxic_reward": 3.818178777396679 }, { "clip_ratio": 0.0, "completion_length": 38.75, "epoch": 0.09215500945179585, "format_reward": -1.5, "grad_norm": 1.6385910511016846, "image_reward": 0.28105672299861906, "kl": 0.010323460912331939, "learning_rate": 5e-06, "loss": 0.1059, "reward": -1.3754307508468628, "reward_std": 3.023836246691644, "rewards/reward_func": -1.3754307508468628, "step": 390, "toxic_reward": 4.011180245876313 }, { "clip_ratio": 0.0, "completion_length": 45.55, "epoch": 0.0945179584120983, "format_reward": -2.0, "grad_norm": 0.8115288615226746, "image_reward": 0.28032633662223816, "kl": 0.05955924341687933, "learning_rate": 5e-06, "loss": 0.0819, "reward": -1.5853845477104187, "reward_std": 4.042920933663845, "rewards/reward_func": -1.5853845477104187, "step": 400, "toxic_reward": 3.5872471928596497 }, { "clip_ratio": 0.0, "completion_length": 36.675, "epoch": 0.09688090737240075, "format_reward": -0.75, "grad_norm": 0.5388673543930054, "image_reward": 0.2691446923547321, "kl": 0.007679732237011194, "learning_rate": 5e-06, "loss": -0.0322, "reward": -0.41340800523757937, "reward_std": 2.081881234049797, "rewards/reward_func": -0.41340800523757937, "step": 410, "toxic_reward": 3.857707765367296 }, { "clip_ratio": 0.0, "completion_length": 52.125, "epoch": 0.09924385633270322, "format_reward": -1.75, "grad_norm": 0.2760399281978607, "image_reward": 0.2856099456548691, "kl": 0.06441240075509995, "learning_rate": 5e-06, "loss": 0.1586, "reward": -1.576817613840103, "reward_std": 3.0979749940335752, "rewards/reward_func": -1.576817613840103, "step": 420, "toxic_reward": 3.6233551859855653 }, { "clip_ratio": 0.0, "completion_length": 42.9, "epoch": 0.10160680529300567, "format_reward": -2.25, "grad_norm": 0.8334791660308838, "image_reward": 0.314910888671875, "kl": 0.13873190036974847, "learning_rate": 5e-06, "loss": 0.0671, "reward": -1.9813659265637398, "reward_std": 3.193006566166878, "rewards/reward_func": -1.9813659265637398, "step": 430, "toxic_reward": 3.297185143828392 }, { "clip_ratio": 0.0, "completion_length": 41.6, "epoch": 0.10396975425330812, "format_reward": -0.25, "grad_norm": 0.8734163045883179, "image_reward": 0.2913574203848839, "kl": 0.01563742496073246, "learning_rate": 5e-06, "loss": -0.0094, "reward": 0.012267284095287323, "reward_std": 1.1554903835058212, "rewards/reward_func": 0.012267284095287323, "step": 440, "toxic_reward": 2.416472536325455 }, { "clip_ratio": 0.0, "completion_length": 36.825, "epoch": 0.10633270321361059, "format_reward": -1.5, "grad_norm": 1.1138451099395752, "image_reward": 0.28591206669807434, "kl": 0.022654308984056116, "learning_rate": 5e-06, "loss": 0.0957, "reward": -1.2695215404033662, "reward_std": 2.8351699322462083, "rewards/reward_func": -1.2695215404033662, "step": 450, "toxic_reward": 3.3260442495346068 }, { "clip_ratio": 0.0, "completion_length": 33.475, "epoch": 0.10869565217391304, "format_reward": -0.25, "grad_norm": 0.6227550506591797, "image_reward": 0.2807729095220566, "kl": 0.007865939987823367, "learning_rate": 5e-06, "loss": 0.0179, "reward": 1.1717996835708617, "reward_std": 1.5977750271558762, "rewards/reward_func": 1.1717996835708617, "step": 460, "toxic_reward": 3.309050977230072 }, { "clip_ratio": 0.0, "completion_length": 34.275, "epoch": 0.1110586011342155, "format_reward": -1.0, "grad_norm": 0.3605867624282837, "image_reward": 0.3162755310535431, "kl": 0.02495001317001879, "learning_rate": 5e-06, "loss": 0.0852, "reward": -1.0550554990768433, "reward_std": 2.7155043721199035, "rewards/reward_func": -1.0550554990768433, "step": 470, "toxic_reward": 3.4480915129184724 }, { "clip_ratio": 0.0, "completion_length": 43.0, "epoch": 0.11342155009451796, "format_reward": -0.5, "grad_norm": 0.3204725980758667, "image_reward": 0.32206115424633025, "kl": 0.011832635500468314, "learning_rate": 5e-06, "loss": 0.032, "reward": -0.29420808106660845, "reward_std": 1.479024769924581, "rewards/reward_func": -0.29420808106660845, "step": 480, "toxic_reward": 3.5640476822853087 }, { "clip_ratio": 0.0, "completion_length": 49.2, "epoch": 0.11578449905482041, "format_reward": -1.75, "grad_norm": 0.6204938888549805, "image_reward": 0.26227518618106843, "kl": 0.03589183106087148, "learning_rate": 5e-06, "loss": 0.1186, "reward": -1.7148385405540467, "reward_std": 3.0656426630914213, "rewards/reward_func": -1.7148385405540467, "step": 490, "toxic_reward": 4.371080112457276 }, { "clip_ratio": 0.0, "completion_length": 52.925, "epoch": 0.11814744801512288, "format_reward": -1.75, "grad_norm": 0.7287388443946838, "image_reward": 0.2700037628412247, "kl": 0.045285335322842, "learning_rate": 5e-06, "loss": 0.0265, "reward": -1.4807079195976258, "reward_std": 3.789501038193703, "rewards/reward_func": -1.4807079195976258, "step": 500, "toxic_reward": 3.9034363865852355 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.12051039697542533, "format_reward": -1.0, "grad_norm": 0.8750075697898865, "image_reward": 0.2932400173611111, "kl": 0.15442988513968886, "learning_rate": 5e-06, "loss": 0.0218, "reward": -0.7209997951984406, "reward_std": 1.843582271039486, "rewards/reward_func": -0.7209997951984406, "step": 510, "toxic_reward": 3.8813175095452204 }, { "clip_ratio": 0.0, "completion_length": 44.125, "epoch": 0.12287334593572778, "format_reward": -0.5, "grad_norm": 0.4498269259929657, "image_reward": 0.29094645082950593, "kl": 0.01979847764596343, "learning_rate": 5e-06, "loss": -0.0324, "reward": -0.40604341179132464, "reward_std": 1.4113173604011535, "rewards/reward_func": -0.40604341179132464, "step": 520, "toxic_reward": 4.314427596330643 }, { "clip_ratio": 0.0, "completion_length": 34.025, "epoch": 0.12523629489603025, "format_reward": -1.0, "grad_norm": 1.7480149269104004, "image_reward": 0.2472829192876816, "kl": 0.0556537595577538, "learning_rate": 5e-06, "loss": -0.1291, "reward": -1.0481307327747345, "reward_std": 2.317431343346834, "rewards/reward_func": -1.0481307327747345, "step": 530, "toxic_reward": 4.515345811843872 }, { "clip_ratio": 0.0, "completion_length": 36.2, "epoch": 0.1275992438563327, "format_reward": -1.25, "grad_norm": 0.39433184266090393, "image_reward": 0.2873850494623184, "kl": 0.039984302362427115, "learning_rate": 5e-06, "loss": 0.0301, "reward": -0.2359391689300537, "reward_std": 2.863342150300741, "rewards/reward_func": -0.2359391689300537, "step": 540, "toxic_reward": 3.940987694263458 }, { "clip_ratio": 0.0, "completion_length": 42.15, "epoch": 0.12996219281663515, "format_reward": -1.25, "grad_norm": 2.7985472679138184, "image_reward": 0.30071309208869934, "kl": 0.0283741801045835, "learning_rate": 5e-06, "loss": -0.0278, "reward": -0.4430400252342224, "reward_std": 2.761640505492687, "rewards/reward_func": -0.4430400252342224, "step": 550, "toxic_reward": 3.235564041137695 }, { "clip_ratio": 0.0, "completion_length": 39.525, "epoch": 0.1323251417769376, "format_reward": -1.0, "grad_norm": 1.208016037940979, "image_reward": 0.29123942106962203, "kl": 0.03811377864331007, "learning_rate": 5e-06, "loss": 0.2144, "reward": -1.2057244956493378, "reward_std": 2.0336616799235343, "rewards/reward_func": -1.2057244956493378, "step": 560, "toxic_reward": 3.977510142326355 }, { "clip_ratio": 0.0, "completion_length": 27.55, "epoch": 0.13468809073724008, "format_reward": -1.5, "grad_norm": 0.8842714428901672, "image_reward": 0.2724670395255089, "kl": 0.07012159014120697, "learning_rate": 5e-06, "loss": 0.1684, "reward": -1.302715817093849, "reward_std": 3.6691504657268523, "rewards/reward_func": -1.302715817093849, "step": 570, "toxic_reward": 3.246229815483093 }, { "clip_ratio": 0.0, "completion_length": 51.6, "epoch": 0.13705103969754254, "format_reward": -0.75, "grad_norm": 0.7157159447669983, "image_reward": 0.299871826171875, "kl": 0.028781934920698405, "learning_rate": 5e-06, "loss": -0.0308, "reward": 0.11744136810302734, "reward_std": 2.1306695722043516, "rewards/reward_func": 0.11744136810302734, "step": 580, "toxic_reward": 3.35184041261673 }, { "clip_ratio": 0.0, "completion_length": 41.475, "epoch": 0.139413988657845, "format_reward": -0.25, "grad_norm": 0.4593754708766937, "image_reward": 0.2574858499897851, "kl": 0.05173348039388657, "learning_rate": 5e-06, "loss": -0.1241, "reward": 0.452265202999115, "reward_std": 1.2885668274015187, "rewards/reward_func": 0.452265202999115, "step": 590, "toxic_reward": 3.4634872145122952 }, { "clip_ratio": 0.0, "completion_length": 35.325, "epoch": 0.14177693761814744, "format_reward": -1.0, "grad_norm": 0.5869470834732056, "image_reward": 0.26802266389131546, "kl": 0.2022853755392134, "learning_rate": 5e-06, "loss": -0.1205, "reward": -0.9757636785507202, "reward_std": 2.408064843714237, "rewards/reward_func": -0.9757636785507202, "step": 600, "toxic_reward": 4.45868456363678 }, { "clip_ratio": 0.0, "completion_length": 39.45, "epoch": 0.1441398865784499, "format_reward": -2.75, "grad_norm": 1.1131778955459595, "image_reward": 0.26167353987693787, "kl": 0.16366879558190703, "learning_rate": 5e-06, "loss": -0.0012, "reward": -2.7253461956977842, "reward_std": 4.713953969441354, "rewards/reward_func": -2.7253461956977842, "step": 610, "toxic_reward": 3.5585821866989136 }, { "clip_ratio": 0.0, "completion_length": 40.65, "epoch": 0.14650283553875237, "format_reward": -1.0, "grad_norm": 1.6662554740905762, "image_reward": 0.2821828216314316, "kl": 0.1423144882544875, "learning_rate": 5e-06, "loss": -0.0443, "reward": -0.9905034899711609, "reward_std": 2.6423311533406375, "rewards/reward_func": -0.9905034899711609, "step": 620, "toxic_reward": 4.095821046829224 }, { "clip_ratio": 0.0, "completion_length": 55.85, "epoch": 0.14886578449905483, "format_reward": -1.0, "grad_norm": 18.956981658935547, "image_reward": 0.28932088166475295, "kl": 0.41657317453064024, "learning_rate": 5e-06, "loss": 0.0324, "reward": -0.8243820607662201, "reward_std": 2.0909267283976076, "rewards/reward_func": -0.8243820607662201, "step": 630, "toxic_reward": 3.2601676136255264 }, { "clip_ratio": 0.0, "completion_length": 29.875, "epoch": 0.15122873345935728, "format_reward": -0.75, "grad_norm": 1.4686508178710938, "image_reward": 0.29945882111787797, "kl": 0.28281182143837214, "learning_rate": 5e-06, "loss": 0.0769, "reward": -0.4713120386004448, "reward_std": 1.791446179151535, "rewards/reward_func": -0.4713120386004448, "step": 640, "toxic_reward": 3.3351209998130797 }, { "clip_ratio": 0.0, "completion_length": 37.725, "epoch": 0.15359168241965973, "format_reward": -0.5, "grad_norm": 2.9935286045074463, "image_reward": 0.2910970068640179, "kl": 1.0141649260884151, "learning_rate": 5e-06, "loss": -0.2174, "reward": -0.5139556050300598, "reward_std": 1.0858815148472787, "rewards/reward_func": -0.5139556050300598, "step": 650, "toxic_reward": 4.207416137059529 }, { "clip_ratio": 0.0, "completion_length": 39.2, "epoch": 0.15595463137996218, "format_reward": -0.75, "grad_norm": 3.4974160194396973, "image_reward": 0.29859237670898436, "kl": 0.03742524515837431, "learning_rate": 5e-06, "loss": -0.0491, "reward": -0.8688022553920746, "reward_std": 1.9190378237515688, "rewards/reward_func": -0.8688022553920746, "step": 660, "toxic_reward": 3.639171451330185 }, { "clip_ratio": 0.0, "completion_length": 44.625, "epoch": 0.15831758034026466, "format_reward": -0.75, "grad_norm": 0.6731751561164856, "image_reward": 0.2705291733145714, "kl": 0.1289379763416946, "learning_rate": 5e-06, "loss": 0.0339, "reward": -0.5425865709781647, "reward_std": 2.217602302134037, "rewards/reward_func": -0.5425865709781647, "step": 670, "toxic_reward": 3.7739344239234924 }, { "clip_ratio": 0.0, "completion_length": 49.9, "epoch": 0.16068052930056712, "format_reward": -1.0, "grad_norm": 0.6705069541931152, "image_reward": 0.2828119918704033, "kl": 0.09238320724107325, "learning_rate": 5e-06, "loss": -0.0765, "reward": -0.3722410202026367, "reward_std": 1.9134121721610426, "rewards/reward_func": -0.3722410202026367, "step": 680, "toxic_reward": 4.390137553215027 }, { "clip_ratio": 0.0, "completion_length": 51.225, "epoch": 0.16304347826086957, "format_reward": -1.0, "grad_norm": 2.7068045139312744, "image_reward": 0.27732340693473817, "kl": 0.06089744158089161, "learning_rate": 5e-06, "loss": -0.0203, "reward": -0.6177265048027039, "reward_std": 2.210049830470234, "rewards/reward_func": -0.6177265048027039, "step": 690, "toxic_reward": 3.5699973523616793 }, { "clip_ratio": 0.0, "completion_length": 46.125, "epoch": 0.16540642722117202, "format_reward": -1.25, "grad_norm": 3.031416654586792, "image_reward": 0.2965891510248184, "kl": 0.8002684944309294, "learning_rate": 5e-06, "loss": 0.0719, "reward": -0.29744131565093995, "reward_std": 2.741807485371828, "rewards/reward_func": -0.29744131565093995, "step": 700, "toxic_reward": 3.4483383893966675 }, { "clip_ratio": 0.0, "completion_length": 35.975, "epoch": 0.16776937618147447, "format_reward": -0.25, "grad_norm": 3.4755773544311523, "image_reward": 0.2723083525896072, "kl": 0.24097473481670023, "learning_rate": 5e-06, "loss": 0.0494, "reward": -0.21520038843154907, "reward_std": 0.7798372395336628, "rewards/reward_func": -0.21520038843154907, "step": 710, "toxic_reward": 4.5303761720657345 }, { "clip_ratio": 0.0, "completion_length": 43.725, "epoch": 0.17013232514177692, "format_reward": -0.25, "grad_norm": 1.2503156661987305, "image_reward": 0.27466329038143156, "kl": 0.2257185777183622, "learning_rate": 5e-06, "loss": -0.0733, "reward": 0.11292819976806641, "reward_std": 1.212121632695198, "rewards/reward_func": 0.11292819976806641, "step": 720, "toxic_reward": 4.0655577898025514 }, { "clip_ratio": 0.0, "completion_length": 39.225, "epoch": 0.1724952741020794, "format_reward": -1.5, "grad_norm": 7.7392988204956055, "image_reward": 0.2492055267095566, "kl": 0.37416572365909817, "learning_rate": 5e-06, "loss": 0.0225, "reward": -1.0509216010570526, "reward_std": 3.409189415350556, "rewards/reward_func": -1.0509216010570526, "step": 730, "toxic_reward": 4.022808003425598 }, { "clip_ratio": 0.0, "completion_length": 28.525, "epoch": 0.17485822306238186, "format_reward": -0.5, "grad_norm": 4.889242172241211, "image_reward": 0.30042317807674407, "kl": 0.22789150793105364, "learning_rate": 5e-06, "loss": -0.0569, "reward": -0.2479497730731964, "reward_std": 1.3530383894219995, "rewards/reward_func": -0.2479497730731964, "step": 740, "toxic_reward": 3.774165117740631 }, { "clip_ratio": 0.0, "completion_length": 50.55, "epoch": 0.1772211720226843, "format_reward": -1.5, "grad_norm": 16.729528427124023, "image_reward": 0.273948161303997, "kl": 0.43975371681153774, "learning_rate": 5e-06, "loss": 0.1103, "reward": -1.793390053510666, "reward_std": 3.0602585028856994, "rewards/reward_func": -1.793390053510666, "step": 750, "toxic_reward": 3.111769822239876 }, { "clip_ratio": 0.0, "completion_length": 42.675, "epoch": 0.17958412098298676, "format_reward": -0.25, "grad_norm": 10.731781005859375, "image_reward": 0.26650288701057434, "kl": 0.6582286342978477, "learning_rate": 5e-06, "loss": 0.1081, "reward": 0.10775105953216553, "reward_std": 1.3219802690669895, "rewards/reward_func": 0.10775105953216553, "step": 760, "toxic_reward": 4.1322005033493046 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 0.1819470699432892, "format_reward": -0.75, "grad_norm": 4.2282633781433105, "image_reward": 0.28914388120174406, "kl": 0.7939867446199059, "learning_rate": 5e-06, "loss": 0.0704, "reward": -0.24524924755096436, "reward_std": 2.0771213214844466, "rewards/reward_func": -0.24524924755096436, "step": 770, "toxic_reward": 3.9121114134788515 }, { "clip_ratio": 0.0, "completion_length": 55.975, "epoch": 0.1843100189035917, "format_reward": -0.5, "grad_norm": 8.486693382263184, "image_reward": 0.246868896484375, "kl": 1.14481502994895, "learning_rate": 5e-06, "loss": -0.0032, "reward": 0.28170942068099974, "reward_std": 2.0574716079980133, "rewards/reward_func": 0.28170942068099974, "step": 780, "toxic_reward": 3.4702104151248934 }, { "clip_ratio": 0.0, "completion_length": 36.15, "epoch": 0.18667296786389415, "format_reward": -0.75, "grad_norm": 27.51862907409668, "image_reward": 0.26758320927619933, "kl": 1.0921552445739509, "learning_rate": 5e-06, "loss": -0.3259, "reward": -0.5566600695252418, "reward_std": 1.7622592605650425, "rewards/reward_func": -0.5566600695252418, "step": 790, "toxic_reward": 3.4233752876520156 }, { "clip_ratio": 0.0, "completion_length": 35.45, "epoch": 0.1890359168241966, "format_reward": -0.5, "grad_norm": 4.040957927703857, "image_reward": 0.3153462767601013, "kl": 1.9678303502500056, "learning_rate": 5e-06, "loss": -0.1665, "reward": -0.010482311248779297, "reward_std": 1.1518827967345715, "rewards/reward_func": -0.010482311248779297, "step": 800, "toxic_reward": 3.6056110084056856 }, { "clip_ratio": 0.0, "completion_length": 41.5, "epoch": 0.19139886578449905, "format_reward": -0.25, "grad_norm": 12.718086242675781, "image_reward": 0.27923176884651185, "kl": 0.9990547701716423, "learning_rate": 5e-06, "loss": -0.0447, "reward": 0.1995850086212158, "reward_std": 1.246943424642086, "rewards/reward_func": 0.1995850086212158, "step": 810, "toxic_reward": 3.635990482568741 }, { "clip_ratio": 0.0, "completion_length": 37.525, "epoch": 0.1937618147448015, "format_reward": -1.25, "grad_norm": 5.244020938873291, "image_reward": 0.27026468962430955, "kl": 2.5741087660193442, "learning_rate": 5e-06, "loss": 0.0569, "reward": -1.3374125480651855, "reward_std": 2.818611039035022, "rewards/reward_func": -1.3374125480651855, "step": 820, "toxic_reward": 4.255197846889496 }, { "clip_ratio": 0.0, "completion_length": 43.25, "epoch": 0.19612476370510398, "format_reward": -0.25, "grad_norm": 1.3633440732955933, "image_reward": 0.29616292417049406, "kl": 0.48451304286718366, "learning_rate": 5e-06, "loss": 0.1053, "reward": -0.34738388657569885, "reward_std": 0.9195286151021719, "rewards/reward_func": -0.34738388657569885, "step": 830, "toxic_reward": 4.384462606906891 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.19848771266540643, "format_reward": -0.75, "grad_norm": 6.93122673034668, "image_reward": 0.2948842361569405, "kl": 0.3984289012849331, "learning_rate": 5e-06, "loss": 0.007, "reward": -0.4061413824558258, "reward_std": 2.115474058687687, "rewards/reward_func": -0.4061413824558258, "step": 840, "toxic_reward": 2.784619116783142 }, { "clip_ratio": 0.0, "completion_length": 58.85, "epoch": 0.2008506616257089, "format_reward": -1.0, "grad_norm": 11.167367935180664, "image_reward": 0.2535125732421875, "kl": 0.7260896906256675, "learning_rate": 5e-06, "loss": -0.0252, "reward": -0.6900001287460327, "reward_std": 2.5411489391699433, "rewards/reward_func": -0.6900001287460327, "step": 850, "toxic_reward": 3.902221655845642 }, { "clip_ratio": 0.0, "completion_length": 37.35, "epoch": 0.20321361058601134, "format_reward": -0.25, "grad_norm": 12.129627227783203, "image_reward": 0.25641682744026184, "kl": 0.5523816287517548, "learning_rate": 5e-06, "loss": -0.0869, "reward": 0.027270352840423583, "reward_std": 1.1594479020684958, "rewards/reward_func": 0.027270352840423583, "step": 860, "toxic_reward": 4.19142780303955 }, { "clip_ratio": 0.0, "completion_length": 50.75, "epoch": 0.2055765595463138, "format_reward": -1.0, "grad_norm": 25.523523330688477, "image_reward": 0.28674203488561845, "kl": 1.1298049300909043, "learning_rate": 5e-06, "loss": 0.0639, "reward": -1.0763263344764709, "reward_std": 1.7480091962963342, "rewards/reward_func": -1.0763263344764709, "step": 870, "toxic_reward": 4.468152364095052 }, { "clip_ratio": 0.0, "completion_length": 35.575, "epoch": 0.20793950850661624, "format_reward": -1.0, "grad_norm": 3.8387675285339355, "image_reward": 0.26868184506893156, "kl": 0.9680751413106918, "learning_rate": 5e-06, "loss": -0.0833, "reward": -0.8666846975684166, "reward_std": 2.079224378615618, "rewards/reward_func": -0.8666846975684166, "step": 880, "toxic_reward": 3.481996048986912 }, { "clip_ratio": 0.0, "completion_length": 33.825, "epoch": 0.21030245746691872, "format_reward": -0.5, "grad_norm": 15.843626022338867, "image_reward": 0.2802464798092842, "kl": 0.49419727362692356, "learning_rate": 5e-06, "loss": 0.0241, "reward": 0.11158292293548584, "reward_std": 1.7106264479458333, "rewards/reward_func": 0.11158292293548584, "step": 890, "toxic_reward": 3.7324341177940368 }, { "clip_ratio": 0.0, "completion_length": 47.3, "epoch": 0.21266540642722118, "format_reward": -0.25, "grad_norm": 2.770407199859619, "image_reward": 0.27023824155330656, "kl": 0.2871086034923792, "learning_rate": 5e-06, "loss": 0.1861, "reward": -0.27072116136550906, "reward_std": 1.447587224841118, "rewards/reward_func": -0.27072116136550906, "step": 900, "toxic_reward": 3.426037532091141 }, { "clip_ratio": 0.0, "completion_length": 33.0, "epoch": 0.21502835538752363, "format_reward": -0.5, "grad_norm": 6.4211225509643555, "image_reward": 0.2804026290774345, "kl": 1.5080223519355058, "learning_rate": 5e-06, "loss": 0.0382, "reward": -0.10845602005720138, "reward_std": 1.7854840472340583, "rewards/reward_func": -0.10845602005720138, "step": 910, "toxic_reward": 3.3229601860046385 }, { "clip_ratio": 0.0, "completion_length": 41.4, "epoch": 0.21739130434782608, "format_reward": -1.0, "grad_norm": 1.846864938735962, "image_reward": 0.29064489238791996, "kl": 0.8340548906475306, "learning_rate": 5e-06, "loss": 0.0872, "reward": -1.217875736951828, "reward_std": 1.4547557694837452, "rewards/reward_func": -1.217875736951828, "step": 920, "toxic_reward": 4.098645766576131 }, { "clip_ratio": 0.0, "completion_length": 39.3, "epoch": 0.21975425330812853, "format_reward": -0.5, "grad_norm": 14.329817771911621, "image_reward": 0.28984171748161314, "kl": 0.3335365690290928, "learning_rate": 5e-06, "loss": 0.0341, "reward": -0.14692462086677552, "reward_std": 1.6654048651456832, "rewards/reward_func": -0.14692462086677552, "step": 930, "toxic_reward": 3.8828285098075868 }, { "clip_ratio": 0.0, "completion_length": 39.15, "epoch": 0.222117202268431, "format_reward": -1.0, "grad_norm": 13.11744499206543, "image_reward": 0.2768778458237648, "kl": 0.7420168094336986, "learning_rate": 5e-06, "loss": -0.1362, "reward": -0.5828769445419312, "reward_std": 2.509597599506378, "rewards/reward_func": -0.5828769445419312, "step": 940, "toxic_reward": 3.994591364264488 }, { "clip_ratio": 0.0, "completion_length": 35.9, "epoch": 0.22448015122873347, "format_reward": -1.25, "grad_norm": 1.4235849380493164, "image_reward": 0.2599512729793787, "kl": 0.23791442420333625, "learning_rate": 5e-06, "loss": 0.2081, "reward": -0.7265825271606445, "reward_std": 2.4457253187894823, "rewards/reward_func": -0.7265825271606445, "step": 950, "toxic_reward": 4.328470140695572 }, { "clip_ratio": 0.0, "completion_length": 40.05, "epoch": 0.22684310018903592, "format_reward": -1.0, "grad_norm": 10.51688003540039, "image_reward": 0.29052734225988386, "kl": 1.1104660354554654, "learning_rate": 5e-06, "loss": 0.1948, "reward": -0.3963636875152588, "reward_std": 2.6071507059037686, "rewards/reward_func": -0.3963636875152588, "step": 960, "toxic_reward": 3.5060137271881104 }, { "clip_ratio": 0.0, "completion_length": 42.075, "epoch": 0.22920604914933837, "format_reward": 0.0, "grad_norm": 0.5477933287620544, "image_reward": 0.2825276702642441, "kl": 0.24828157052397729, "learning_rate": 5e-06, "loss": 0.2364, "reward": -0.024850471317768096, "reward_std": 0.7480767840519548, "rewards/reward_func": -0.024850471317768096, "step": 970, "toxic_reward": 3.07701745480299 }, { "clip_ratio": 0.0, "completion_length": 39.925, "epoch": 0.23156899810964082, "format_reward": -1.25, "grad_norm": 6.296302318572998, "image_reward": 0.26927467518382603, "kl": 3.1552879590541125, "learning_rate": 5e-06, "loss": 0.0241, "reward": -0.648174649477005, "reward_std": 2.8984405621886253, "rewards/reward_func": -0.648174649477005, "step": 980, "toxic_reward": 3.881988432672289 }, { "clip_ratio": 0.0, "completion_length": 50.475, "epoch": 0.2339319470699433, "format_reward": -1.0, "grad_norm": 2.797386646270752, "image_reward": 0.2668904632329941, "kl": 1.7048991359770298, "learning_rate": 5e-06, "loss": -0.0828, "reward": -1.1502302587032318, "reward_std": 2.383236999064684, "rewards/reward_func": -1.1502302587032318, "step": 990, "toxic_reward": 4.231578087806701 }, { "clip_ratio": 0.0, "completion_length": 39.425, "epoch": 0.23629489603024575, "format_reward": -0.75, "grad_norm": 13.208063125610352, "image_reward": 0.2917307555675507, "kl": 0.7445122614502907, "learning_rate": 5e-06, "loss": -0.1073, "reward": -0.7605196535587311, "reward_std": 2.2064386613667013, "rewards/reward_func": -0.7605196535587311, "step": 1000, "toxic_reward": 3.5633171044290064 }, { "clip_ratio": 0.0, "completion_length": 55.875, "epoch": 0.2386578449905482, "format_reward": -1.0, "grad_norm": 10.358668327331543, "image_reward": 0.26257934868335725, "kl": 0.35015557184815405, "learning_rate": 5e-06, "loss": -0.0206, "reward": -0.38898804783821106, "reward_std": 2.7123206526041033, "rewards/reward_func": -0.38898804783821106, "step": 1010, "toxic_reward": 3.609158730506897 }, { "clip_ratio": 0.0, "completion_length": 52.95, "epoch": 0.24102079395085066, "format_reward": -1.0, "grad_norm": 9.602174758911133, "image_reward": 0.289794921875, "kl": 0.2867487147450447, "learning_rate": 5e-06, "loss": 0.0269, "reward": -0.4154239475727081, "reward_std": 2.4513496346771717, "rewards/reward_func": -0.4154239475727081, "step": 1020, "toxic_reward": 4.2405922412872314 }, { "clip_ratio": 0.0, "completion_length": 52.5, "epoch": 0.2433837429111531, "format_reward": -1.0, "grad_norm": 6.7750630378723145, "image_reward": 0.2876515701413155, "kl": 0.8189243379980325, "learning_rate": 5e-06, "loss": 0.0184, "reward": -0.9024024844169617, "reward_std": 2.123489296063781, "rewards/reward_func": -0.9024024844169617, "step": 1030, "toxic_reward": 3.870901381969452 }, { "clip_ratio": 0.0, "completion_length": 55.1, "epoch": 0.24574669187145556, "format_reward": -0.5, "grad_norm": 1.4051434993743896, "image_reward": 0.2766723616255654, "kl": 0.7713468134403229, "learning_rate": 5e-06, "loss": -0.1005, "reward": 0.42890325784683225, "reward_std": 1.7344073422253132, "rewards/reward_func": 0.42890325784683225, "step": 1040, "toxic_reward": 3.850937591658698 }, { "clip_ratio": 0.0, "completion_length": 43.6, "epoch": 0.24810964083175804, "format_reward": -0.5, "grad_norm": 10.04930591583252, "image_reward": 0.2845031708478928, "kl": 0.21945146545767785, "learning_rate": 5e-06, "loss": -0.0794, "reward": -0.29822829365730286, "reward_std": 2.0626097127795218, "rewards/reward_func": -0.29822829365730286, "step": 1050, "toxic_reward": 3.3056647762656213 }, { "clip_ratio": 0.0, "completion_length": 36.675, "epoch": 0.2504725897920605, "format_reward": -0.25, "grad_norm": 1.4483786821365356, "image_reward": 0.2949198380112648, "kl": 0.5147463826462626, "learning_rate": 5e-06, "loss": 0.0472, "reward": -0.4302744150161743, "reward_std": 0.9093868482857943, "rewards/reward_func": -0.4302744150161743, "step": 1060, "toxic_reward": 4.118269920349121 }, { "clip_ratio": 0.0, "completion_length": 45.125, "epoch": 0.252835538752363, "format_reward": -0.75, "grad_norm": 5.471806526184082, "image_reward": 0.3024444580078125, "kl": 0.924912228435278, "learning_rate": 5e-06, "loss": 0.0653, "reward": -0.9226927876472473, "reward_std": 1.8348794005811215, "rewards/reward_func": -0.9226927876472473, "step": 1070, "toxic_reward": 3.55495400428772 }, { "clip_ratio": 0.0, "completion_length": 49.225, "epoch": 0.2551984877126654, "format_reward": -0.5, "grad_norm": 6.291661739349365, "image_reward": 0.30248311161994934, "kl": 0.14056268222630025, "learning_rate": 5e-06, "loss": 0.1172, "reward": -0.07832016348838806, "reward_std": 1.6703550808131695, "rewards/reward_func": -0.07832016348838806, "step": 1080, "toxic_reward": 3.876679849624634 }, { "clip_ratio": 0.0, "completion_length": 40.575, "epoch": 0.2575614366729679, "format_reward": 0.0, "grad_norm": 5.747459411621094, "image_reward": 0.268257649242878, "kl": 0.20501487758010625, "learning_rate": 5e-06, "loss": 0.1961, "reward": 0.8156829088926315, "reward_std": 0.6415594108402729, "rewards/reward_func": 0.8156829088926315, "step": 1090, "toxic_reward": 4.041116189956665 }, { "clip_ratio": 0.0, "completion_length": 43.5, "epoch": 0.2599243856332703, "format_reward": -0.25, "grad_norm": 0.5391029715538025, "image_reward": 0.27643330842256547, "kl": 0.27743567544966935, "learning_rate": 5e-06, "loss": -0.0042, "reward": 0.06835275292396545, "reward_std": 1.1296793665736913, "rewards/reward_func": 0.06835275292396545, "step": 1100, "toxic_reward": 3.7508057713508607 }, { "clip_ratio": 0.0, "completion_length": 44.475, "epoch": 0.2622873345935728, "format_reward": -0.75, "grad_norm": 5.044631004333496, "image_reward": 0.2711191803216934, "kl": 0.08945430461317301, "learning_rate": 5e-06, "loss": 0.1402, "reward": -0.8795787930488587, "reward_std": 1.802781331539154, "rewards/reward_func": -0.8795787930488587, "step": 1110, "toxic_reward": 3.978659760951996 }, { "clip_ratio": 0.0, "completion_length": 50.0, "epoch": 0.2646502835538752, "format_reward": -1.25, "grad_norm": 10.223982810974121, "image_reward": 0.2896250396966934, "kl": 0.5244473532773555, "learning_rate": 5e-06, "loss": 0.1933, "reward": -0.48248053193092344, "reward_std": 2.971283960342407, "rewards/reward_func": -0.48248053193092344, "step": 1120, "toxic_reward": 3.2150497317314146 }, { "clip_ratio": 0.0, "completion_length": 37.8, "epoch": 0.2670132325141777, "format_reward": -0.5, "grad_norm": 3.6621553897857666, "image_reward": 0.2852656051516533, "kl": 0.5911644924432039, "learning_rate": 5e-06, "loss": 0.0576, "reward": -0.3013936847448349, "reward_std": 1.430125593394041, "rewards/reward_func": -0.3013936847448349, "step": 1130, "toxic_reward": 4.058745819330215 }, { "clip_ratio": 0.0, "completion_length": 32.95, "epoch": 0.26937618147448017, "format_reward": -0.5, "grad_norm": 24.121688842773438, "image_reward": 0.2795908600091934, "kl": 0.4301185546442866, "learning_rate": 5e-06, "loss": -0.0126, "reward": -0.10317457914352417, "reward_std": 1.667516409419477, "rewards/reward_func": -0.10317457914352417, "step": 1140, "toxic_reward": 4.072073769569397 }, { "clip_ratio": 0.0, "completion_length": 42.025, "epoch": 0.2717391304347826, "format_reward": 0.0, "grad_norm": 0.7166000604629517, "image_reward": 0.2804423004388809, "kl": 0.675014778599143, "learning_rate": 5e-06, "loss": -0.0049, "reward": 0.4254330635070801, "reward_std": 0.9621219031512738, "rewards/reward_func": 0.4254330635070801, "step": 1150, "toxic_reward": 3.471704053878784 }, { "clip_ratio": 0.0, "completion_length": 33.175, "epoch": 0.2741020793950851, "format_reward": -1.0, "grad_norm": 1.803680658340454, "image_reward": 0.31466064155101775, "kl": 0.344609697163105, "learning_rate": 5e-06, "loss": -0.1152, "reward": -0.5670508742332458, "reward_std": 2.301799529790878, "rewards/reward_func": -0.5670508742332458, "step": 1160, "toxic_reward": 3.554426383972168 }, { "clip_ratio": 0.0, "completion_length": 31.475, "epoch": 0.2764650283553875, "format_reward": 0.0, "grad_norm": 7.919179439544678, "image_reward": 0.26389770656824113, "kl": 0.8297407850623131, "learning_rate": 5e-06, "loss": -0.28, "reward": 0.23291709423065185, "reward_std": 0.47383863255381586, "rewards/reward_func": 0.23291709423065185, "step": 1170, "toxic_reward": 4.360145711898804 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.27882797731569, "format_reward": -0.5, "grad_norm": 294.2972106933594, "image_reward": 0.2640360534191132, "kl": 0.9242212943732738, "learning_rate": 5e-06, "loss": 0.017, "reward": -0.04461590349674225, "reward_std": 1.7138214907608926, "rewards/reward_func": -0.04461590349674225, "step": 1180, "toxic_reward": 3.5669440746307375 }, { "clip_ratio": 0.0, "completion_length": 60.525, "epoch": 0.28119092627599246, "format_reward": -0.25, "grad_norm": 0.6788994669914246, "image_reward": 0.2832122802734375, "kl": 6.060492021404206, "learning_rate": 5e-06, "loss": 0.1039, "reward": 0.30282129645347594, "reward_std": 1.3184241026639938, "rewards/reward_func": 0.30282129645347594, "step": 1190, "toxic_reward": 3.858977997303009 }, { "clip_ratio": 0.0, "completion_length": 53.5, "epoch": 0.2835538752362949, "format_reward": -1.0, "grad_norm": 2.821944236755371, "image_reward": 0.292755126953125, "kl": 0.2833241932094097, "learning_rate": 5e-06, "loss": -0.0765, "reward": -0.8336254239082337, "reward_std": 2.1170720741152764, "rewards/reward_func": -0.8336254239082337, "step": 1200, "toxic_reward": 4.131281018257141 }, { "clip_ratio": 0.0, "completion_length": 45.975, "epoch": 0.28591682419659736, "format_reward": -0.75, "grad_norm": 5.20048189163208, "image_reward": 0.3018681839108467, "kl": 0.26484427275136113, "learning_rate": 5e-06, "loss": 0.0037, "reward": -0.23466770052909852, "reward_std": 2.3572978913784026, "rewards/reward_func": -0.23466770052909852, "step": 1210, "toxic_reward": 3.701621878147125 }, { "clip_ratio": 0.0, "completion_length": 53.95, "epoch": 0.2882797731568998, "format_reward": -0.75, "grad_norm": 2.5671803951263428, "image_reward": 0.2591837555170059, "kl": 0.27887978348881004, "learning_rate": 5e-06, "loss": 0.1531, "reward": -0.5629445493221283, "reward_std": 2.2025086715817452, "rewards/reward_func": -0.5629445493221283, "step": 1220, "toxic_reward": 3.878066289424896 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 0.29064272211720227, "format_reward": -0.25, "grad_norm": 1.3592997789382935, "image_reward": 0.2804290771484375, "kl": 0.7250507925637066, "learning_rate": 5e-06, "loss": -0.1268, "reward": 0.029623252153396607, "reward_std": 1.3399539720267057, "rewards/reward_func": 0.029623252153396607, "step": 1230, "toxic_reward": 3.5630233764648436 }, { "clip_ratio": 0.0, "completion_length": 32.675, "epoch": 0.29300567107750475, "format_reward": -1.25, "grad_norm": 6.867509365081787, "image_reward": 0.2880493178963661, "kl": 0.46422886326909063, "learning_rate": 5e-06, "loss": 0.0122, "reward": -1.0097105741500854, "reward_std": 2.696252405457199, "rewards/reward_func": -1.0097105741500854, "step": 1240, "toxic_reward": 4.076703870296479 }, { "clip_ratio": 0.0, "completion_length": 47.4, "epoch": 0.2953686200378072, "format_reward": -0.25, "grad_norm": 4.707825183868408, "image_reward": 0.256890869140625, "kl": 0.1788209406659007, "learning_rate": 5e-06, "loss": -0.0197, "reward": 0.38095744252204894, "reward_std": 1.2988073959946633, "rewards/reward_func": 0.38095744252204894, "step": 1250, "toxic_reward": 3.8400187373161314 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.29773156899810965, "format_reward": -0.75, "grad_norm": 1.229298710823059, "image_reward": 0.313336181640625, "kl": 0.33243545759469273, "learning_rate": 5e-06, "loss": 0.1516, "reward": -0.5754710257053375, "reward_std": 1.8287720288150013, "rewards/reward_func": -0.5754710257053375, "step": 1260, "toxic_reward": 4.415339708328247 }, { "clip_ratio": 0.0, "completion_length": 60.25, "epoch": 0.3000945179584121, "format_reward": -0.5, "grad_norm": 0.5034794807434082, "image_reward": 0.27869771271944044, "kl": 0.38923515090718863, "learning_rate": 5e-06, "loss": -0.0327, "reward": -0.4456570327281952, "reward_std": 1.5328068390488625, "rewards/reward_func": -0.4456570327281952, "step": 1270, "toxic_reward": 3.8723622620105744 }, { "clip_ratio": 0.0, "completion_length": 55.425, "epoch": 0.30245746691871456, "format_reward": -1.0, "grad_norm": 1.2214823961257935, "image_reward": 0.2668467193841934, "kl": 1.2360946209169925, "learning_rate": 5e-06, "loss": 0.1227, "reward": -0.9184286594390869, "reward_std": 2.3616207716986537, "rewards/reward_func": -0.9184286594390869, "step": 1280, "toxic_reward": 4.0201707005500795 }, { "clip_ratio": 0.0, "completion_length": 42.525, "epoch": 0.30482041587901704, "format_reward": -0.5, "grad_norm": 0.6785597205162048, "image_reward": 0.27662353664636613, "kl": 0.6153190754354, "learning_rate": 5e-06, "loss": -0.0909, "reward": -0.025622844696044922, "reward_std": 1.7058033104985952, "rewards/reward_func": -0.025622844696044922, "step": 1290, "toxic_reward": 3.283605984598398 }, { "clip_ratio": 0.0, "completion_length": 48.475, "epoch": 0.30718336483931946, "format_reward": -0.5, "grad_norm": 1.5470991134643555, "image_reward": 0.28620096743106843, "kl": 1.3450787207111716, "learning_rate": 5e-06, "loss": -0.0773, "reward": 0.39500882625579836, "reward_std": 1.9240341871976852, "rewards/reward_func": 0.39500882625579836, "step": 1300, "toxic_reward": 3.8390918374061584 }, { "clip_ratio": 0.0, "completion_length": 74.6, "epoch": 0.30954631379962194, "format_reward": 0.0, "grad_norm": 4.827681541442871, "image_reward": 0.2871856689453125, "kl": 0.2589964304119349, "learning_rate": 5e-06, "loss": -0.0753, "reward": -0.08085522651672364, "reward_std": 0.7007970325648785, "rewards/reward_func": -0.08085522651672364, "step": 1310, "toxic_reward": 4.15708065032959 }, { "clip_ratio": 0.0, "completion_length": 35.175, "epoch": 0.31190926275992437, "format_reward": 0.0, "grad_norm": 2.559379816055298, "image_reward": 0.28839518427848815, "kl": 1.160063625872135, "learning_rate": 5e-06, "loss": -0.1169, "reward": 0.457793202996254, "reward_std": 0.8301180111244321, "rewards/reward_func": 0.457793202996254, "step": 1320, "toxic_reward": 3.847675251960754 }, { "clip_ratio": 0.0, "completion_length": 46.05, "epoch": 0.31427221172022685, "format_reward": -0.5, "grad_norm": 1.0227330923080444, "image_reward": 0.25479024201631545, "kl": 5.228898542746902, "learning_rate": 5e-06, "loss": 0.1453, "reward": -0.19808580130338668, "reward_std": 1.237728140875697, "rewards/reward_func": -0.19808580130338668, "step": 1330, "toxic_reward": 3.487361752986908 }, { "clip_ratio": 0.0, "completion_length": 41.375, "epoch": 0.3166351606805293, "format_reward": 0.0, "grad_norm": 2.158604383468628, "image_reward": 0.27274220883846284, "kl": 5.145803064666689, "learning_rate": 5e-06, "loss": 0.0016, "reward": 0.5905790150165557, "reward_std": 1.0763475911691784, "rewards/reward_func": 0.5905790150165557, "step": 1340, "toxic_reward": 3.561137008666992 }, { "clip_ratio": 0.0, "completion_length": 33.9, "epoch": 0.31899810964083175, "format_reward": -0.25, "grad_norm": 1.078782081604004, "image_reward": 0.27456156313419344, "kl": 4.645642199181021, "learning_rate": 5e-06, "loss": 0.0272, "reward": 0.0937275767326355, "reward_std": 1.5942428700625897, "rewards/reward_func": 0.0937275767326355, "step": 1350, "toxic_reward": 3.385586667060852 }, { "clip_ratio": 0.0, "completion_length": 39.05, "epoch": 0.32136105860113423, "format_reward": -0.25, "grad_norm": 2.4886958599090576, "image_reward": 0.27929331362247467, "kl": 0.6772738939616829, "learning_rate": 5e-06, "loss": -0.1689, "reward": 0.10146453976631165, "reward_std": 1.4149208962917328, "rewards/reward_func": 0.10146453976631165, "step": 1360, "toxic_reward": 4.062562417984009 }, { "clip_ratio": 0.0, "completion_length": 56.025, "epoch": 0.32372400756143666, "format_reward": 0.0, "grad_norm": 0.45091304183006287, "image_reward": 0.26109618991613387, "kl": 1.1132759511470796, "learning_rate": 5e-06, "loss": 0.0773, "reward": 0.4344749391078949, "reward_std": 0.6906750492751599, "rewards/reward_func": 0.4344749391078949, "step": 1370, "toxic_reward": 3.89659765958786 }, { "clip_ratio": 0.0, "completion_length": 54.1, "epoch": 0.32608695652173914, "format_reward": -1.0, "grad_norm": 2.2919623851776123, "image_reward": 0.2507191985845566, "kl": 2.863751105964184, "learning_rate": 5e-06, "loss": 0.0426, "reward": -0.3381307005882263, "reward_std": 1.9777413787320257, "rewards/reward_func": -0.3381307005882263, "step": 1380, "toxic_reward": 4.168315529823303 }, { "clip_ratio": 0.0, "completion_length": 59.7, "epoch": 0.3284499054820416, "format_reward": 0.0, "grad_norm": 17.546894073486328, "image_reward": 0.2879852294921875, "kl": 1.016882681287825, "learning_rate": 5e-06, "loss": 0.065, "reward": -0.029438415169715883, "reward_std": 0.3044209867715836, "rewards/reward_func": -0.029438415169715883, "step": 1390, "toxic_reward": 3.8181951224803923 }, { "clip_ratio": 0.0, "completion_length": 44.425, "epoch": 0.33081285444234404, "format_reward": -0.75, "grad_norm": 3.9508233070373535, "image_reward": 0.3041224151849747, "kl": 1.2148886673152446, "learning_rate": 5e-06, "loss": 0.0983, "reward": -0.08471554517745972, "reward_std": 2.0540446445345877, "rewards/reward_func": -0.08471554517745972, "step": 1400, "toxic_reward": 4.20858781337738 }, { "clip_ratio": 0.0, "completion_length": 43.5, "epoch": 0.3331758034026465, "format_reward": -0.75, "grad_norm": 23.3671817779541, "image_reward": 0.2869578033685684, "kl": 9.38541857972741, "learning_rate": 5e-06, "loss": -0.0633, "reward": -0.21220148205757142, "reward_std": 2.147160884644836, "rewards/reward_func": -0.21220148205757142, "step": 1410, "toxic_reward": 3.646671336889267 }, { "clip_ratio": 0.0, "completion_length": 43.5, "epoch": 0.33553875236294894, "format_reward": -0.5, "grad_norm": 5.768739223480225, "image_reward": 0.29927419126033783, "kl": 3.124450533092022, "learning_rate": 5e-06, "loss": 0.0205, "reward": -0.20792179703712463, "reward_std": 1.7920773405581714, "rewards/reward_func": -0.20792179703712463, "step": 1420, "toxic_reward": 3.938745903968811 }, { "clip_ratio": 0.0, "completion_length": 55.3, "epoch": 0.3379017013232514, "format_reward": -0.75, "grad_norm": 7.805192947387695, "image_reward": 0.2781646728515625, "kl": 9.086061615869403, "learning_rate": 5e-06, "loss": -0.1052, "reward": 0.26188963651657104, "reward_std": 1.916423682682216, "rewards/reward_func": 0.26188963651657104, "step": 1430, "toxic_reward": 3.8569429397583006 }, { "clip_ratio": 0.0, "completion_length": 48.075, "epoch": 0.34026465028355385, "format_reward": 0.0, "grad_norm": 6.398307800292969, "image_reward": 0.28298187255859375, "kl": 3.477000297047198, "learning_rate": 5e-06, "loss": -0.1652, "reward": 0.2245475471019745, "reward_std": 0.7394228018820286, "rewards/reward_func": 0.2245475471019745, "step": 1440, "toxic_reward": 3.977894365787506 }, { "clip_ratio": 0.0, "completion_length": 49.375, "epoch": 0.34262759924385633, "format_reward": -0.25, "grad_norm": 15.553762435913086, "image_reward": 0.26566060483455656, "kl": 17.512660111114382, "learning_rate": 5e-06, "loss": -0.0446, "reward": 0.14810482859611512, "reward_std": 1.504632395505905, "rewards/reward_func": 0.14810482859611512, "step": 1450, "toxic_reward": 3.5254761219024657 }, { "clip_ratio": 0.0, "completion_length": 49.475, "epoch": 0.3449905482041588, "format_reward": -1.0, "grad_norm": 2.524869918823242, "image_reward": 0.2846649169921875, "kl": 1.9967870802618564, "learning_rate": 5e-06, "loss": -0.002, "reward": -1.1410660862922668, "reward_std": 2.0114028319716453, "rewards/reward_func": -1.1410660862922668, "step": 1460, "toxic_reward": 4.1038308382034305 }, { "clip_ratio": 0.0, "completion_length": 55.125, "epoch": 0.34735349716446123, "format_reward": -1.0, "grad_norm": 5.871716499328613, "image_reward": 0.2953603118658066, "kl": 1.2091532168909906, "learning_rate": 5e-06, "loss": 0.1278, "reward": -0.8882034704089165, "reward_std": 2.2325065452605486, "rewards/reward_func": -0.8882034704089165, "step": 1470, "toxic_reward": 2.88705118894577 }, { "clip_ratio": 0.0, "completion_length": 51.5, "epoch": 0.3497164461247637, "format_reward": 0.0, "grad_norm": 5.483914852142334, "image_reward": 0.2924133285880089, "kl": 20.523441922478376, "learning_rate": 5e-06, "loss": 0.0722, "reward": 0.036271828413009646, "reward_std": 1.0079955972731114, "rewards/reward_func": 0.036271828413009646, "step": 1480, "toxic_reward": 3.1371969431638718 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 0.35207939508506614, "format_reward": 0.0, "grad_norm": 1.4849286079406738, "image_reward": 0.2851186111569405, "kl": 2.15047435965389, "learning_rate": 5e-06, "loss": 0.0946, "reward": 0.441963791847229, "reward_std": 0.4248314931988716, "rewards/reward_func": 0.441963791847229, "step": 1490, "toxic_reward": 3.752102476358414 }, { "clip_ratio": 0.0, "completion_length": 40.0, "epoch": 0.3544423440453686, "format_reward": -0.25, "grad_norm": 2.0869834423065186, "image_reward": 0.2911224365234375, "kl": 2.3197390008717775, "learning_rate": 5e-06, "loss": -0.0123, "reward": 0.13550712168216705, "reward_std": 1.141077246889472, "rewards/reward_func": 0.13550712168216705, "step": 1500, "toxic_reward": 3.36595538854599 }, { "clip_ratio": 0.0, "completion_length": 40.6, "epoch": 0.3568052930056711, "format_reward": -0.25, "grad_norm": 2.7763924598693848, "image_reward": 0.28095703125, "kl": 0.8447903416119515, "learning_rate": 5e-06, "loss": 0.1001, "reward": 0.07771911025047303, "reward_std": 1.3111265070736409, "rewards/reward_func": 0.07771911025047303, "step": 1510, "toxic_reward": 4.060403060913086 }, { "clip_ratio": 0.0, "completion_length": 55.575, "epoch": 0.3591682419659735, "format_reward": -0.25, "grad_norm": 11.143818855285645, "image_reward": 0.2903269439935684, "kl": 1.106547536328435, "learning_rate": 5e-06, "loss": -0.0008, "reward": 0.7432255536317826, "reward_std": 1.0503722863271832, "rewards/reward_func": 0.7432255536317826, "step": 1520, "toxic_reward": 3.4027091443538664 }, { "clip_ratio": 0.0, "completion_length": 43.825, "epoch": 0.361531190926276, "format_reward": -0.25, "grad_norm": 6.157534599304199, "image_reward": 0.2823811858892441, "kl": 0.7211934769526124, "learning_rate": 5e-06, "loss": -0.0288, "reward": 0.11932253241539001, "reward_std": 1.307121137715876, "rewards/reward_func": 0.11932253241539001, "step": 1530, "toxic_reward": 3.8783162236213684 }, { "clip_ratio": 0.0, "completion_length": 37.075, "epoch": 0.3638941398865784, "format_reward": -1.0, "grad_norm": 2.383302688598633, "image_reward": 0.28258056491613387, "kl": 3.830422883108258, "learning_rate": 5e-06, "loss": -0.0149, "reward": -0.39380887150764465, "reward_std": 2.854560297727585, "rewards/reward_func": -0.39380887150764465, "step": 1540, "toxic_reward": 3.2431194216012953 }, { "clip_ratio": 0.0, "completion_length": 44.55, "epoch": 0.3662570888468809, "format_reward": -0.25, "grad_norm": 2.1643450260162354, "image_reward": 0.2872863754630089, "kl": 0.3903345447033644, "learning_rate": 5e-06, "loss": 0.2399, "reward": 0.23153584003448485, "reward_std": 1.3368525609374047, "rewards/reward_func": 0.23153584003448485, "step": 1550, "toxic_reward": 3.452616012096405 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 0.3686200378071834, "format_reward": -0.25, "grad_norm": 0.922444224357605, "image_reward": 0.29551798701286314, "kl": 0.9415501815266907, "learning_rate": 5e-06, "loss": 0.0285, "reward": 0.2152680218219757, "reward_std": 1.0939797786995769, "rewards/reward_func": 0.2152680218219757, "step": 1560, "toxic_reward": 4.278083491325378 }, { "clip_ratio": 0.0, "completion_length": 58.65, "epoch": 0.3709829867674858, "format_reward": -1.0, "grad_norm": 1.9485223293304443, "image_reward": 0.289756266772747, "kl": 0.52877401644364, "learning_rate": 5e-06, "loss": -0.0562, "reward": -0.7691292554140091, "reward_std": 2.194984516873956, "rewards/reward_func": -0.7691292554140091, "step": 1570, "toxic_reward": 3.7907654672861097 }, { "clip_ratio": 0.0, "completion_length": 34.35, "epoch": 0.3733459357277883, "format_reward": -0.75, "grad_norm": 4.795892238616943, "image_reward": 0.29136555939912795, "kl": 1.7273975620046258, "learning_rate": 5e-06, "loss": 0.0006, "reward": -0.2548545479774475, "reward_std": 2.3145264372229577, "rewards/reward_func": -0.2548545479774475, "step": 1580, "toxic_reward": 3.206251806020737 }, { "clip_ratio": 0.0, "completion_length": 51.15, "epoch": 0.3757088846880907, "format_reward": -0.25, "grad_norm": 1.6828984022140503, "image_reward": 0.29258016049861907, "kl": 0.27110366327688096, "learning_rate": 5e-06, "loss": 0.0295, "reward": 0.2889214813709259, "reward_std": 1.4156969770789147, "rewards/reward_func": 0.2889214813709259, "step": 1590, "toxic_reward": 3.8408302307128905 }, { "clip_ratio": 0.0, "completion_length": 48.075, "epoch": 0.3780718336483932, "format_reward": -0.5, "grad_norm": 2.8415489196777344, "image_reward": 0.2738067626953125, "kl": 1.5718746781349182, "learning_rate": 5e-06, "loss": 0.0332, "reward": -0.4795783460140228, "reward_std": 1.1532321106642485, "rewards/reward_func": -0.4795783460140228, "step": 1600, "toxic_reward": 3.8701359391212464 }, { "clip_ratio": 0.0, "completion_length": 55.3, "epoch": 0.3804347826086957, "format_reward": -0.5, "grad_norm": 0.4898248612880707, "image_reward": 0.28651835173368456, "kl": 0.5627498641610146, "learning_rate": 5e-06, "loss": 0.1267, "reward": -0.35464051365852356, "reward_std": 1.5732567172497511, "rewards/reward_func": -0.35464051365852356, "step": 1610, "toxic_reward": 4.116016793251037 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 0.3827977315689981, "format_reward": -0.5, "grad_norm": 1.5352033376693726, "image_reward": 0.29410196989774706, "kl": 1.2344657305628062, "learning_rate": 5e-06, "loss": 0.1575, "reward": -0.4094507694244385, "reward_std": 1.245941134635359, "rewards/reward_func": -0.4094507694244385, "step": 1620, "toxic_reward": 4.569849014282227 }, { "clip_ratio": 0.0, "completion_length": 75.625, "epoch": 0.3851606805293006, "format_reward": -1.0, "grad_norm": 0.5829593539237976, "image_reward": 0.280389404296875, "kl": 1.36093844124116, "learning_rate": 5e-06, "loss": 0.1245, "reward": -0.31835838556289675, "reward_std": 2.00613936111331, "rewards/reward_func": -0.31835838556289675, "step": 1630, "toxic_reward": 4.125273871421814 }, { "clip_ratio": 0.0, "completion_length": 50.425, "epoch": 0.387523629489603, "format_reward": -0.75, "grad_norm": 0.8723268508911133, "image_reward": 0.27287851870059965, "kl": 0.15645003337413071, "learning_rate": 5e-06, "loss": -0.039, "reward": -0.8851189732551574, "reward_std": 2.1296220384538174, "rewards/reward_func": -0.8851189732551574, "step": 1640, "toxic_reward": 3.323053848743439 }, { "clip_ratio": 0.0, "completion_length": 35.35, "epoch": 0.3898865784499055, "format_reward": -0.75, "grad_norm": 0.14725980162620544, "image_reward": 0.28720601350069047, "kl": 1.1328919077292086, "learning_rate": 5e-06, "loss": 0.0133, "reward": -0.12160237431526184, "reward_std": 1.725741315446794, "rewards/reward_func": -0.12160237431526184, "step": 1650, "toxic_reward": 3.924569344520569 }, { "clip_ratio": 0.0, "completion_length": 34.05, "epoch": 0.39224952741020797, "format_reward": -0.5, "grad_norm": 2.200639009475708, "image_reward": 0.2846842437982559, "kl": 0.11551734725944698, "learning_rate": 5e-06, "loss": -0.0781, "reward": 0.11074192523956299, "reward_std": 1.8953823536634444, "rewards/reward_func": 0.11074192523956299, "step": 1660, "toxic_reward": 3.5436886310577393 }, { "clip_ratio": 0.0, "completion_length": 30.925, "epoch": 0.3946124763705104, "format_reward": 0.0, "grad_norm": 3.0496935844421387, "image_reward": 0.2790842682123184, "kl": 2.538264278974384, "learning_rate": 5e-06, "loss": -0.1096, "reward": 0.14284086227416992, "reward_std": 0.8084073163568973, "rewards/reward_func": 0.14284086227416992, "step": 1670, "toxic_reward": 4.144779133796692 }, { "clip_ratio": 0.0, "completion_length": 42.675, "epoch": 0.39697542533081287, "format_reward": -0.5, "grad_norm": 0.9690385460853577, "image_reward": 0.2903676345944405, "kl": 3.7070351759903133, "learning_rate": 5e-06, "loss": 0.1427, "reward": 0.008394747972488403, "reward_std": 1.8407307181507349, "rewards/reward_func": 0.008394747972488403, "step": 1680, "toxic_reward": 3.498854029178619 }, { "clip_ratio": 0.0, "completion_length": 42.875, "epoch": 0.3993383742911153, "format_reward": -0.5, "grad_norm": 0.6957125067710876, "image_reward": 0.2657012939453125, "kl": 0.42172617875039575, "learning_rate": 5e-06, "loss": 0.1448, "reward": -0.40106786489486695, "reward_std": 1.718069277703762, "rewards/reward_func": -0.40106786489486695, "step": 1690, "toxic_reward": 3.609626793861389 }, { "clip_ratio": 0.0, "completion_length": 47.375, "epoch": 0.4017013232514178, "format_reward": -0.5, "grad_norm": 2.07503342628479, "image_reward": 0.2696156814694405, "kl": 1.291714602895081, "learning_rate": 5e-06, "loss": 0.0722, "reward": -0.014362984895706176, "reward_std": 1.5762588312849402, "rewards/reward_func": -0.014362984895706176, "step": 1700, "toxic_reward": 4.394974184036255 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 0.40406427221172025, "format_reward": 0.0, "grad_norm": 1.1231868267059326, "image_reward": 0.290789794921875, "kl": 0.21602323912084104, "learning_rate": 5e-06, "loss": -0.0932, "reward": 0.4133676677942276, "reward_std": 0.8327854365110398, "rewards/reward_func": 0.4133676677942276, "step": 1710, "toxic_reward": 3.955091452598572 }, { "clip_ratio": 0.0, "completion_length": 49.45, "epoch": 0.4064272211720227, "format_reward": -0.25, "grad_norm": 1.602283000946045, "image_reward": 0.2754241943359375, "kl": 2.6595573978964238, "learning_rate": 5e-06, "loss": -0.1005, "reward": 0.07846117615699769, "reward_std": 1.170348797738552, "rewards/reward_func": 0.07846117615699769, "step": 1720, "toxic_reward": 4.142733359336853 }, { "clip_ratio": 0.0, "completion_length": 65.9, "epoch": 0.40879017013232516, "format_reward": -1.0, "grad_norm": 0.5282357335090637, "image_reward": 0.26338195651769636, "kl": 0.2848859841004014, "learning_rate": 5e-06, "loss": -0.0035, "reward": -0.5072973608970642, "reward_std": 2.7491880640387536, "rewards/reward_func": -0.5072973608970642, "step": 1730, "toxic_reward": 4.047195649147033 }, { "clip_ratio": 0.0, "completion_length": 38.375, "epoch": 0.4111531190926276, "format_reward": -0.25, "grad_norm": 1.5527747869491577, "image_reward": 0.2691065490245819, "kl": 1.2007373101077974, "learning_rate": 5e-06, "loss": -0.0239, "reward": -0.045976501703262326, "reward_std": 0.8193172802217304, "rewards/reward_func": -0.045976501703262326, "step": 1740, "toxic_reward": 3.446149069070816 }, { "clip_ratio": 0.0, "completion_length": 46.725, "epoch": 0.41351606805293006, "format_reward": -0.25, "grad_norm": 0.5118568539619446, "image_reward": 0.27915140688419343, "kl": 0.9548864349722862, "learning_rate": 5e-06, "loss": 0.1013, "reward": -0.10445084571838378, "reward_std": 0.730734084546566, "rewards/reward_func": -0.10445084571838378, "step": 1750, "toxic_reward": 4.5370954990386965 }, { "clip_ratio": 0.0, "completion_length": 40.25, "epoch": 0.4158790170132325, "format_reward": -0.25, "grad_norm": 1.8082605600357056, "image_reward": 0.264396159350872, "kl": 1.575367003493011, "learning_rate": 5e-06, "loss": 0.0632, "reward": 0.14499086737632752, "reward_std": 0.663521677441895, "rewards/reward_func": 0.14499086737632752, "step": 1760, "toxic_reward": 4.827451419830322 }, { "clip_ratio": 0.0, "completion_length": 45.325, "epoch": 0.41824196597353497, "format_reward": -0.25, "grad_norm": 0.833739697933197, "image_reward": 0.28918762058019637, "kl": 0.6164161543361842, "learning_rate": 5e-06, "loss": -0.0846, "reward": -0.22242847234010696, "reward_std": 1.0645570412278176, "rewards/reward_func": -0.22242847234010696, "step": 1770, "toxic_reward": 3.958344542980194 }, { "clip_ratio": 0.0, "completion_length": 50.925, "epoch": 0.42060491493383745, "format_reward": -0.5, "grad_norm": 0.929023027420044, "image_reward": 0.2808074980974197, "kl": 0.8390735885128379, "learning_rate": 5e-06, "loss": -0.0834, "reward": -0.738262277841568, "reward_std": 1.677246123738587, "rewards/reward_func": -0.738262277841568, "step": 1780, "toxic_reward": 3.8094155311584474 }, { "clip_ratio": 0.0, "completion_length": 41.4, "epoch": 0.4229678638941399, "format_reward": -0.5, "grad_norm": 1.0305073261260986, "image_reward": 0.286659748852253, "kl": 0.6373991215135902, "learning_rate": 5e-06, "loss": -0.0697, "reward": -0.2053418666124344, "reward_std": 1.680133179202676, "rewards/reward_func": -0.2053418666124344, "step": 1790, "toxic_reward": 3.8562827944755553 }, { "clip_ratio": 0.0, "completion_length": 39.8, "epoch": 0.42533081285444235, "format_reward": 0.0, "grad_norm": 0.9716371297836304, "image_reward": 0.292718505859375, "kl": 0.6843567499890924, "learning_rate": 5e-06, "loss": -0.0924, "reward": 0.7018224939703941, "reward_std": 0.8987518041394651, "rewards/reward_func": 0.7018224939703941, "step": 1800, "toxic_reward": 3.408372712135315 }, { "clip_ratio": 0.0, "completion_length": 51.25, "epoch": 0.4276937618147448, "format_reward": 0.0, "grad_norm": 1.081742286682129, "image_reward": 0.2768310546875, "kl": 0.7960635300725698, "learning_rate": 5e-06, "loss": 0.0443, "reward": -0.25897485911846163, "reward_std": 0.9034805342555046, "rewards/reward_func": -0.25897485911846163, "step": 1810, "toxic_reward": 3.7079725742340086 }, { "clip_ratio": 0.0, "completion_length": 54.3, "epoch": 0.43005671077504726, "format_reward": -0.25, "grad_norm": 0.6442953944206238, "image_reward": 0.27892710268497467, "kl": 0.7656038996763528, "learning_rate": 5e-06, "loss": -0.0099, "reward": -0.13414714336395264, "reward_std": 1.1088863730430603, "rewards/reward_func": -0.13414714336395264, "step": 1820, "toxic_reward": 3.735495138168335 }, { "clip_ratio": 0.0, "completion_length": 42.525, "epoch": 0.43241965973534974, "format_reward": -1.0, "grad_norm": 0.7406989336013794, "image_reward": 0.2804585784673691, "kl": 3.6395583665929734, "learning_rate": 5e-06, "loss": -0.1008, "reward": -0.8905552387237549, "reward_std": 2.38557695299387, "rewards/reward_func": -0.8905552387237549, "step": 1830, "toxic_reward": 3.60183764398098 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 0.43478260869565216, "format_reward": -0.25, "grad_norm": 1.5541785955429077, "image_reward": 0.30787862092256546, "kl": 1.104234455060214, "learning_rate": 5e-06, "loss": 0.0222, "reward": 0.09280971884727478, "reward_std": 1.7143970176577568, "rewards/reward_func": 0.09280971884727478, "step": 1840, "toxic_reward": 3.689550542831421 }, { "clip_ratio": 0.0, "completion_length": 55.0, "epoch": 0.43714555765595464, "format_reward": -0.25, "grad_norm": 0.8598329424858093, "image_reward": 0.2855051666498184, "kl": 0.16781285647302865, "learning_rate": 5e-06, "loss": -0.1435, "reward": 0.3788378477096558, "reward_std": 1.0338344363495708, "rewards/reward_func": 0.3788378477096558, "step": 1850, "toxic_reward": 4.1332162618637085 }, { "clip_ratio": 0.0, "completion_length": 33.9, "epoch": 0.43950850661625707, "format_reward": -0.5, "grad_norm": 1.6019521951675415, "image_reward": 0.27197469025850296, "kl": 7.518688270077109, "learning_rate": 5e-06, "loss": -0.0371, "reward": 0.130861234664917, "reward_std": 1.7171866662800312, "rewards/reward_func": 0.130861234664917, "step": 1860, "toxic_reward": 4.243645071983337 }, { "clip_ratio": 0.0, "completion_length": 41.7, "epoch": 0.44187145557655955, "format_reward": -0.5, "grad_norm": 0.5758384466171265, "image_reward": 0.28136799931526185, "kl": 2.1443952365778385, "learning_rate": 5e-06, "loss": -0.0189, "reward": -0.18380895256996155, "reward_std": 1.6837687961757184, "rewards/reward_func": -0.18380895256996155, "step": 1870, "toxic_reward": 3.4331242620944975 }, { "clip_ratio": 0.0, "completion_length": 38.3, "epoch": 0.444234404536862, "format_reward": -0.75, "grad_norm": 1.5153789520263672, "image_reward": 0.28166198879480364, "kl": 1.9300499164499343, "learning_rate": 5e-06, "loss": 0.0564, "reward": -0.7839775577187538, "reward_std": 2.034397203475237, "rewards/reward_func": -0.7839775577187538, "step": 1880, "toxic_reward": 3.5422126829624174 }, { "clip_ratio": 0.0, "completion_length": 40.05, "epoch": 0.44659735349716445, "format_reward": 0.0, "grad_norm": 1.02174973487854, "image_reward": 0.30441080778837204, "kl": 5.820364655274898, "learning_rate": 5e-06, "loss": -0.1999, "reward": 0.5548859179019928, "reward_std": 0.8466346619650722, "rewards/reward_func": 0.5548859179019928, "step": 1890, "toxic_reward": 3.5053808212280275 }, { "clip_ratio": 0.0, "completion_length": 41.0, "epoch": 0.44896030245746693, "format_reward": -0.75, "grad_norm": 1.8126834630966187, "image_reward": 0.25828145295381544, "kl": 1.9232184071093799, "learning_rate": 5e-06, "loss": 0.0966, "reward": -0.5137902736663819, "reward_std": 2.415500694513321, "rewards/reward_func": -0.5137902736663819, "step": 1900, "toxic_reward": 3.4278686165809633 }, { "clip_ratio": 0.0, "completion_length": 37.7, "epoch": 0.45132325141776936, "format_reward": -0.5, "grad_norm": 0.6371603608131409, "image_reward": 0.2626200348138809, "kl": 6.273042661882937, "learning_rate": 5e-06, "loss": 0.0209, "reward": -0.10160770416259765, "reward_std": 1.7223791293799877, "rewards/reward_func": -0.10160770416259765, "step": 1910, "toxic_reward": 3.4677812099456786 }, { "clip_ratio": 0.0, "completion_length": 39.9, "epoch": 0.45368620037807184, "format_reward": 0.0, "grad_norm": 1.025303840637207, "image_reward": 0.27600199580192564, "kl": 2.9244240637868644, "learning_rate": 5e-06, "loss": 0.0036, "reward": 0.2618570938706398, "reward_std": 0.7942308865487575, "rewards/reward_func": 0.2618570938706398, "step": 1920, "toxic_reward": 3.214989905059338 }, { "clip_ratio": 0.0, "completion_length": 42.5, "epoch": 0.4560491493383743, "format_reward": 0.0, "grad_norm": 3.0306193828582764, "image_reward": 0.27111816257238386, "kl": 7.301137297973037, "learning_rate": 5e-06, "loss": -0.3058, "reward": 0.7629794716835022, "reward_std": 1.207332517206669, "rewards/reward_func": 0.7629794716835022, "step": 1930, "toxic_reward": 3.8610877275466917 }, { "clip_ratio": 0.0, "completion_length": 49.325, "epoch": 0.45841209829867674, "format_reward": -0.5, "grad_norm": 0.4994942843914032, "image_reward": 0.2564666748046875, "kl": 1.9746190145611764, "learning_rate": 5e-06, "loss": -0.056, "reward": 0.17883441746234893, "reward_std": 1.9227621294558048, "rewards/reward_func": 0.17883441746234893, "step": 1940, "toxic_reward": 3.5681721329689027 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.4607750472589792, "format_reward": -0.5, "grad_norm": 1.0730820894241333, "image_reward": 0.2937784805893898, "kl": 2.8218962060287596, "learning_rate": 5e-06, "loss": 0.0566, "reward": -0.1567411482334137, "reward_std": 1.654453044757247, "rewards/reward_func": -0.1567411482334137, "step": 1950, "toxic_reward": 3.6663838982582093 }, { "clip_ratio": 0.0, "completion_length": 55.725, "epoch": 0.46313799621928164, "format_reward": 0.0, "grad_norm": 2.0345563888549805, "image_reward": 0.2648590087890625, "kl": 0.5958237243816257, "learning_rate": 5e-06, "loss": 0.0654, "reward": 0.12212587893009186, "reward_std": 0.6707309451885521, "rewards/reward_func": 0.12212587893009186, "step": 1960, "toxic_reward": 3.1909562170505525 }, { "clip_ratio": 0.0, "completion_length": 45.4, "epoch": 0.4655009451795841, "format_reward": -0.5, "grad_norm": 5.125189781188965, "image_reward": 0.28848724216222765, "kl": 1.6634003438055516, "learning_rate": 5e-06, "loss": 0.0863, "reward": -0.1009038507938385, "reward_std": 1.4750457480549812, "rewards/reward_func": -0.1009038507938385, "step": 1970, "toxic_reward": 4.304786968231201 }, { "clip_ratio": 0.0, "completion_length": 44.075, "epoch": 0.4678638941398866, "format_reward": -0.25, "grad_norm": 1.4688388109207153, "image_reward": 0.27630208283662794, "kl": 0.420011714566499, "learning_rate": 5e-06, "loss": -0.0726, "reward": -0.325018173456192, "reward_std": 1.0332348687574266, "rewards/reward_func": -0.325018173456192, "step": 1980, "toxic_reward": 3.5992671266198157 }, { "clip_ratio": 0.0, "completion_length": 55.85, "epoch": 0.47022684310018903, "format_reward": -0.25, "grad_norm": 11.723315238952637, "image_reward": 0.26587321013212206, "kl": 0.32123089879751204, "learning_rate": 5e-06, "loss": 0.0977, "reward": -0.41115415692329405, "reward_std": 1.5678910434246063, "rewards/reward_func": -0.41115415692329405, "step": 1990, "toxic_reward": 3.7649365305900573 }, { "clip_ratio": 0.0, "completion_length": 46.35, "epoch": 0.4725897920604915, "format_reward": -0.25, "grad_norm": 2.3079888820648193, "image_reward": 0.27147267758846283, "kl": 0.2777526224032044, "learning_rate": 5e-06, "loss": -0.0282, "reward": -0.2599769473075867, "reward_std": 0.731538234371692, "rewards/reward_func": -0.2599769473075867, "step": 2000, "toxic_reward": 4.658599400520325 }, { "clip_ratio": 0.0, "completion_length": 36.85, "epoch": 0.47495274102079393, "format_reward": -0.5, "grad_norm": 14.372509956359863, "image_reward": 0.2984934478998184, "kl": 3.4746980018913747, "learning_rate": 5e-06, "loss": 0.0433, "reward": -0.3160775646567345, "reward_std": 0.8356795504689216, "rewards/reward_func": -0.3160775646567345, "step": 2010, "toxic_reward": 3.6712876573204993 }, { "clip_ratio": 0.0, "completion_length": 38.975, "epoch": 0.4773156899810964, "format_reward": 0.0, "grad_norm": 9.949368476867676, "image_reward": 0.2758158355951309, "kl": 2.603505723550916, "learning_rate": 5e-06, "loss": -0.1898, "reward": 0.5061412572860717, "reward_std": 0.6404913809150458, "rewards/reward_func": 0.5061412572860717, "step": 2020, "toxic_reward": 4.01279228925705 }, { "clip_ratio": 0.0, "completion_length": 56.05, "epoch": 0.47967863894139884, "format_reward": -0.5, "grad_norm": 11.427620887756348, "image_reward": 0.2567454010248184, "kl": 0.622926688939333, "learning_rate": 5e-06, "loss": 0.0783, "reward": 0.21228746175765992, "reward_std": 1.9739407232031225, "rewards/reward_func": 0.21228746175765992, "step": 2030, "toxic_reward": 3.7354461193084716 }, { "clip_ratio": 0.0, "completion_length": 44.45, "epoch": 0.4820415879017013, "format_reward": 0.0, "grad_norm": 4.316232204437256, "image_reward": 0.2718638092279434, "kl": 2.3269161872565745, "learning_rate": 5e-06, "loss": -0.1163, "reward": 0.737056265771389, "reward_std": 0.9669643521308899, "rewards/reward_func": 0.737056265771389, "step": 2040, "toxic_reward": 3.0878625586628914 }, { "clip_ratio": 0.0, "completion_length": 46.475, "epoch": 0.4844045368620038, "format_reward": -1.0, "grad_norm": 41.36595153808594, "image_reward": 0.26953938901424407, "kl": 0.7504621215164662, "learning_rate": 5e-06, "loss": -0.1493, "reward": -1.3220559000968932, "reward_std": 1.9624842151999473, "rewards/reward_func": -1.3220559000968932, "step": 2050, "toxic_reward": 3.74695360660553 }, { "clip_ratio": 0.0, "completion_length": 45.825, "epoch": 0.4867674858223062, "format_reward": 0.0, "grad_norm": 6.471742153167725, "image_reward": 0.2753570556640625, "kl": 0.07729073958471418, "learning_rate": 5e-06, "loss": -0.03, "reward": 1.3116377294063568, "reward_std": 1.4300442904233932, "rewards/reward_func": 1.3116377294063568, "step": 2060, "toxic_reward": 3.5985005378723143 }, { "clip_ratio": 0.0, "completion_length": 44.825, "epoch": 0.4891304347826087, "format_reward": 0.0, "grad_norm": 1.805216670036316, "image_reward": 0.306744384765625, "kl": 6.001958086341619, "learning_rate": 5e-06, "loss": -0.1945, "reward": 0.36415485143661497, "reward_std": 0.6190065078437328, "rewards/reward_func": 0.36415485143661497, "step": 2070, "toxic_reward": 4.081458044052124 }, { "clip_ratio": 0.0, "completion_length": 48.275, "epoch": 0.4914933837429111, "format_reward": 0.0, "grad_norm": 18.216772079467773, "image_reward": 0.2797536224126816, "kl": 0.49935312662273645, "learning_rate": 5e-06, "loss": 0.0342, "reward": 0.23056302070617676, "reward_std": 0.4776972606778145, "rewards/reward_func": 0.23056302070617676, "step": 2080, "toxic_reward": 4.019720596075058 }, { "clip_ratio": 0.0, "completion_length": 35.075, "epoch": 0.4938563327032136, "format_reward": -0.75, "grad_norm": 13.060705184936523, "image_reward": 0.28729756474494933, "kl": 4.740964457206428, "learning_rate": 5e-06, "loss": 0.0645, "reward": -0.4479706704616547, "reward_std": 2.0641879491508006, "rewards/reward_func": -0.4479706704616547, "step": 2090, "toxic_reward": 2.7062815964221953 }, { "clip_ratio": 0.0, "completion_length": 32.575, "epoch": 0.4962192816635161, "format_reward": -0.25, "grad_norm": 14.017393112182617, "image_reward": 0.2847381591796875, "kl": 0.9378721818327904, "learning_rate": 5e-06, "loss": -0.0908, "reward": 0.4732812285423279, "reward_std": 1.2860259119421245, "rewards/reward_func": 0.4732812285423279, "step": 2100, "toxic_reward": 3.420735603570938 }, { "clip_ratio": 0.0, "completion_length": 55.075, "epoch": 0.4985822306238185, "format_reward": -0.75, "grad_norm": 6.193188667297363, "image_reward": 0.27182515412569047, "kl": 2.9611662749201058, "learning_rate": 5e-06, "loss": 0.0056, "reward": -0.19096837639808656, "reward_std": 1.8480727752670645, "rewards/reward_func": -0.19096837639808656, "step": 2110, "toxic_reward": 4.268127584457398 }, { "clip_ratio": 0.0, "completion_length": 42.8, "epoch": 0.500945179584121, "format_reward": -0.75, "grad_norm": 11.63723087310791, "image_reward": 0.2698944091796875, "kl": 1.1968733308836819, "learning_rate": 5e-06, "loss": 0.0042, "reward": -0.5995136559009552, "reward_std": 2.1293695636093615, "rewards/reward_func": -0.5995136559009552, "step": 2120, "toxic_reward": 3.746561822295189 }, { "clip_ratio": 0.0, "completion_length": 40.8, "epoch": 0.5033081285444234, "format_reward": -0.75, "grad_norm": 2.3855180740356445, "image_reward": 0.26025390625, "kl": 1.5614483684301377, "learning_rate": 5e-06, "loss": 0.2496, "reward": -0.6204059720039368, "reward_std": 1.9704039812088012, "rewards/reward_func": -0.6204059720039368, "step": 2130, "toxic_reward": 3.747698575258255 }, { "clip_ratio": 0.0, "completion_length": 40.75, "epoch": 0.505671077504726, "format_reward": 0.0, "grad_norm": 7.681392669677734, "image_reward": 0.27169952541589737, "kl": 3.525779527798295, "learning_rate": 5e-06, "loss": -0.154, "reward": 0.7122885227203369, "reward_std": 1.038828771188855, "rewards/reward_func": 0.7122885227203369, "step": 2140, "toxic_reward": 3.8024647355079653 }, { "clip_ratio": 0.0, "completion_length": 49.55, "epoch": 0.5080340264650284, "format_reward": -0.25, "grad_norm": 7.522043228149414, "image_reward": 0.2867136627435684, "kl": 2.352656077966094, "learning_rate": 5e-06, "loss": -0.0567, "reward": 0.3375007212162018, "reward_std": 1.1598852841183542, "rewards/reward_func": 0.3375007212162018, "step": 2150, "toxic_reward": 3.6138802111148833 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 0.5103969754253308, "format_reward": 0.0, "grad_norm": 8.265325546264648, "image_reward": 0.2756062835454941, "kl": 6.923487820476294, "learning_rate": 5e-06, "loss": -0.1108, "reward": 0.7483027845621109, "reward_std": 0.5725362204015255, "rewards/reward_func": 0.7483027845621109, "step": 2160, "toxic_reward": 3.906574785709381 }, { "clip_ratio": 0.0, "completion_length": 33.525, "epoch": 0.5127599243856332, "format_reward": -0.75, "grad_norm": 21.7608642578125, "image_reward": 0.2696726471185684, "kl": 5.021715716272593, "learning_rate": 5e-06, "loss": -0.1133, "reward": -0.4512764573097229, "reward_std": 2.062841220572591, "rewards/reward_func": -0.4512764573097229, "step": 2170, "toxic_reward": 4.282562255859375 }, { "clip_ratio": 0.0, "completion_length": 48.425, "epoch": 0.5151228733459358, "format_reward": -0.25, "grad_norm": 2.369183301925659, "image_reward": 0.28711649775505066, "kl": 12.483240520581603, "learning_rate": 5e-06, "loss": 0.0658, "reward": -0.0087041437625885, "reward_std": 1.3220645122230053, "rewards/reward_func": -0.0087041437625885, "step": 2180, "toxic_reward": 3.781124639511108 }, { "clip_ratio": 0.0, "completion_length": 43.0, "epoch": 0.5174858223062382, "format_reward": -0.5, "grad_norm": 4.219491958618164, "image_reward": 0.27772623747587205, "kl": 2.453311304561794, "learning_rate": 5e-06, "loss": -0.0285, "reward": -0.30757330656051635, "reward_std": 1.7083245173096657, "rewards/reward_func": -0.30757330656051635, "step": 2190, "toxic_reward": 4.130738306045532 }, { "clip_ratio": 0.0, "completion_length": 50.85, "epoch": 0.5198487712665406, "format_reward": -0.5, "grad_norm": 6.190961837768555, "image_reward": 0.2818817153573036, "kl": 4.28942144587636, "learning_rate": 5e-06, "loss": -0.1115, "reward": 0.2441554695367813, "reward_std": 1.9595814019441604, "rewards/reward_func": 0.2441554695367813, "step": 2200, "toxic_reward": 3.141683894395828 }, { "clip_ratio": 0.0, "completion_length": 48.225, "epoch": 0.5222117202268431, "format_reward": -0.5, "grad_norm": 4.348143577575684, "image_reward": 0.29916890412569047, "kl": 0.34145298339426516, "learning_rate": 5e-06, "loss": 0.0071, "reward": -0.5653827100992203, "reward_std": 1.6975119888782502, "rewards/reward_func": -0.5653827100992203, "step": 2210, "toxic_reward": 3.599680471420288 }, { "clip_ratio": 0.0, "completion_length": 49.0, "epoch": 0.5245746691871456, "format_reward": -0.75, "grad_norm": 6.7439422607421875, "image_reward": 0.2785715714097023, "kl": 1.8124071411788463, "learning_rate": 5e-06, "loss": 0.0723, "reward": -0.6911701261997223, "reward_std": 1.9053923369385302, "rewards/reward_func": -0.6911701261997223, "step": 2220, "toxic_reward": 3.67071852684021 }, { "clip_ratio": 0.0, "completion_length": 38.9, "epoch": 0.526937618147448, "format_reward": -0.25, "grad_norm": 5.702417373657227, "image_reward": 0.2697733551263809, "kl": 3.5654136715456843, "learning_rate": 5e-06, "loss": -0.0592, "reward": 0.31644179224967955, "reward_std": 1.338551426678896, "rewards/reward_func": 0.31644179224967955, "step": 2230, "toxic_reward": 4.082410860061645 }, { "clip_ratio": 0.0, "completion_length": 46.9, "epoch": 0.5293005671077504, "format_reward": -0.75, "grad_norm": 3.3108696937561035, "image_reward": 0.2754450500011444, "kl": 1.1358238738030195, "learning_rate": 5e-06, "loss": 0.0103, "reward": -0.19608908146619797, "reward_std": 1.9574983415892349, "rewards/reward_func": -0.19608908146619797, "step": 2240, "toxic_reward": 3.8882675245404243 }, { "clip_ratio": 0.0, "completion_length": 41.775, "epoch": 0.531663516068053, "format_reward": 0.0, "grad_norm": 3.8872108459472656, "image_reward": 0.2711354583501816, "kl": 0.6185108724981546, "learning_rate": 5e-06, "loss": -0.0331, "reward": 0.43025930523872374, "reward_std": 0.6924620851874351, "rewards/reward_func": 0.43025930523872374, "step": 2250, "toxic_reward": 3.741843378543854 }, { "clip_ratio": 0.0, "completion_length": 46.05, "epoch": 0.5340264650283554, "format_reward": -0.5, "grad_norm": 2.605905055999756, "image_reward": 0.24824727326631546, "kl": 3.812788811326027, "learning_rate": 5e-06, "loss": -0.062, "reward": -0.0177284836769104, "reward_std": 1.7159371480345726, "rewards/reward_func": -0.0177284836769104, "step": 2260, "toxic_reward": 3.8558017730712892 }, { "clip_ratio": 0.0, "completion_length": 45.225, "epoch": 0.5363894139886578, "format_reward": 0.0, "grad_norm": 4.317953109741211, "image_reward": 0.29388427734375, "kl": 0.9772842615842819, "learning_rate": 5e-06, "loss": -0.005, "reward": 0.24463090300559998, "reward_std": 0.8211262285709381, "rewards/reward_func": 0.24463090300559998, "step": 2270, "toxic_reward": 3.4330978095531464 }, { "clip_ratio": 0.0, "completion_length": 45.35, "epoch": 0.5387523629489603, "format_reward": -0.25, "grad_norm": 2.7746388912200928, "image_reward": 0.28372802734375, "kl": 0.6956694826483727, "learning_rate": 5e-06, "loss": 0.0806, "reward": 0.9492665678262711, "reward_std": 1.2596320446580649, "rewards/reward_func": 0.9492665678262711, "step": 2280, "toxic_reward": 3.6599619805812837 }, { "clip_ratio": 0.0, "completion_length": 41.65, "epoch": 0.5411153119092628, "format_reward": -0.75, "grad_norm": 24.271883010864258, "image_reward": 0.25230407863855364, "kl": 2.0102761931717397, "learning_rate": 5e-06, "loss": 0.099, "reward": -0.5960418626666069, "reward_std": 1.6162065342068672, "rewards/reward_func": -0.5960418626666069, "step": 2290, "toxic_reward": 3.32955624461174 }, { "clip_ratio": 0.0, "completion_length": 40.025, "epoch": 0.5434782608695652, "format_reward": -0.75, "grad_norm": 12.164813995361328, "image_reward": 0.27450052797794344, "kl": 1.0361489206552505, "learning_rate": 5e-06, "loss": 0.0215, "reward": -0.12894563674926757, "reward_std": 2.2585421696305277, "rewards/reward_func": -0.12894563674926757, "step": 2300, "toxic_reward": 3.8079848527908324 }, { "clip_ratio": 0.0, "completion_length": 39.925, "epoch": 0.5458412098298677, "format_reward": 0.0, "grad_norm": 4.370122909545898, "image_reward": 0.28968607634305954, "kl": 2.262423123046756, "learning_rate": 5e-06, "loss": -0.0122, "reward": 0.4122478127479553, "reward_std": 0.8819206684827805, "rewards/reward_func": 0.4122478127479553, "step": 2310, "toxic_reward": 3.7774435758590696 }, { "clip_ratio": 0.0, "completion_length": 47.625, "epoch": 0.5482041587901701, "format_reward": -0.25, "grad_norm": 4.913710594177246, "image_reward": 0.2981597900390625, "kl": 1.1325825482606888, "learning_rate": 5e-06, "loss": 0.0383, "reward": -0.302042031288147, "reward_std": 1.1343338422477245, "rewards/reward_func": -0.302042031288147, "step": 2320, "toxic_reward": 3.4699944481253624 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.5505671077504726, "format_reward": -0.5, "grad_norm": 10.183396339416504, "image_reward": 0.2794362396001816, "kl": 2.359659927338362, "learning_rate": 5e-06, "loss": 0.127, "reward": -0.5543205380439759, "reward_std": 1.5390649776905776, "rewards/reward_func": -0.5543205380439759, "step": 2330, "toxic_reward": 4.130715823173523 }, { "clip_ratio": 0.0, "completion_length": 38.275, "epoch": 0.552930056710775, "format_reward": -0.25, "grad_norm": 29.773969650268555, "image_reward": 0.3009490996599197, "kl": 1.2122079662978649, "learning_rate": 5e-06, "loss": -0.0302, "reward": 0.49274033308029175, "reward_std": 1.2792111776769162, "rewards/reward_func": 0.49274033308029175, "step": 2340, "toxic_reward": 4.144988393783569 }, { "clip_ratio": 0.0, "completion_length": 48.925, "epoch": 0.5552930056710775, "format_reward": -0.25, "grad_norm": 1.4507733583450317, "image_reward": 0.27436625212430954, "kl": 10.124456256255508, "learning_rate": 5e-06, "loss": -0.0586, "reward": 0.16714471578598022, "reward_std": 1.1183603500947357, "rewards/reward_func": 0.16714471578598022, "step": 2350, "toxic_reward": 3.7719646602869035 }, { "clip_ratio": 0.0, "completion_length": 50.45, "epoch": 0.55765595463138, "format_reward": -0.25, "grad_norm": 3.8344922065734863, "image_reward": 0.27209879606962206, "kl": 0.4884789928793907, "learning_rate": 5e-06, "loss": 0.0246, "reward": 0.7492954432964325, "reward_std": 1.5298523031175137, "rewards/reward_func": 0.7492954432964325, "step": 2360, "toxic_reward": 3.582643675804138 }, { "clip_ratio": 0.0, "completion_length": 33.925, "epoch": 0.5600189035916824, "format_reward": -0.25, "grad_norm": 28.500579833984375, "image_reward": 0.256439208984375, "kl": 7.240471968054772, "learning_rate": 5e-06, "loss": 0.0033, "reward": 0.39032529294490814, "reward_std": 1.3387351400218903, "rewards/reward_func": 0.39032529294490814, "step": 2370, "toxic_reward": 3.7680604696273803 }, { "clip_ratio": 0.0, "completion_length": 43.35, "epoch": 0.5623818525519849, "format_reward": -0.25, "grad_norm": 18.509540557861328, "image_reward": 0.25091654509305955, "kl": 2.3443214535713195, "learning_rate": 5e-06, "loss": -0.0852, "reward": -0.013416659832000733, "reward_std": 1.2783805396407844, "rewards/reward_func": -0.013416659832000733, "step": 2380, "toxic_reward": 3.937808632850647 }, { "clip_ratio": 0.0, "completion_length": 48.125, "epoch": 0.5647448015122873, "format_reward": -0.5, "grad_norm": 11.650871276855469, "image_reward": 0.29215189516544343, "kl": 0.3515282288193703, "learning_rate": 5e-06, "loss": -0.0259, "reward": 0.1742587387561798, "reward_std": 1.8562648460268973, "rewards/reward_func": 0.1742587387561798, "step": 2390, "toxic_reward": 3.7724621415138246 }, { "clip_ratio": 0.0, "completion_length": 52.55, "epoch": 0.5671077504725898, "format_reward": -1.0, "grad_norm": 20.670705795288086, "image_reward": 0.26702982634305955, "kl": 2.7752922803163527, "learning_rate": 5e-06, "loss": 0.1898, "reward": -0.49167909026145934, "reward_std": 2.5721775129437447, "rewards/reward_func": -0.49167909026145934, "step": 2400, "toxic_reward": 3.612065541744232 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 0.5694706994328923, "format_reward": -0.25, "grad_norm": 5.918033599853516, "image_reward": 0.27968953400850294, "kl": 1.1868829876184464, "learning_rate": 5e-06, "loss": 0.0339, "reward": -0.041136431694030764, "reward_std": 1.1883981741964817, "rewards/reward_func": -0.041136431694030764, "step": 2410, "toxic_reward": 4.002831280231476 }, { "clip_ratio": 0.0, "completion_length": 42.4, "epoch": 0.5718336483931947, "format_reward": -0.25, "grad_norm": 5.842867851257324, "image_reward": 0.27998046875, "kl": 0.9403334192931652, "learning_rate": 5e-06, "loss": 0.0547, "reward": 0.23068565130233765, "reward_std": 1.2439154148101808, "rewards/reward_func": 0.23068565130233765, "step": 2420, "toxic_reward": 3.8584881067276 }, { "clip_ratio": 0.0, "completion_length": 37.2, "epoch": 0.5741965973534972, "format_reward": -0.25, "grad_norm": 13.205660820007324, "image_reward": 0.2850880965590477, "kl": 1.6154363751411438, "learning_rate": 5e-06, "loss": -0.0775, "reward": 0.4115023612976074, "reward_std": 1.0730943327769638, "rewards/reward_func": 0.4115023612976074, "step": 2430, "toxic_reward": 4.400762820243836 }, { "clip_ratio": 0.0, "completion_length": 42.925, "epoch": 0.5765595463137996, "format_reward": -0.25, "grad_norm": 3.637028455734253, "image_reward": 0.26606852263212205, "kl": 1.6208242058753968, "learning_rate": 5e-06, "loss": 0.0364, "reward": -0.5815495431423188, "reward_std": 1.270220142416656, "rewards/reward_func": -0.5815495431423188, "step": 2440, "toxic_reward": 3.934324860572815 }, { "clip_ratio": 0.0, "completion_length": 66.35, "epoch": 0.5789224952741021, "format_reward": -0.75, "grad_norm": 11.621758460998535, "image_reward": 0.28047332763671873, "kl": 0.7798056200146675, "learning_rate": 5e-06, "loss": 0.0519, "reward": 0.1087100327014923, "reward_std": 2.0828719630837442, "rewards/reward_func": 0.1087100327014923, "step": 2450, "toxic_reward": 2.8291834026575087 }, { "clip_ratio": 0.0, "completion_length": 43.95, "epoch": 0.5812854442344045, "format_reward": 0.0, "grad_norm": 9.702945709228516, "image_reward": 0.28443044126033784, "kl": 1.73483949303627, "learning_rate": 5e-06, "loss": 0.0958, "reward": 0.28035863041877745, "reward_std": 0.5182013310492039, "rewards/reward_func": 0.28035863041877745, "step": 2460, "toxic_reward": 3.8520292162895204 }, { "clip_ratio": 0.0, "completion_length": 43.925, "epoch": 0.583648393194707, "format_reward": -0.5, "grad_norm": 18.073659896850586, "image_reward": 0.24947459101676941, "kl": 2.9204909898340703, "learning_rate": 5e-06, "loss": -0.1939, "reward": 0.42990538477897644, "reward_std": 1.8428901416249572, "rewards/reward_func": 0.42990538477897644, "step": 2470, "toxic_reward": 3.7781980872154235 }, { "clip_ratio": 0.0, "completion_length": 43.175, "epoch": 0.5860113421550095, "format_reward": -0.5, "grad_norm": 4.270178318023682, "image_reward": 0.28282063752412795, "kl": 0.48990702964365485, "learning_rate": 5e-06, "loss": 0.0624, "reward": -0.30424859523773196, "reward_std": 1.5560518722981214, "rewards/reward_func": -0.30424859523773196, "step": 2480, "toxic_reward": 4.44784414768219 }, { "clip_ratio": 0.0, "completion_length": 38.9, "epoch": 0.5883742911153119, "format_reward": -0.5, "grad_norm": 7.575175762176514, "image_reward": 0.2695292145013809, "kl": 1.2654437847435474, "learning_rate": 5e-06, "loss": -0.1158, "reward": -0.44633115231990816, "reward_std": 1.8826897315680982, "rewards/reward_func": -0.44633115231990816, "step": 2490, "toxic_reward": 3.8135931372642515 }, { "clip_ratio": 0.0, "completion_length": 46.3, "epoch": 0.5907372400756143, "format_reward": 0.0, "grad_norm": 24.015722274780273, "image_reward": 0.26897684782743453, "kl": 5.640305678918958, "learning_rate": 5e-06, "loss": -0.1054, "reward": 0.6214121818542481, "reward_std": 0.9682584583759308, "rewards/reward_func": 0.6214121818542481, "step": 2500, "toxic_reward": 3.9037705421447755 }, { "clip_ratio": 0.0, "completion_length": 32.125, "epoch": 0.5931001890359168, "format_reward": -0.25, "grad_norm": 13.069973945617676, "image_reward": 0.2854502350091934, "kl": 11.71274044290185, "learning_rate": 5e-06, "loss": -0.077, "reward": -0.3511055693030357, "reward_std": 1.0736159782391042, "rewards/reward_func": -0.3511055693030357, "step": 2510, "toxic_reward": 3.7281174302101134 }, { "clip_ratio": 0.0, "completion_length": 40.375, "epoch": 0.5954631379962193, "format_reward": 0.0, "grad_norm": 2.0403361320495605, "image_reward": 0.2833099365234375, "kl": 0.6411756843328476, "learning_rate": 5e-06, "loss": 0.0435, "reward": 0.5592477023601532, "reward_std": 0.8428021136671304, "rewards/reward_func": 0.5592477023601532, "step": 2520, "toxic_reward": 3.6056689500808714 }, { "clip_ratio": 0.0, "completion_length": 50.925, "epoch": 0.5978260869565217, "format_reward": -0.5, "grad_norm": 2.7234652042388916, "image_reward": 0.2631998687982559, "kl": 2.588300554268062, "learning_rate": 5e-06, "loss": -0.0954, "reward": -0.11296717822551727, "reward_std": 1.059992153197527, "rewards/reward_func": -0.11296717822551727, "step": 2530, "toxic_reward": 4.310960650444031 }, { "clip_ratio": 0.0, "completion_length": 42.8, "epoch": 0.6001890359168242, "format_reward": 0.0, "grad_norm": 1.746839165687561, "image_reward": 0.27794291228055956, "kl": 0.12578147873282433, "learning_rate": 5e-06, "loss": -0.0894, "reward": 0.6603235125541687, "reward_std": 0.5662866534665227, "rewards/reward_func": 0.6603235125541687, "step": 2540, "toxic_reward": 4.165549850463867 }, { "clip_ratio": 0.0, "completion_length": 65.2, "epoch": 0.6025519848771267, "format_reward": 0.0, "grad_norm": 1.1635066270828247, "image_reward": 0.2584126806921429, "kl": 16.10209010541439, "learning_rate": 5e-06, "loss": -0.0047, "reward": 0.9701344430446625, "reward_std": 0.8910946477204561, "rewards/reward_func": 0.9701344430446625, "step": 2550, "toxic_reward": 3.8731188111835055 }, { "clip_ratio": 0.0, "completion_length": 44.975, "epoch": 0.6049149338374291, "format_reward": -1.0, "grad_norm": 3.505110502243042, "image_reward": 0.2755279541015625, "kl": 1.5500462669879198, "learning_rate": 5e-06, "loss": 0.0015, "reward": -0.1658882439136505, "reward_std": 2.0384394701570274, "rewards/reward_func": -0.1658882439136505, "step": 2560, "toxic_reward": 3.8778061270713806 }, { "clip_ratio": 0.0, "completion_length": 30.8, "epoch": 0.6072778827977315, "format_reward": 0.0, "grad_norm": 8.120704650878906, "image_reward": 0.2892588287591934, "kl": 2.1680047139525414, "learning_rate": 5e-06, "loss": 0.0399, "reward": 0.6697697341442108, "reward_std": 1.024929089844227, "rewards/reward_func": 0.6697697341442108, "step": 2570, "toxic_reward": 3.547108954191208 }, { "clip_ratio": 0.0, "completion_length": 63.225, "epoch": 0.6096408317580341, "format_reward": -1.0, "grad_norm": 9.57001781463623, "image_reward": 0.2734588623046875, "kl": 0.8948870234191417, "learning_rate": 5e-06, "loss": 0.0951, "reward": -0.7226251482963562, "reward_std": 2.448101815581322, "rewards/reward_func": -0.7226251482963562, "step": 2580, "toxic_reward": 4.320083689689636 }, { "clip_ratio": 0.0, "completion_length": 38.1, "epoch": 0.6120037807183365, "format_reward": 0.0, "grad_norm": 2.0496883392333984, "image_reward": 0.2865132659673691, "kl": 2.8105035655200483, "learning_rate": 5e-06, "loss": -0.0797, "reward": 0.568730728328228, "reward_std": 0.6556393213570118, "rewards/reward_func": 0.568730728328228, "step": 2590, "toxic_reward": 3.725440341234207 }, { "clip_ratio": 0.0, "completion_length": 45.075, "epoch": 0.6143667296786389, "format_reward": 0.0, "grad_norm": 10.353742599487305, "image_reward": 0.2710174560546875, "kl": 0.6778285041451454, "learning_rate": 5e-06, "loss": 0.0379, "reward": 0.2569525420665741, "reward_std": 0.597846270352602, "rewards/reward_func": 0.2569525420665741, "step": 2600, "toxic_reward": 4.306404328346252 }, { "clip_ratio": 0.0, "completion_length": 43.2, "epoch": 0.6167296786389413, "format_reward": -0.5, "grad_norm": 3.9594945907592773, "image_reward": 0.28432718813419344, "kl": 0.5540166199207306, "learning_rate": 5e-06, "loss": 0.0047, "reward": 0.5912085831165313, "reward_std": 1.5809811264276505, "rewards/reward_func": 0.5912085831165313, "step": 2610, "toxic_reward": 4.350194215774536 }, { "clip_ratio": 0.0, "completion_length": 53.0, "epoch": 0.6190926275992439, "format_reward": -0.25, "grad_norm": 7.203413963317871, "image_reward": 0.26516723483800886, "kl": 1.199559571594, "learning_rate": 5e-06, "loss": -0.0012, "reward": 0.2267006203532219, "reward_std": 1.142584490031004, "rewards/reward_func": 0.2267006203532219, "step": 2620, "toxic_reward": 3.9258982062339784 }, { "clip_ratio": 0.0, "completion_length": 44.95, "epoch": 0.6214555765595463, "format_reward": -0.25, "grad_norm": 6.998039722442627, "image_reward": 0.2901885986328125, "kl": 27.16859985589981, "learning_rate": 5e-06, "loss": -0.026, "reward": 0.13268216848373413, "reward_std": 1.2143183693289756, "rewards/reward_func": 0.13268216848373413, "step": 2630, "toxic_reward": 3.556568074226379 }, { "clip_ratio": 0.0, "completion_length": 48.8, "epoch": 0.6238185255198487, "format_reward": -0.25, "grad_norm": 13.862479209899902, "image_reward": 0.27274271547794343, "kl": 2.363949555903673, "learning_rate": 5e-06, "loss": -0.1093, "reward": 0.21172123551368713, "reward_std": 1.788407751917839, "rewards/reward_func": 0.21172123551368713, "step": 2640, "toxic_reward": 3.530658257007599 }, { "clip_ratio": 0.0, "completion_length": 41.45, "epoch": 0.6261814744801513, "format_reward": 0.0, "grad_norm": 6.451826095581055, "image_reward": 0.25544840544462205, "kl": 0.9077189475297928, "learning_rate": 5e-06, "loss": -0.1204, "reward": 0.17604875564575195, "reward_std": 0.7731596916913986, "rewards/reward_func": 0.17604875564575195, "step": 2650, "toxic_reward": 3.655298948287964 }, { "clip_ratio": 0.0, "completion_length": 52.375, "epoch": 0.6285444234404537, "format_reward": -0.25, "grad_norm": 15.447392463684082, "image_reward": 0.28090617060661316, "kl": 1.5149286333471537, "learning_rate": 5e-06, "loss": -0.0393, "reward": 0.6538720428943634, "reward_std": 1.4380803421139716, "rewards/reward_func": 0.6538720428943634, "step": 2660, "toxic_reward": 3.8757722854614256 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 0.6309073724007561, "format_reward": -0.5, "grad_norm": 5.443056583404541, "image_reward": 0.28455810546875, "kl": 2.1727461591362953, "learning_rate": 5e-06, "loss": 0.0129, "reward": -0.5597851276397705, "reward_std": 1.4988839238882066, "rewards/reward_func": -0.5597851276397705, "step": 2670, "toxic_reward": 3.852312761545181 }, { "clip_ratio": 0.0, "completion_length": 35.575, "epoch": 0.6332703213610587, "format_reward": 0.0, "grad_norm": 6.4276652336120605, "image_reward": 0.2711863175034523, "kl": 2.061120516061783, "learning_rate": 5e-06, "loss": -0.1249, "reward": 0.4250785157084465, "reward_std": 0.8246009856462478, "rewards/reward_func": 0.4250785157084465, "step": 2680, "toxic_reward": 3.8009597778320314 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 0.6356332703213611, "format_reward": 0.0, "grad_norm": 1.9922189712524414, "image_reward": 0.2824055999517441, "kl": 0.8832020409405231, "learning_rate": 5e-06, "loss": 0.0029, "reward": 0.2428468108177185, "reward_std": 0.7863198474049569, "rewards/reward_func": 0.2428468108177185, "step": 2690, "toxic_reward": 3.925771975517273 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 0.6379962192816635, "format_reward": -0.25, "grad_norm": 3.2788710594177246, "image_reward": 0.2625895172357559, "kl": 0.3067374438047409, "learning_rate": 5e-06, "loss": 0.0585, "reward": 0.0673605427145958, "reward_std": 1.2387039607390762, "rewards/reward_func": 0.0673605427145958, "step": 2700, "toxic_reward": 3.223780316114426 }, { "clip_ratio": 0.0, "completion_length": 38.7, "epoch": 0.6403591682419659, "format_reward": -0.5, "grad_norm": 18.068998336791992, "image_reward": 0.28381652683019637, "kl": 4.5763449721038345, "learning_rate": 5e-06, "loss": 0.0303, "reward": -0.08654462695121765, "reward_std": 1.6389019638299942, "rewards/reward_func": -0.08654462695121765, "step": 2710, "toxic_reward": 3.9132798612117767 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.6427221172022685, "format_reward": 0.0, "grad_norm": 16.331071853637695, "image_reward": 0.2924163818359375, "kl": 1.1359277203679086, "learning_rate": 5e-06, "loss": -0.0706, "reward": 0.6057616770267487, "reward_std": 0.7651574447751045, "rewards/reward_func": 0.6057616770267487, "step": 2720, "toxic_reward": 3.8298017740249635 }, { "clip_ratio": 0.0, "completion_length": 36.725, "epoch": 0.6450850661625709, "format_reward": -0.25, "grad_norm": 2.152521848678589, "image_reward": 0.3041951507329941, "kl": 8.785355818271636, "learning_rate": 5e-06, "loss": 0.0118, "reward": 0.3512896567583084, "reward_std": 1.3057980645447969, "rewards/reward_func": 0.3512896567583084, "step": 2730, "toxic_reward": 3.397814577817917 }, { "clip_ratio": 0.0, "completion_length": 32.675, "epoch": 0.6474480151228733, "format_reward": 0.0, "grad_norm": 20.01748275756836, "image_reward": 0.29010823667049407, "kl": 3.5924226850271226, "learning_rate": 5e-06, "loss": -0.1142, "reward": 0.7106038928031921, "reward_std": 0.8158069387078285, "rewards/reward_func": 0.7106038928031921, "step": 2740, "toxic_reward": 4.0692403554916385 }, { "clip_ratio": 0.0, "completion_length": 49.7, "epoch": 0.6498109640831758, "format_reward": -0.75, "grad_norm": 11.965126037597656, "image_reward": 0.26697489619255066, "kl": 1.9964583709836006, "learning_rate": 5e-06, "loss": -0.0406, "reward": -0.07322075963020325, "reward_std": 1.999936766922474, "rewards/reward_func": -0.07322075963020325, "step": 2750, "toxic_reward": 3.366811156272888 }, { "clip_ratio": 0.0, "completion_length": 41.475, "epoch": 0.6521739130434783, "format_reward": -0.5, "grad_norm": 26.80545997619629, "image_reward": 0.24265645444393158, "kl": 2.707187344133854, "learning_rate": 5e-06, "loss": 0.0971, "reward": -0.5061075001955032, "reward_std": 1.755505845695734, "rewards/reward_func": -0.5061075001955032, "step": 2760, "toxic_reward": 3.8667294502258303 }, { "clip_ratio": 0.0, "completion_length": 43.025, "epoch": 0.6545368620037807, "format_reward": 0.0, "grad_norm": 5.15554141998291, "image_reward": 0.271905517578125, "kl": 4.915832757204771, "learning_rate": 5e-06, "loss": -0.0126, "reward": 0.8192368298768997, "reward_std": 0.41571362912654874, "rewards/reward_func": 0.8192368298768997, "step": 2770, "toxic_reward": 4.089378929138183 }, { "clip_ratio": 0.0, "completion_length": 54.075, "epoch": 0.6568998109640832, "format_reward": 0.0, "grad_norm": 2.028783082962036, "image_reward": 0.27303365170955657, "kl": 1.8947554275393486, "learning_rate": 5e-06, "loss": 0.0468, "reward": -0.1263785183429718, "reward_std": 0.7480042926967144, "rewards/reward_func": -0.1263785183429718, "step": 2780, "toxic_reward": 4.165195155143738 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 0.6592627599243857, "format_reward": -0.5, "grad_norm": 17.42000961303711, "image_reward": 0.2864379853010178, "kl": 1.6738548278808594, "learning_rate": 5e-06, "loss": -0.0375, "reward": -0.2114594280719757, "reward_std": 1.560011611506343, "rewards/reward_func": -0.2114594280719757, "step": 2790, "toxic_reward": 4.024951922893524 }, { "clip_ratio": 0.0, "completion_length": 48.075, "epoch": 0.6616257088846881, "format_reward": 0.0, "grad_norm": 5.910866737365723, "image_reward": 0.2764821395277977, "kl": 2.9146203480660917, "learning_rate": 5e-06, "loss": -0.0104, "reward": 0.6828193128108978, "reward_std": 0.7127262264490127, "rewards/reward_func": 0.6828193128108978, "step": 2800, "toxic_reward": 4.108976912498474 }, { "clip_ratio": 0.0, "completion_length": 46.225, "epoch": 0.6639886578449905, "format_reward": -0.25, "grad_norm": 13.787774085998535, "image_reward": 0.2695404052734375, "kl": 2.044136567413807, "learning_rate": 5e-06, "loss": 0.1481, "reward": 0.1416476845741272, "reward_std": 0.9124870980158448, "rewards/reward_func": 0.1416476845741272, "step": 2810, "toxic_reward": 3.9404671788215637 }, { "clip_ratio": 0.0, "completion_length": 33.35, "epoch": 0.666351606805293, "format_reward": -0.25, "grad_norm": 9.458231925964355, "image_reward": 0.27503865361213686, "kl": 12.555490608513356, "learning_rate": 5e-06, "loss": 0.015, "reward": -0.398735374212265, "reward_std": 1.3145878296345472, "rewards/reward_func": -0.398735374212265, "step": 2820, "toxic_reward": 3.8166601181030275 }, { "clip_ratio": 0.0, "completion_length": 40.575, "epoch": 0.6687145557655955, "format_reward": 0.0, "grad_norm": 5.239807605743408, "image_reward": 0.2913035064935684, "kl": 4.1338134072721004, "learning_rate": 5e-06, "loss": -0.0082, "reward": 0.09673230051994323, "reward_std": 0.5237030681222677, "rewards/reward_func": 0.09673230051994323, "step": 2830, "toxic_reward": 3.791787397861481 }, { "clip_ratio": 0.0, "completion_length": 74.05, "epoch": 0.6710775047258979, "format_reward": 0.0, "grad_norm": 3.1467976570129395, "image_reward": 0.2838506057858467, "kl": 18.177365225553512, "learning_rate": 5e-06, "loss": 0.21, "reward": 0.40501208901405333, "reward_std": 0.894443211145699, "rewards/reward_func": 0.40501208901405333, "step": 2840, "toxic_reward": 3.982026219367981 }, { "clip_ratio": 0.0, "completion_length": 50.05, "epoch": 0.6734404536862004, "format_reward": -0.25, "grad_norm": 4.421890735626221, "image_reward": 0.2822255462408066, "kl": 4.33959369957447, "learning_rate": 5e-06, "loss": -0.0624, "reward": -0.05263040065765381, "reward_std": 1.2849599719047546, "rewards/reward_func": -0.05263040065765381, "step": 2850, "toxic_reward": 3.633159136772156 }, { "clip_ratio": 0.0, "completion_length": 40.2, "epoch": 0.6758034026465028, "format_reward": 0.0, "grad_norm": 2.5038645267486572, "image_reward": 0.2830434158444405, "kl": 0.6373848512768745, "learning_rate": 5e-06, "loss": -0.0724, "reward": 0.627695482969284, "reward_std": 0.8375864863395691, "rewards/reward_func": 0.627695482969284, "step": 2860, "toxic_reward": 2.48615984916687 }, { "clip_ratio": 0.0, "completion_length": 53.45, "epoch": 0.6781663516068053, "format_reward": -0.75, "grad_norm": 13.282075881958008, "image_reward": 0.27708842009305956, "kl": 0.9827784240245819, "learning_rate": 5e-06, "loss": 0.0701, "reward": -0.892215234041214, "reward_std": 2.255379121750593, "rewards/reward_func": -0.892215234041214, "step": 2870, "toxic_reward": 3.7220635175704957 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 0.6805293005671077, "format_reward": 0.0, "grad_norm": 12.856422424316406, "image_reward": 0.2848948180675507, "kl": 1.0351120814681054, "learning_rate": 5e-06, "loss": 0.0319, "reward": 0.2441805601119995, "reward_std": 0.7333651419728995, "rewards/reward_func": 0.2441805601119995, "step": 2880, "toxic_reward": 3.4050124049186707 }, { "clip_ratio": 0.0, "completion_length": 39.925, "epoch": 0.6828922495274102, "format_reward": 0.0, "grad_norm": 17.430034637451172, "image_reward": 0.2576904296875, "kl": 0.8548611015081405, "learning_rate": 5e-06, "loss": 0.0552, "reward": 0.17943925857543946, "reward_std": 1.0328819096088409, "rewards/reward_func": 0.17943925857543946, "step": 2890, "toxic_reward": 3.6138275027275086 }, { "clip_ratio": 0.0, "completion_length": 48.725, "epoch": 0.6852551984877127, "format_reward": -0.5, "grad_norm": 8.174365997314453, "image_reward": 0.27861836850643157, "kl": 2.0340675324201585, "learning_rate": 5e-06, "loss": 0.1112, "reward": 0.21913965195417404, "reward_std": 1.3600813373923302, "rewards/reward_func": 0.21913965195417404, "step": 2900, "toxic_reward": 3.973111832141876 }, { "clip_ratio": 0.0, "completion_length": 41.375, "epoch": 0.6876181474480151, "format_reward": 0.0, "grad_norm": 9.611124992370605, "image_reward": 0.2759572356939316, "kl": 1.901711493730545, "learning_rate": 5e-06, "loss": 0.0008, "reward": 0.7438287258148193, "reward_std": 0.6283189944922924, "rewards/reward_func": 0.7438287258148193, "step": 2910, "toxic_reward": 3.9766014724969865 }, { "clip_ratio": 0.0, "completion_length": 71.675, "epoch": 0.6899810964083176, "format_reward": -0.75, "grad_norm": 4.556710243225098, "image_reward": 0.25573730319738386, "kl": 2.2221992775797843, "learning_rate": 5e-06, "loss": 0.0776, "reward": -0.4340919256210327, "reward_std": 1.778307182714343, "rewards/reward_func": -0.4340919256210327, "step": 2920, "toxic_reward": 4.26712441444397 }, { "clip_ratio": 0.0, "completion_length": 48.45, "epoch": 0.69234404536862, "format_reward": -0.25, "grad_norm": 8.245325088500977, "image_reward": 0.28000691831111907, "kl": 1.5203486174345016, "learning_rate": 5e-06, "loss": -0.0075, "reward": 0.38065839409828184, "reward_std": 1.2137143149971963, "rewards/reward_func": 0.38065839409828184, "step": 2930, "toxic_reward": 4.011340999603272 }, { "clip_ratio": 0.0, "completion_length": 46.95, "epoch": 0.6947069943289225, "format_reward": -0.5, "grad_norm": 43.48079299926758, "image_reward": 0.28537089079618455, "kl": 4.194944667816162, "learning_rate": 5e-06, "loss": -0.0192, "reward": -0.4992818832397461, "reward_std": 1.652469713240862, "rewards/reward_func": -0.4992818832397461, "step": 2940, "toxic_reward": 3.73269322514534 }, { "clip_ratio": 0.0, "completion_length": 41.375, "epoch": 0.697069943289225, "format_reward": -0.25, "grad_norm": 2.9284157752990723, "image_reward": 0.29124247282743454, "kl": 1.9233473122119904, "learning_rate": 5e-06, "loss": -0.0558, "reward": 0.31386570632457733, "reward_std": 1.3490888617932797, "rewards/reward_func": 0.31386570632457733, "step": 2950, "toxic_reward": 3.3832929611206053 }, { "clip_ratio": 0.0, "completion_length": 37.9, "epoch": 0.6994328922495274, "format_reward": -0.75, "grad_norm": 5.489762306213379, "image_reward": 0.27443746030330657, "kl": 11.033294987678527, "learning_rate": 5e-06, "loss": 0.034, "reward": -0.6967712700366974, "reward_std": 1.7560975707136095, "rewards/reward_func": -0.6967712700366974, "step": 2960, "toxic_reward": 4.0662164211273195 }, { "clip_ratio": 0.0, "completion_length": 50.575, "epoch": 0.7017958412098299, "format_reward": -0.25, "grad_norm": 18.22649574279785, "image_reward": 0.26611735075712206, "kl": 3.9552819430828094, "learning_rate": 5e-06, "loss": -0.097, "reward": 0.2059646487236023, "reward_std": 1.4741453856229783, "rewards/reward_func": 0.2059646487236023, "step": 2970, "toxic_reward": 3.947977590560913 }, { "clip_ratio": 0.0, "completion_length": 39.25, "epoch": 0.7041587901701323, "format_reward": 0.0, "grad_norm": 17.15437889099121, "image_reward": 0.29809672236442564, "kl": 2.7566053330898286, "learning_rate": 5e-06, "loss": 0.008, "reward": 0.5703202053904534, "reward_std": 0.8202566847205162, "rewards/reward_func": 0.5703202053904534, "step": 2980, "toxic_reward": 3.724568712711334 }, { "clip_ratio": 0.0, "completion_length": 37.325, "epoch": 0.7065217391304348, "format_reward": -0.5, "grad_norm": 24.218904495239258, "image_reward": 0.26676025390625, "kl": 2.1775312602519987, "learning_rate": 5e-06, "loss": 0.0501, "reward": 0.04957394301891327, "reward_std": 1.5467448111623525, "rewards/reward_func": 0.04957394301891327, "step": 2990, "toxic_reward": 3.3577624768018723 }, { "clip_ratio": 0.0, "completion_length": 47.1, "epoch": 0.7088846880907372, "format_reward": -0.25, "grad_norm": 4.296006679534912, "image_reward": 0.286920166015625, "kl": 1.4036121606826781, "learning_rate": 5e-06, "loss": -0.0237, "reward": 0.3820555150508881, "reward_std": 1.188760439120233, "rewards/reward_func": 0.3820555150508881, "step": 3000, "toxic_reward": 4.305025839805603 }, { "clip_ratio": 0.0, "completion_length": 39.9, "epoch": 0.7112476370510397, "format_reward": -0.5, "grad_norm": 20.778005599975586, "image_reward": 0.30558042062653434, "kl": 1.4864997833967208, "learning_rate": 5e-06, "loss": 0.0741, "reward": -0.08508440256118774, "reward_std": 1.637317718565464, "rewards/reward_func": -0.08508440256118774, "step": 3010, "toxic_reward": 4.079210705227322 }, { "clip_ratio": 0.0, "completion_length": 41.95, "epoch": 0.7136105860113422, "format_reward": -0.25, "grad_norm": 4.398971080780029, "image_reward": 0.2982396438717842, "kl": 36.805122749507426, "learning_rate": 5e-06, "loss": 0.0657, "reward": -0.5174520492553711, "reward_std": 1.1937666054815055, "rewards/reward_func": -0.5174520492553711, "step": 3020, "toxic_reward": 4.007938003540039 }, { "clip_ratio": 0.0, "completion_length": 43.35, "epoch": 0.7159735349716446, "format_reward": -0.5, "grad_norm": 13.247093200683594, "image_reward": 0.26703898310661317, "kl": 1.961264681816101, "learning_rate": 5e-06, "loss": -0.0169, "reward": -0.05578238368034363, "reward_std": 1.441930427402258, "rewards/reward_func": -0.05578238368034363, "step": 3030, "toxic_reward": 4.098655521869659 }, { "clip_ratio": 0.0, "completion_length": 43.275, "epoch": 0.718336483931947, "format_reward": 0.0, "grad_norm": 23.132400512695312, "image_reward": 0.2803761810064316, "kl": 26.18696767091751, "learning_rate": 5e-06, "loss": 0.0457, "reward": 0.49192982316017153, "reward_std": 0.6619096536189317, "rewards/reward_func": 0.49192982316017153, "step": 3040, "toxic_reward": 4.184990978240966 }, { "clip_ratio": 0.0, "completion_length": 53.35, "epoch": 0.7206994328922496, "format_reward": -0.25, "grad_norm": 28.89768409729004, "image_reward": 0.2594024658203125, "kl": 2.125564157962799, "learning_rate": 5e-06, "loss": 0.0056, "reward": -0.18149735927581787, "reward_std": 1.4747628048062325, "rewards/reward_func": -0.18149735927581787, "step": 3050, "toxic_reward": 3.5373760223388673 }, { "clip_ratio": 0.0, "completion_length": 48.95, "epoch": 0.723062381852552, "format_reward": -0.5, "grad_norm": 17.26774787902832, "image_reward": 0.29308573305606844, "kl": 13.767069751024247, "learning_rate": 5e-06, "loss": 0.0058, "reward": 0.02059091329574585, "reward_std": 1.5365628942847251, "rewards/reward_func": 0.02059091329574585, "step": 3060, "toxic_reward": 3.280546021461487 }, { "clip_ratio": 0.0, "completion_length": 45.65, "epoch": 0.7254253308128544, "format_reward": -0.5, "grad_norm": 16.69405746459961, "image_reward": 0.27832234650850296, "kl": 2.4563605159521105, "learning_rate": 5e-06, "loss": 0.0249, "reward": -0.4515081524848938, "reward_std": 1.2287155898287891, "rewards/reward_func": -0.4515081524848938, "step": 3070, "toxic_reward": 3.398514473438263 }, { "clip_ratio": 0.0, "completion_length": 58.775, "epoch": 0.7277882797731569, "format_reward": -0.5, "grad_norm": 7.214962482452393, "image_reward": 0.27323404848575594, "kl": 1.8898794114589692, "learning_rate": 5e-06, "loss": -0.0003, "reward": -0.4889628529548645, "reward_std": 1.4541106900200247, "rewards/reward_func": -0.4889628529548645, "step": 3080, "toxic_reward": 4.181539106369018 }, { "clip_ratio": 0.0, "completion_length": 52.1, "epoch": 0.7301512287334594, "format_reward": -0.5, "grad_norm": 1.7761129140853882, "image_reward": 0.27235768735408783, "kl": 2.1771215945482254, "learning_rate": 5e-06, "loss": -0.0612, "reward": 0.1671779692173004, "reward_std": 1.4098370391875505, "rewards/reward_func": 0.1671779692173004, "step": 3090, "toxic_reward": 3.8193355441093444 }, { "clip_ratio": 0.0, "completion_length": 36.85, "epoch": 0.7325141776937618, "format_reward": -0.5, "grad_norm": 5.988401412963867, "image_reward": 0.25819803923368456, "kl": 0.8303129658102989, "learning_rate": 5e-06, "loss": -0.0141, "reward": 0.20456358194351196, "reward_std": 1.755793434381485, "rewards/reward_func": 0.20456358194351196, "step": 3100, "toxic_reward": 3.5276977360248565 }, { "clip_ratio": 0.0, "completion_length": 45.825, "epoch": 0.7348771266540642, "format_reward": 0.0, "grad_norm": 63.649696350097656, "image_reward": 0.26085103303194046, "kl": 2.509291835129261, "learning_rate": 5e-06, "loss": -0.0945, "reward": 0.34231345951557157, "reward_std": 1.2596007108688354, "rewards/reward_func": 0.34231345951557157, "step": 3110, "toxic_reward": 3.680406093597412 }, { "clip_ratio": 0.0, "completion_length": 42.675, "epoch": 0.7372400756143668, "format_reward": 0.0, "grad_norm": 13.457945823669434, "image_reward": 0.2661163330078125, "kl": 3.3423233568668365, "learning_rate": 5e-06, "loss": -0.0837, "reward": 0.21805171072483062, "reward_std": 1.0620483674108983, "rewards/reward_func": 0.21805171072483062, "step": 3120, "toxic_reward": 3.4958622455596924 }, { "clip_ratio": 0.0, "completion_length": 43.25, "epoch": 0.7396030245746692, "format_reward": 0.0, "grad_norm": 1.3712886571884155, "image_reward": 0.280279541015625, "kl": 0.5368543028831482, "learning_rate": 5e-06, "loss": 0.0856, "reward": -0.08258238434791565, "reward_std": 0.7678581360727549, "rewards/reward_func": -0.08258238434791565, "step": 3130, "toxic_reward": 4.090320491790772 }, { "clip_ratio": 0.0, "completion_length": 51.75, "epoch": 0.7419659735349716, "format_reward": -0.25, "grad_norm": 33.164817810058594, "image_reward": 0.263348388671875, "kl": 3.852606762945652, "learning_rate": 5e-06, "loss": -0.1711, "reward": 0.23493566811084748, "reward_std": 1.2882447349838912, "rewards/reward_func": 0.23493566811084748, "step": 3140, "toxic_reward": 3.8201312363147735 }, { "clip_ratio": 0.0, "completion_length": 41.275, "epoch": 0.744328922495274, "format_reward": -0.25, "grad_norm": 23.956363677978516, "image_reward": 0.31206461489200593, "kl": 0.8646048396825791, "learning_rate": 5e-06, "loss": -0.0402, "reward": 0.018216264247894288, "reward_std": 1.226612313091755, "rewards/reward_func": 0.018216264247894288, "step": 3150, "toxic_reward": 3.7303581714630125 }, { "clip_ratio": 0.0, "completion_length": 42.025, "epoch": 0.7466918714555766, "format_reward": -0.25, "grad_norm": 7.992063999176025, "image_reward": 0.283380126953125, "kl": 1.2732116781175136, "learning_rate": 5e-06, "loss": -0.091, "reward": 0.7706227093935013, "reward_std": 1.4939947571605443, "rewards/reward_func": 0.7706227093935013, "step": 3160, "toxic_reward": 3.5458990573883056 }, { "clip_ratio": 0.0, "completion_length": 35.05, "epoch": 0.749054820415879, "format_reward": 0.0, "grad_norm": 26.938879013061523, "image_reward": 0.2929168701171875, "kl": 4.621248189732432, "learning_rate": 5e-06, "loss": -0.0543, "reward": 0.26634013652801514, "reward_std": 0.6591222167015076, "rewards/reward_func": 0.26634013652801514, "step": 3170, "toxic_reward": 4.090583860874176 }, { "clip_ratio": 0.0, "completion_length": 45.825, "epoch": 0.7514177693761814, "format_reward": -0.5, "grad_norm": 1.0445924997329712, "image_reward": 0.27820536196231843, "kl": 1.2374065339565277, "learning_rate": 5e-06, "loss": 0.0777, "reward": -0.1608543336391449, "reward_std": 0.9281521745026111, "rewards/reward_func": -0.1608543336391449, "step": 3180, "toxic_reward": 3.968969798088074 }, { "clip_ratio": 0.0, "completion_length": 42.8, "epoch": 0.753780718336484, "format_reward": 0.0, "grad_norm": 26.366165161132812, "image_reward": 0.27580566257238387, "kl": 11.984261164069176, "learning_rate": 5e-06, "loss": 0.0176, "reward": 0.36088051795959475, "reward_std": 0.737302597053349, "rewards/reward_func": 0.36088051795959475, "step": 3190, "toxic_reward": 4.220689821243286 }, { "clip_ratio": 0.0, "completion_length": 43.525, "epoch": 0.7561436672967864, "format_reward": 0.0, "grad_norm": 15.66350269317627, "image_reward": 0.2542442321777344, "kl": 1.2004614934325217, "learning_rate": 5e-06, "loss": -0.1499, "reward": 0.916411966085434, "reward_std": 1.1410479605197907, "rewards/reward_func": 0.916411966085434, "step": 3200, "toxic_reward": 3.5325961112976074 }, { "clip_ratio": 0.0, "completion_length": 32.375, "epoch": 0.7585066162570888, "format_reward": 0.0, "grad_norm": 4.70230770111084, "image_reward": 0.30909423828125, "kl": 2.567577276751399, "learning_rate": 5e-06, "loss": -0.0471, "reward": 0.20682075023651122, "reward_std": 0.5303860757499933, "rewards/reward_func": 0.20682075023651122, "step": 3210, "toxic_reward": 3.270925796031952 }, { "clip_ratio": 0.0, "completion_length": 49.05, "epoch": 0.7608695652173914, "format_reward": 0.0, "grad_norm": 10.79295539855957, "image_reward": 0.25408528596162794, "kl": 2.86144537627697, "learning_rate": 5e-06, "loss": -0.0379, "reward": 0.6601236045360566, "reward_std": 0.7253405870869756, "rewards/reward_func": 0.6601236045360566, "step": 3220, "toxic_reward": 4.3341371536254885 }, { "clip_ratio": 0.0, "completion_length": 42.725, "epoch": 0.7632325141776938, "format_reward": -0.25, "grad_norm": 6.788066387176514, "image_reward": 0.2796641021966934, "kl": 5.517164082825184, "learning_rate": 5e-06, "loss": 0.1356, "reward": -0.18339840769767762, "reward_std": 1.0695885993540286, "rewards/reward_func": -0.18339840769767762, "step": 3230, "toxic_reward": 4.077802658081055 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 0.7655954631379962, "format_reward": -0.5, "grad_norm": 39.19500732421875, "image_reward": 0.2956451416015625, "kl": 0.7065762653946877, "learning_rate": 5e-06, "loss": -0.047, "reward": -0.26765223741531374, "reward_std": 1.6595379646867514, "rewards/reward_func": -0.26765223741531374, "step": 3240, "toxic_reward": 3.865544855594635 }, { "clip_ratio": 0.0, "completion_length": 40.675, "epoch": 0.7679584120982986, "format_reward": 0.0, "grad_norm": 8.50940990447998, "image_reward": 0.28647562563419343, "kl": 4.316986609622836, "learning_rate": 5e-06, "loss": 0.124, "reward": 0.6616093635559082, "reward_std": 1.070189495384693, "rewards/reward_func": 0.6616093635559082, "step": 3250, "toxic_reward": 3.284928467869759 }, { "clip_ratio": 0.0, "completion_length": 40.175, "epoch": 0.7703213610586012, "format_reward": 0.0, "grad_norm": 21.128314971923828, "image_reward": 0.2679835006594658, "kl": 4.375968629121781, "learning_rate": 5e-06, "loss": 0.0579, "reward": 0.3372311323881149, "reward_std": 0.869463924318552, "rewards/reward_func": 0.3372311323881149, "step": 3260, "toxic_reward": 3.78046395778656 }, { "clip_ratio": 0.0, "completion_length": 39.5, "epoch": 0.7726843100189036, "format_reward": 0.0, "grad_norm": 7.558209419250488, "image_reward": 0.2745330810546875, "kl": 2.3013378672301767, "learning_rate": 5e-06, "loss": -0.031, "reward": 0.784791512787342, "reward_std": 0.8750310368835926, "rewards/reward_func": 0.784791512787342, "step": 3270, "toxic_reward": 3.3566872388124467 }, { "clip_ratio": 0.0, "completion_length": 39.025, "epoch": 0.775047258979206, "format_reward": -0.5, "grad_norm": 2.0432510375976562, "image_reward": 0.2913625091314316, "kl": 0.3781319923698902, "learning_rate": 5e-06, "loss": -0.0607, "reward": -0.3740895688533783, "reward_std": 1.350129895284772, "rewards/reward_func": -0.3740895688533783, "step": 3280, "toxic_reward": 4.018161624670029 }, { "clip_ratio": 0.0, "completion_length": 47.825, "epoch": 0.7774102079395085, "format_reward": -0.5, "grad_norm": 3.16352915763855, "image_reward": 0.27322998046875, "kl": 0.3104788601398468, "learning_rate": 5e-06, "loss": 0.0334, "reward": -0.05296646356582642, "reward_std": 1.2258484821766615, "rewards/reward_func": -0.05296646356582642, "step": 3290, "toxic_reward": 4.389449417591095 }, { "clip_ratio": 0.0, "completion_length": 35.925, "epoch": 0.779773156899811, "format_reward": 0.0, "grad_norm": 4.182164669036865, "image_reward": 0.2737925201654434, "kl": 1.2850206293165685, "learning_rate": 5e-06, "loss": -0.1213, "reward": 0.6742017388343811, "reward_std": 0.736553730070591, "rewards/reward_func": 0.6742017388343811, "step": 3300, "toxic_reward": 4.220770263671875 }, { "clip_ratio": 0.0, "completion_length": 55.175, "epoch": 0.7821361058601134, "format_reward": -0.25, "grad_norm": 8.606978416442871, "image_reward": 0.2700215637683868, "kl": 4.289887800067663, "learning_rate": 5e-06, "loss": -0.1432, "reward": 0.30965389013290406, "reward_std": 1.063696064054966, "rewards/reward_func": 0.30965389013290406, "step": 3310, "toxic_reward": 4.136632585525513 }, { "clip_ratio": 0.0, "completion_length": 40.45, "epoch": 0.7844990548204159, "format_reward": 0.0, "grad_norm": 3.973367691040039, "image_reward": 0.25255330502986906, "kl": 6.628417156636715, "learning_rate": 5e-06, "loss": -0.0524, "reward": 0.2595418691635132, "reward_std": 0.6656439051032066, "rewards/reward_func": 0.2595418691635132, "step": 3320, "toxic_reward": 3.946825695037842 }, { "clip_ratio": 0.0, "completion_length": 44.625, "epoch": 0.7868620037807184, "format_reward": 0.0, "grad_norm": 5.275523662567139, "image_reward": 0.29005940854549406, "kl": 25.900663439184427, "learning_rate": 5e-06, "loss": -0.1344, "reward": 0.8005503177642822, "reward_std": 0.9713124742731452, "rewards/reward_func": 0.8005503177642822, "step": 3330, "toxic_reward": 4.047469854354858 }, { "clip_ratio": 0.0, "completion_length": 36.55, "epoch": 0.7892249527410208, "format_reward": 0.0, "grad_norm": 5.920967102050781, "image_reward": 0.2706329345703125, "kl": 2.892443811520934, "learning_rate": 5e-06, "loss": -0.028, "reward": 0.7794641971588134, "reward_std": 0.7315312433987856, "rewards/reward_func": 0.7794641971588134, "step": 3340, "toxic_reward": 4.036288380622864 }, { "clip_ratio": 0.0, "completion_length": 49.325, "epoch": 0.7915879017013232, "format_reward": -0.25, "grad_norm": 19.411304473876953, "image_reward": 0.2590001419186592, "kl": 0.762314885109663, "learning_rate": 5e-06, "loss": 0.0738, "reward": -0.22335948944091796, "reward_std": 1.229094560444355, "rewards/reward_func": -0.22335948944091796, "step": 3350, "toxic_reward": 4.078046441078186 }, { "clip_ratio": 0.0, "completion_length": 42.825, "epoch": 0.7939508506616257, "format_reward": -0.25, "grad_norm": 9.397270202636719, "image_reward": 0.25453638202614254, "kl": 1.7116897955536843, "learning_rate": 5e-06, "loss": -0.0637, "reward": -0.23146066069602966, "reward_std": 1.4809592371806501, "rewards/reward_func": -0.23146066069602966, "step": 3360, "toxic_reward": 3.7261215580834284 }, { "clip_ratio": 0.0, "completion_length": 36.075, "epoch": 0.7963137996219282, "format_reward": 0.0, "grad_norm": 2.7069385051727295, "image_reward": 0.29098663330078123, "kl": 0.2713630013167858, "learning_rate": 5e-06, "loss": -0.0331, "reward": 0.2162942558526993, "reward_std": 0.7098794117569923, "rewards/reward_func": 0.2162942558526993, "step": 3370, "toxic_reward": 3.5313488602638246 }, { "clip_ratio": 0.0, "completion_length": 40.425, "epoch": 0.7986767485822306, "format_reward": 0.0, "grad_norm": 5.024960041046143, "image_reward": 0.2739929184317589, "kl": 18.83201899640262, "learning_rate": 5e-06, "loss": -0.0017, "reward": 0.4777979046106339, "reward_std": 1.240721021965146, "rewards/reward_func": 0.4777979046106339, "step": 3380, "toxic_reward": 3.522536587715149 }, { "clip_ratio": 0.0, "completion_length": 41.725, "epoch": 0.8010396975425331, "format_reward": 0.0, "grad_norm": 4.2769622802734375, "image_reward": 0.2533442169427872, "kl": 8.165257753431797, "learning_rate": 5e-06, "loss": -0.0776, "reward": 0.44249573945999143, "reward_std": 0.8017176885157824, "rewards/reward_func": 0.44249573945999143, "step": 3390, "toxic_reward": 4.304618096351623 }, { "clip_ratio": 0.0, "completion_length": 33.85, "epoch": 0.8034026465028355, "format_reward": -0.75, "grad_norm": 6.779478549957275, "image_reward": 0.254600016772747, "kl": 3.2404680982232095, "learning_rate": 5e-06, "loss": -0.0622, "reward": 1.1459296941757202, "reward_std": 2.3130407273769378, "rewards/reward_func": 1.1459296941757202, "step": 3400, "toxic_reward": 3.946528363227844 }, { "clip_ratio": 0.0, "completion_length": 49.475, "epoch": 0.805765595463138, "format_reward": -0.75, "grad_norm": 9.082526206970215, "image_reward": 0.2917332977056503, "kl": 1.1568331263959408, "learning_rate": 5e-06, "loss": -0.087, "reward": -0.22108137607574463, "reward_std": 2.109957142919302, "rewards/reward_func": -0.22108137607574463, "step": 3410, "toxic_reward": 3.467973506450653 }, { "clip_ratio": 0.0, "completion_length": 46.225, "epoch": 0.8081285444234405, "format_reward": 0.0, "grad_norm": 4.748640537261963, "image_reward": 0.2629201263189316, "kl": 1.277670707181096, "learning_rate": 5e-06, "loss": 0.0912, "reward": 0.5041390061378479, "reward_std": 1.1238155417144298, "rewards/reward_func": 0.5041390061378479, "step": 3420, "toxic_reward": 3.6773669004440306 }, { "clip_ratio": 0.0, "completion_length": 54.875, "epoch": 0.8104914933837429, "format_reward": -0.25, "grad_norm": 6.2929182052612305, "image_reward": 0.2664311736822128, "kl": 0.41112807476893065, "learning_rate": 5e-06, "loss": -0.0185, "reward": 0.25773588865995406, "reward_std": 1.1975380808115006, "rewards/reward_func": 0.25773588865995406, "step": 3430, "toxic_reward": 3.5475049674510957 }, { "clip_ratio": 0.0, "completion_length": 31.2, "epoch": 0.8128544423440454, "format_reward": 0.0, "grad_norm": 2.984248399734497, "image_reward": 0.29365132600069044, "kl": 6.331273209676146, "learning_rate": 5e-06, "loss": -0.0094, "reward": -0.1625719666481018, "reward_std": 0.864103776961565, "rewards/reward_func": -0.1625719666481018, "step": 3440, "toxic_reward": 3.899219441413879 }, { "clip_ratio": 0.0, "completion_length": 59.075, "epoch": 0.8152173913043478, "format_reward": 0.0, "grad_norm": 5.733253479003906, "image_reward": 0.28086344301700594, "kl": 9.025772982649505, "learning_rate": 5e-06, "loss": -0.0245, "reward": 0.4977319598197937, "reward_std": 0.6485220491886139, "rewards/reward_func": 0.4977319598197937, "step": 3450, "toxic_reward": 3.7944631457328795 }, { "clip_ratio": 0.0, "completion_length": 33.25, "epoch": 0.8175803402646503, "format_reward": -0.25, "grad_norm": 5.605562686920166, "image_reward": 0.281341552734375, "kl": 0.665616973862052, "learning_rate": 5e-06, "loss": -0.0068, "reward": -0.26609439849853517, "reward_std": 1.4688232390210032, "rewards/reward_func": -0.26609439849853517, "step": 3460, "toxic_reward": 3.4837970972061156 }, { "clip_ratio": 0.0, "completion_length": 52.475, "epoch": 0.8199432892249527, "format_reward": -0.5, "grad_norm": 2.6239664554595947, "image_reward": 0.2684331268072128, "kl": 1.5078430883586407, "learning_rate": 5e-06, "loss": 0.1555, "reward": -0.12674018144607543, "reward_std": 1.4365263484418391, "rewards/reward_func": -0.12674018144607543, "step": 3470, "toxic_reward": 4.5540220737457275 }, { "clip_ratio": 0.0, "completion_length": 49.975, "epoch": 0.8223062381852552, "format_reward": -0.25, "grad_norm": 8.734126091003418, "image_reward": 0.2604085296392441, "kl": 24.937382932007313, "learning_rate": 5e-06, "loss": -0.0011, "reward": -0.20797204971313477, "reward_std": 0.9569237198680639, "rewards/reward_func": -0.20797204971313477, "step": 3480, "toxic_reward": 4.407214689254761 }, { "clip_ratio": 0.0, "completion_length": 60.625, "epoch": 0.8246691871455577, "format_reward": -0.25, "grad_norm": 1.5907628536224365, "image_reward": 0.2842885345220566, "kl": 0.051335761044174436, "learning_rate": 5e-06, "loss": 0.0047, "reward": 0.9079252362251282, "reward_std": 1.1933536015450954, "rewards/reward_func": 0.9079252362251282, "step": 3490, "toxic_reward": 4.199088740348816 }, { "clip_ratio": 0.0, "completion_length": 39.75, "epoch": 0.8270321361058601, "format_reward": -1.0, "grad_norm": 1.5189018249511719, "image_reward": 0.2938863128423691, "kl": 6.8780351031571625, "learning_rate": 5e-06, "loss": 0.0792, "reward": -0.6177874624729156, "reward_std": 1.9702259879559278, "rewards/reward_func": -0.6177874624729156, "step": 3500, "toxic_reward": 3.7184417486190795 }, { "clip_ratio": 0.0, "completion_length": 44.075, "epoch": 0.8293950850661626, "format_reward": -0.25, "grad_norm": 0.5691888928413391, "image_reward": 0.2792378753423691, "kl": 0.0625603836029768, "learning_rate": 5e-06, "loss": -0.0245, "reward": 0.39415156543254853, "reward_std": 0.7689090168103576, "rewards/reward_func": 0.39415156543254853, "step": 3510, "toxic_reward": 4.210167169570923 }, { "clip_ratio": 0.0, "completion_length": 48.675, "epoch": 0.831758034026465, "format_reward": 0.0, "grad_norm": 2.8700907230377197, "image_reward": 0.263861083984375, "kl": 0.3225065166130662, "learning_rate": 5e-06, "loss": -0.0609, "reward": 0.6327012300491333, "reward_std": 0.980434575676918, "rewards/reward_func": 0.6327012300491333, "step": 3520, "toxic_reward": 3.8261560261249543 }, { "clip_ratio": 0.0, "completion_length": 40.05, "epoch": 0.8341209829867675, "format_reward": 0.0, "grad_norm": 1.0346537828445435, "image_reward": 0.284771728515625, "kl": 0.40898411339148877, "learning_rate": 5e-06, "loss": -0.0307, "reward": 0.30759164690971375, "reward_std": 0.6451162457466125, "rewards/reward_func": 0.30759164690971375, "step": 3530, "toxic_reward": 4.171144628524781 }, { "clip_ratio": 0.0, "completion_length": 55.3, "epoch": 0.8364839319470699, "format_reward": -0.5, "grad_norm": 5.951425075531006, "image_reward": 0.3035013824701309, "kl": 11.781208837591112, "learning_rate": 5e-06, "loss": 0.0257, "reward": -0.1725111722946167, "reward_std": 1.8102335507050156, "rewards/reward_func": -0.1725111722946167, "step": 3540, "toxic_reward": 3.670738685131073 }, { "clip_ratio": 0.0, "completion_length": 68.975, "epoch": 0.8388468809073724, "format_reward": 0.0, "grad_norm": 14.631609916687012, "image_reward": 0.2706085205078125, "kl": 4.56192576661706, "learning_rate": 5e-06, "loss": -0.033, "reward": 1.4811566695570946, "reward_std": 0.9509499605745078, "rewards/reward_func": 1.4811566695570946, "step": 3550, "toxic_reward": 3.4709715723991392 }, { "clip_ratio": 0.0, "completion_length": 41.225, "epoch": 0.8412098298676749, "format_reward": -0.5, "grad_norm": 7.567039489746094, "image_reward": 0.2595652252435684, "kl": 10.069495621696115, "learning_rate": 5e-06, "loss": -0.057, "reward": 0.034914278984069826, "reward_std": 2.0578875496983526, "rewards/reward_func": 0.034914278984069826, "step": 3560, "toxic_reward": 3.711397814750671 }, { "clip_ratio": 0.0, "completion_length": 44.9, "epoch": 0.8435727788279773, "format_reward": 0.0, "grad_norm": 2.5686023235321045, "image_reward": 0.27446797788143157, "kl": 1.5964453139342367, "learning_rate": 5e-06, "loss": -0.031, "reward": 0.41478089690208436, "reward_std": 0.6242949636653066, "rewards/reward_func": 0.41478089690208436, "step": 3570, "toxic_reward": 4.057631134986877 }, { "clip_ratio": 0.0, "completion_length": 38.725, "epoch": 0.8459357277882797, "format_reward": -0.75, "grad_norm": 18.62441062927246, "image_reward": 0.26212361752986907, "kl": 18.3546858407557, "learning_rate": 5e-06, "loss": -0.0125, "reward": 0.3845840930938721, "reward_std": 2.4349112689495085, "rewards/reward_func": 0.3845840930938721, "step": 3580, "toxic_reward": 3.7492689728736877 }, { "clip_ratio": 0.0, "completion_length": 46.15, "epoch": 0.8482986767485823, "format_reward": -0.25, "grad_norm": 1.5018891096115112, "image_reward": 0.25630950927734375, "kl": 1.5399845570325852, "learning_rate": 5e-06, "loss": 0.0064, "reward": -0.4008509755134583, "reward_std": 1.2334194054827095, "rewards/reward_func": -0.4008509755134583, "step": 3590, "toxic_reward": 3.9793298959732057 }, { "clip_ratio": 0.0, "completion_length": 49.475, "epoch": 0.8506616257088847, "format_reward": 0.0, "grad_norm": 1.118828296661377, "image_reward": 0.279705810546875, "kl": 2.2166069228202105, "learning_rate": 5e-06, "loss": -0.0439, "reward": 0.722964608669281, "reward_std": 0.7349236082285643, "rewards/reward_func": 0.722964608669281, "step": 3600, "toxic_reward": 4.429630327224731 }, { "clip_ratio": 0.0, "completion_length": 47.8, "epoch": 0.8530245746691871, "format_reward": 0.0, "grad_norm": 2.9072647094726562, "image_reward": 0.2988444000482559, "kl": 0.4259108882397413, "learning_rate": 5e-06, "loss": -0.139, "reward": 0.36530678868293764, "reward_std": 1.0169117324054242, "rewards/reward_func": 0.36530678868293764, "step": 3610, "toxic_reward": 3.706578254699707 }, { "clip_ratio": 0.0, "completion_length": 34.05, "epoch": 0.8553875236294896, "format_reward": 0.0, "grad_norm": 5.574492931365967, "image_reward": 0.26971537321805955, "kl": 0.8386783060617745, "learning_rate": 5e-06, "loss": 0.0196, "reward": -0.043644605576992034, "reward_std": 0.7492304600775241, "rewards/reward_func": -0.043644605576992034, "step": 3620, "toxic_reward": 3.7889950960874557 }, { "clip_ratio": 0.0, "completion_length": 48.2, "epoch": 0.8577504725897921, "format_reward": 0.0, "grad_norm": 0.6372764110565186, "image_reward": 0.2799346923828125, "kl": 0.09604998417198658, "learning_rate": 5e-06, "loss": -0.0245, "reward": 0.31555656492710116, "reward_std": 0.5240693692117929, "rewards/reward_func": 0.31555656492710116, "step": 3630, "toxic_reward": 3.9185105204582213 }, { "clip_ratio": 0.0, "completion_length": 54.775, "epoch": 0.8601134215500945, "format_reward": 0.0, "grad_norm": 1.1808196306228638, "image_reward": 0.2841166198253632, "kl": 29.371498390100896, "learning_rate": 5e-06, "loss": -0.0516, "reward": 0.26911270916461943, "reward_std": 0.5647319633513689, "rewards/reward_func": 0.26911270916461943, "step": 3640, "toxic_reward": 3.410293960571289 }, { "clip_ratio": 0.0, "completion_length": 39.35, "epoch": 0.8624763705103969, "format_reward": -0.25, "grad_norm": 0.7336105108261108, "image_reward": 0.26638386994600294, "kl": 2.7957767372950912, "learning_rate": 5e-06, "loss": 0.0919, "reward": 0.25321381688117983, "reward_std": 1.543316999450326, "rewards/reward_func": 0.25321381688117983, "step": 3650, "toxic_reward": 3.6566759824752806 }, { "clip_ratio": 0.0, "completion_length": 46.75, "epoch": 0.8648393194706995, "format_reward": -0.25, "grad_norm": 0.6029968857765198, "image_reward": 0.2995513916015625, "kl": 2.5597430652938784, "learning_rate": 5e-06, "loss": -0.0974, "reward": 0.32535398602485655, "reward_std": 1.3309460416436196, "rewards/reward_func": 0.32535398602485655, "step": 3660, "toxic_reward": 3.65915470123291 }, { "clip_ratio": 0.0, "completion_length": 35.275, "epoch": 0.8672022684310019, "format_reward": -0.75, "grad_norm": 3.770862102508545, "image_reward": 0.291839599609375, "kl": 24.679713291302324, "learning_rate": 5e-06, "loss": -0.0464, "reward": -0.6877359867095947, "reward_std": 2.206609180383384, "rewards/reward_func": -0.6877359867095947, "step": 3670, "toxic_reward": 3.764638936519623 }, { "clip_ratio": 0.0, "completion_length": 44.9, "epoch": 0.8695652173913043, "format_reward": -0.25, "grad_norm": 0.5417113304138184, "image_reward": 0.26164347380399705, "kl": 0.35583615899085996, "learning_rate": 5e-06, "loss": 0.0668, "reward": -0.214414319396019, "reward_std": 1.3576272014528512, "rewards/reward_func": -0.214414319396019, "step": 3680, "toxic_reward": 3.65915904045105 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 0.8719281663516069, "format_reward": 0.0, "grad_norm": 0.4714978039264679, "image_reward": 0.278389485180378, "kl": 2.4852739069610834, "learning_rate": 5e-06, "loss": -0.0177, "reward": 0.1318028151988983, "reward_std": 0.8923286706209183, "rewards/reward_func": 0.1318028151988983, "step": 3690, "toxic_reward": 3.508782708644867 }, { "clip_ratio": 0.0, "completion_length": 47.05, "epoch": 0.8742911153119093, "format_reward": 0.0, "grad_norm": 0.8036189079284668, "image_reward": 0.246942138671875, "kl": 17.79970283471048, "learning_rate": 5e-06, "loss": -0.0525, "reward": 0.44554237723350526, "reward_std": 0.8977296775206923, "rewards/reward_func": 0.44554237723350526, "step": 3700, "toxic_reward": 3.5204819679260253 }, { "clip_ratio": 0.0, "completion_length": 37.15, "epoch": 0.8766540642722117, "format_reward": 0.0, "grad_norm": 0.9322050213813782, "image_reward": 0.2607330322265625, "kl": 4.579995289538056, "learning_rate": 5e-06, "loss": -0.0918, "reward": 0.7257406830787658, "reward_std": 0.7061707813292741, "rewards/reward_func": 0.7257406830787658, "step": 3710, "toxic_reward": 3.967698335647583 }, { "clip_ratio": 0.0, "completion_length": 32.9, "epoch": 0.8790170132325141, "format_reward": -0.5, "grad_norm": 0.7602401971817017, "image_reward": 0.277716064453125, "kl": 5.202583113871515, "learning_rate": 5e-06, "loss": 0.0093, "reward": -0.1386810451745987, "reward_std": 1.5367558933794498, "rewards/reward_func": -0.1386810451745987, "step": 3720, "toxic_reward": 3.343963861465454 }, { "clip_ratio": 0.0, "completion_length": 46.45, "epoch": 0.8813799621928167, "format_reward": 0.0, "grad_norm": 0.49207255244255066, "image_reward": 0.2617726638913155, "kl": 3.0696171432733537, "learning_rate": 5e-06, "loss": 0.0096, "reward": 0.8290068447589874, "reward_std": 0.6912821188569069, "rewards/reward_func": 0.8290068447589874, "step": 3730, "toxic_reward": 4.308743190765381 }, { "clip_ratio": 0.0, "completion_length": 47.125, "epoch": 0.8837429111531191, "format_reward": 0.0, "grad_norm": 0.5754015445709229, "image_reward": 0.25755615234375, "kl": 1.463007004186511, "learning_rate": 5e-06, "loss": -0.074, "reward": 1.1725465416908265, "reward_std": 0.7939416155219078, "rewards/reward_func": 1.1725465416908265, "step": 3740, "toxic_reward": 3.892818683385849 }, { "clip_ratio": 0.0, "completion_length": 45.65, "epoch": 0.8861058601134215, "format_reward": -0.5, "grad_norm": 0.3917323350906372, "image_reward": 0.2610259994864464, "kl": 3.9046508548781276, "learning_rate": 5e-06, "loss": -0.0114, "reward": 0.1690664052963257, "reward_std": 1.9762837937101723, "rewards/reward_func": 0.1690664052963257, "step": 3750, "toxic_reward": 3.7723870635032655 }, { "clip_ratio": 0.0, "completion_length": 45.45, "epoch": 0.888468809073724, "format_reward": 0.0, "grad_norm": 0.6322398781776428, "image_reward": 0.27677764892578127, "kl": 1.8196211833506823, "learning_rate": 5e-06, "loss": -0.12, "reward": 0.42850649207830427, "reward_std": 0.5486618679948151, "rewards/reward_func": 0.42850649207830427, "step": 3760, "toxic_reward": 3.4346215546131136 }, { "clip_ratio": 0.0, "completion_length": 44.75, "epoch": 0.8908317580340265, "format_reward": 0.0, "grad_norm": 0.3245849013328552, "image_reward": 0.29166819155216217, "kl": 10.705555348284543, "learning_rate": 5e-06, "loss": -0.0603, "reward": 0.061820387840270996, "reward_std": 1.08290204256773, "rewards/reward_func": 0.061820387840270996, "step": 3770, "toxic_reward": 2.949862742424011 }, { "clip_ratio": 0.0, "completion_length": 47.0, "epoch": 0.8931947069943289, "format_reward": -0.25, "grad_norm": 0.3298509418964386, "image_reward": 0.290167236328125, "kl": 0.07300702948123217, "learning_rate": 5e-06, "loss": -0.0171, "reward": 0.06625822186470032, "reward_std": 1.0081432062666864, "rewards/reward_func": 0.06625822186470032, "step": 3780, "toxic_reward": 4.22382138967514 }, { "clip_ratio": 0.0, "completion_length": 38.4, "epoch": 0.8955576559546313, "format_reward": 0.0, "grad_norm": 0.698654055595398, "image_reward": 0.27091064453125, "kl": 4.801618622988462, "learning_rate": 5e-06, "loss": -0.0591, "reward": 0.3187494039535522, "reward_std": 0.5140533071011305, "rewards/reward_func": 0.3187494039535522, "step": 3790, "toxic_reward": 4.416417121887207 }, { "clip_ratio": 0.0, "completion_length": 46.6, "epoch": 0.8979206049149339, "format_reward": -0.25, "grad_norm": 0.6394158601760864, "image_reward": 0.26355692744255066, "kl": 3.265846297331154, "learning_rate": 5e-06, "loss": -0.0384, "reward": -0.14046210050582886, "reward_std": 1.0342714745551347, "rewards/reward_func": -0.14046210050582886, "step": 3800, "toxic_reward": 4.3116097211837765 }, { "clip_ratio": 0.0, "completion_length": 50.7, "epoch": 0.9002835538752363, "format_reward": -0.5, "grad_norm": 0.7541901469230652, "image_reward": 0.2673909515142441, "kl": 0.7993329163640738, "learning_rate": 5e-06, "loss": 0.0777, "reward": 0.010242342948913574, "reward_std": 1.4442682154476643, "rewards/reward_func": 0.010242342948913574, "step": 3810, "toxic_reward": 4.425883173942566 }, { "clip_ratio": 0.0, "completion_length": 42.225, "epoch": 0.9026465028355387, "format_reward": 0.0, "grad_norm": 0.8831507563591003, "image_reward": 0.29705912470817564, "kl": 3.6087327402085068, "learning_rate": 5e-06, "loss": -0.111, "reward": 0.8021630614995956, "reward_std": 0.8431573905050754, "rewards/reward_func": 0.8021630614995956, "step": 3820, "toxic_reward": 3.6668890714645386 }, { "clip_ratio": 0.0, "completion_length": 48.85, "epoch": 0.9050094517958412, "format_reward": 0.0, "grad_norm": 1.166309118270874, "image_reward": 0.27442220151424407, "kl": 3.696834401600063, "learning_rate": 5e-06, "loss": 0.023, "reward": 0.46357709765434263, "reward_std": 0.5384013399481773, "rewards/reward_func": 0.46357709765434263, "step": 3830, "toxic_reward": 4.282819819450379 }, { "clip_ratio": 0.0, "completion_length": 45.275, "epoch": 0.9073724007561437, "format_reward": -0.25, "grad_norm": 2.2214293479919434, "image_reward": 0.29029541015625, "kl": 6.355313093215227, "learning_rate": 5e-06, "loss": 0.0587, "reward": 0.36757221817970276, "reward_std": 1.1468286462128163, "rewards/reward_func": 0.36757221817970276, "step": 3840, "toxic_reward": 3.8713893949985505 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 0.9097353497164461, "format_reward": 0.0, "grad_norm": 0.7023747563362122, "image_reward": 0.2795267730951309, "kl": 0.12285411208868027, "learning_rate": 5e-06, "loss": 0.0277, "reward": 0.6907171040773392, "reward_std": 0.8528184913098812, "rewards/reward_func": 0.6907171040773392, "step": 3850, "toxic_reward": 3.9646514534950255 }, { "clip_ratio": 0.0, "completion_length": 54.0, "epoch": 0.9120982986767486, "format_reward": -0.25, "grad_norm": 0.6574695706367493, "image_reward": 0.277626545727253, "kl": 3.223006421420723, "learning_rate": 5e-06, "loss": 0.1019, "reward": 0.17425565123558046, "reward_std": 1.0788604862987996, "rewards/reward_func": 0.17425565123558046, "step": 3860, "toxic_reward": 3.892214322090149 }, { "clip_ratio": 0.0, "completion_length": 40.575, "epoch": 0.9144612476370511, "format_reward": 0.0, "grad_norm": 1.6060093641281128, "image_reward": 0.25700276643037795, "kl": 1.728565347008407, "learning_rate": 5e-06, "loss": -0.0095, "reward": 0.2703657388687134, "reward_std": 0.8089243900030851, "rewards/reward_func": 0.2703657388687134, "step": 3870, "toxic_reward": 4.175320339202881 }, { "clip_ratio": 0.0, "completion_length": 44.275, "epoch": 0.9168241965973535, "format_reward": 0.0, "grad_norm": 2.603025436401367, "image_reward": 0.2737335205078125, "kl": 0.7518249765969813, "learning_rate": 5e-06, "loss": -0.0306, "reward": 0.8955561727285385, "reward_std": 1.1253668650984765, "rewards/reward_func": 0.8955561727285385, "step": 3880, "toxic_reward": 3.5497735261917116 }, { "clip_ratio": 0.0, "completion_length": 44.875, "epoch": 0.9191871455576559, "format_reward": -0.25, "grad_norm": 0.6174436211585999, "image_reward": 0.2833251953125, "kl": 1.3900917531922459, "learning_rate": 5e-06, "loss": 0.0533, "reward": 0.8824085891246796, "reward_std": 1.2487390112131833, "rewards/reward_func": 0.8824085891246796, "step": 3890, "toxic_reward": 3.764987659454346 }, { "clip_ratio": 0.0, "completion_length": 50.1, "epoch": 0.9215500945179584, "format_reward": -0.5, "grad_norm": 0.8587064146995544, "image_reward": 0.25449015349149706, "kl": 3.2844431857578456, "learning_rate": 5e-06, "loss": 0.0836, "reward": 0.17285645604133607, "reward_std": 1.4729075387120247, "rewards/reward_func": 0.17285645604133607, "step": 3900, "toxic_reward": 4.319640278816223 }, { "clip_ratio": 0.0, "completion_length": 54.65, "epoch": 0.9239130434782609, "format_reward": -0.25, "grad_norm": 0.7836766242980957, "image_reward": 0.2760904937982559, "kl": 0.04128519091755152, "learning_rate": 5e-06, "loss": 0.0218, "reward": -0.14393893480300904, "reward_std": 1.2086152411997317, "rewards/reward_func": -0.14393893480300904, "step": 3910, "toxic_reward": 3.988687515258789 }, { "clip_ratio": 0.0, "completion_length": 55.525, "epoch": 0.9262759924385633, "format_reward": 0.0, "grad_norm": 1.0223326683044434, "image_reward": 0.237677001953125, "kl": 0.10622669160366058, "learning_rate": 5e-06, "loss": 0.1241, "reward": 0.8052110552787781, "reward_std": 0.809264022950083, "rewards/reward_func": 0.8052110552787781, "step": 3920, "toxic_reward": 4.316140675544739 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 0.9286389413988658, "format_reward": -0.25, "grad_norm": 1.2948088645935059, "image_reward": 0.27791646122932434, "kl": 2.2056565455161037, "learning_rate": 5e-06, "loss": 0.0904, "reward": 0.5610605776309967, "reward_std": 0.9484948962926865, "rewards/reward_func": 0.5610605776309967, "step": 3930, "toxic_reward": 4.4695143699646 }, { "clip_ratio": 0.0, "completion_length": 47.825, "epoch": 0.9310018903591682, "format_reward": 0.0, "grad_norm": 1.0040950775146484, "image_reward": 0.27231852263212203, "kl": 2.655760496482253, "learning_rate": 5e-06, "loss": -0.0755, "reward": 0.263138085603714, "reward_std": 0.4817726358771324, "rewards/reward_func": 0.263138085603714, "step": 3940, "toxic_reward": 4.636347913742066 }, { "clip_ratio": 0.0, "completion_length": 54.25, "epoch": 0.9333648393194707, "format_reward": -0.25, "grad_norm": 0.39709585905075073, "image_reward": 0.2637420654296875, "kl": 0.1439337281510234, "learning_rate": 5e-06, "loss": -0.0772, "reward": 0.04962950348854065, "reward_std": 0.781620041653514, "rewards/reward_func": 0.04962950348854065, "step": 3950, "toxic_reward": 4.751176500320435 }, { "clip_ratio": 0.0, "completion_length": 45.9, "epoch": 0.9357277882797732, "format_reward": -0.25, "grad_norm": 0.8190930485725403, "image_reward": 0.2555938705801964, "kl": 5.330091013200581, "learning_rate": 5e-06, "loss": -0.0093, "reward": -0.16106579303741456, "reward_std": 1.2331121437251569, "rewards/reward_func": -0.16106579303741456, "step": 3960, "toxic_reward": 4.007374119758606 }, { "clip_ratio": 0.0, "completion_length": 45.75, "epoch": 0.9380907372400756, "format_reward": 0.0, "grad_norm": 1.0821632146835327, "image_reward": 0.3061696380376816, "kl": 6.141950584948063, "learning_rate": 5e-06, "loss": 0.1307, "reward": 0.625621622800827, "reward_std": 0.8008190289139747, "rewards/reward_func": 0.625621622800827, "step": 3970, "toxic_reward": 3.468269979953766 }, { "clip_ratio": 0.0, "completion_length": 47.625, "epoch": 0.9404536862003781, "format_reward": -0.75, "grad_norm": 1.1129677295684814, "image_reward": 0.26193033903837204, "kl": 0.6634119726717472, "learning_rate": 5e-06, "loss": -0.0429, "reward": 0.13726072907447814, "reward_std": 2.353568767011166, "rewards/reward_func": 0.13726072907447814, "step": 3980, "toxic_reward": 4.0013970851898195 }, { "clip_ratio": 0.0, "completion_length": 48.35, "epoch": 0.9428166351606805, "format_reward": 0.0, "grad_norm": 0.7701426148414612, "image_reward": 0.28350016176700593, "kl": 0.1994122840464115, "learning_rate": 5e-06, "loss": -0.0421, "reward": 0.7691244065761567, "reward_std": 0.9025557667016983, "rewards/reward_func": 0.7691244065761567, "step": 3990, "toxic_reward": 4.340523219108581 }, { "clip_ratio": 0.0, "completion_length": 46.225, "epoch": 0.945179584120983, "format_reward": 0.0, "grad_norm": 0.46611157059669495, "image_reward": 0.26962890625, "kl": 0.047311073541641234, "learning_rate": 5e-06, "loss": -0.1003, "reward": 1.3654770731925965, "reward_std": 0.657595872040838, "rewards/reward_func": 1.3654770731925965, "step": 4000, "toxic_reward": 3.765986955165863 }, { "clip_ratio": 0.0, "completion_length": 55.675, "epoch": 0.9475425330812854, "format_reward": -0.25, "grad_norm": 0.730478048324585, "image_reward": 0.2595326751470566, "kl": 2.0693125385791062, "learning_rate": 5e-06, "loss": -0.0531, "reward": 0.16633399724960327, "reward_std": 1.2444878976792098, "rewards/reward_func": 0.16633399724960327, "step": 4010, "toxic_reward": 3.9091518998146055 }, { "clip_ratio": 0.0, "completion_length": 41.775, "epoch": 0.9499054820415879, "format_reward": -0.25, "grad_norm": 0.7307797074317932, "image_reward": 0.2784047439694405, "kl": 1.5403530787676574, "learning_rate": 5e-06, "loss": 0.01, "reward": 0.15964727997779846, "reward_std": 1.2297844395041466, "rewards/reward_func": 0.15964727997779846, "step": 4020, "toxic_reward": 4.325857400894165 }, { "clip_ratio": 0.0, "completion_length": 53.275, "epoch": 0.9522684310018904, "format_reward": -0.5, "grad_norm": 1.158098816871643, "image_reward": 0.240879312902689, "kl": 1.8537536807358266, "learning_rate": 5e-06, "loss": 0.1782, "reward": 0.5329648047685623, "reward_std": 1.5547814331948757, "rewards/reward_func": 0.5329648047685623, "step": 4030, "toxic_reward": 3.8254613667726516 }, { "clip_ratio": 0.0, "completion_length": 53.95, "epoch": 0.9546313799621928, "format_reward": 0.0, "grad_norm": 0.5303730964660645, "image_reward": 0.25118484497070315, "kl": 0.187329238653183, "learning_rate": 5e-06, "loss": 0.0541, "reward": 0.27067047357559204, "reward_std": 0.7333962991833687, "rewards/reward_func": 0.27067047357559204, "step": 4040, "toxic_reward": 4.245214033126831 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 0.9569943289224953, "format_reward": -0.25, "grad_norm": 0.8333770632743835, "image_reward": 0.264398193359375, "kl": 7.662982761859894, "learning_rate": 5e-06, "loss": 0.0304, "reward": 0.26739619076251986, "reward_std": 1.3646116882562638, "rewards/reward_func": 0.26739619076251986, "step": 4050, "toxic_reward": 3.6070310473442078 }, { "clip_ratio": 0.0, "completion_length": 42.275, "epoch": 0.9593572778827977, "format_reward": -0.25, "grad_norm": 1.021411657333374, "image_reward": 0.281744384765625, "kl": 2.6961711190640925, "learning_rate": 5e-06, "loss": 0.0528, "reward": 0.1087444543838501, "reward_std": 0.9241739958524704, "rewards/reward_func": 0.1087444543838501, "step": 4060, "toxic_reward": 4.196173495054245 }, { "clip_ratio": 0.0, "completion_length": 42.575, "epoch": 0.9617202268431002, "format_reward": -0.25, "grad_norm": 0.7931532859802246, "image_reward": 0.25716959685087204, "kl": 5.984370514377952, "learning_rate": 5e-06, "loss": -0.1524, "reward": 0.09075822830200195, "reward_std": 1.12701465934515, "rewards/reward_func": 0.09075822830200195, "step": 4070, "toxic_reward": 4.376713454723358 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 0.9640831758034026, "format_reward": -0.5, "grad_norm": 0.7085260152816772, "image_reward": 0.2577000930905342, "kl": 1.8822400705888866, "learning_rate": 5e-06, "loss": 0.0404, "reward": 0.17331230640411377, "reward_std": 1.6633539475500583, "rewards/reward_func": 0.17331230640411377, "step": 4080, "toxic_reward": 4.294411969184876 }, { "clip_ratio": 0.0, "completion_length": 42.8, "epoch": 0.9664461247637051, "format_reward": 0.0, "grad_norm": 1.0063364505767822, "image_reward": 0.250091552734375, "kl": 0.14847910068929196, "learning_rate": 5e-06, "loss": -0.0732, "reward": 1.0602999448776245, "reward_std": 0.6203169705346226, "rewards/reward_func": 1.0602999448776245, "step": 4090, "toxic_reward": 4.171542119979859 }, { "clip_ratio": 0.0, "completion_length": 52.85, "epoch": 0.9688090737240076, "format_reward": -0.25, "grad_norm": 0.6620392203330994, "image_reward": 0.2424652099609375, "kl": 0.36983290296047927, "learning_rate": 5e-06, "loss": -0.0911, "reward": 0.6753372728824616, "reward_std": 1.2773339383304119, "rewards/reward_func": 0.6753372728824616, "step": 4100, "toxic_reward": 4.280330467224121 }, { "clip_ratio": 0.0, "completion_length": 48.9, "epoch": 0.97117202268431, "format_reward": 0.0, "grad_norm": 0.8530160188674927, "image_reward": 0.2524658203125, "kl": 4.212855443544686, "learning_rate": 5e-06, "loss": 0.0813, "reward": 0.6060003638267517, "reward_std": 0.9568195153027773, "rewards/reward_func": 0.6060003638267517, "step": 4110, "toxic_reward": 3.8215681195259092 }, { "clip_ratio": 0.0, "completion_length": 44.85, "epoch": 0.9735349716446124, "format_reward": -0.5, "grad_norm": 1.7192955017089844, "image_reward": 0.28581949770450593, "kl": 10.378714705258608, "learning_rate": 5e-06, "loss": 0.0248, "reward": 0.10166561603546143, "reward_std": 1.791293729841709, "rewards/reward_func": 0.10166561603546143, "step": 4120, "toxic_reward": 3.5509902030229568 }, { "clip_ratio": 0.0, "completion_length": 55.725, "epoch": 0.975897920604915, "format_reward": 0.0, "grad_norm": 0.8529999852180481, "image_reward": 0.2569305419921875, "kl": 8.308781201578677, "learning_rate": 5e-06, "loss": 0.0001, "reward": 0.7349396765232086, "reward_std": 0.4486356295645237, "rewards/reward_func": 0.7349396765232086, "step": 4130, "toxic_reward": 4.5674937725067135 }, { "clip_ratio": 0.0, "completion_length": 50.925, "epoch": 0.9782608695652174, "format_reward": 0.0, "grad_norm": 0.9192355275154114, "image_reward": 0.273162841796875, "kl": 2.731711974926293, "learning_rate": 5e-06, "loss": -0.0006, "reward": -0.20568010210990906, "reward_std": 0.6350222621113062, "rewards/reward_func": -0.20568010210990906, "step": 4140, "toxic_reward": 4.01632958650589 }, { "clip_ratio": 0.0, "completion_length": 44.875, "epoch": 0.9806238185255198, "format_reward": 0.0, "grad_norm": 0.7154003977775574, "image_reward": 0.25846659392118454, "kl": 3.1543860264122485, "learning_rate": 5e-06, "loss": -0.0384, "reward": 0.15666076242923738, "reward_std": 1.1065492704510689, "rewards/reward_func": 0.15666076242923738, "step": 4150, "toxic_reward": 3.2047137916088104 }, { "clip_ratio": 0.0, "completion_length": 44.225, "epoch": 0.9829867674858223, "format_reward": -0.25, "grad_norm": 0.6323632001876831, "image_reward": 0.2768702179193497, "kl": 4.070834948495031, "learning_rate": 5e-06, "loss": 0.1427, "reward": 0.21166958212852477, "reward_std": 1.1970111442729832, "rewards/reward_func": 0.21166958212852477, "step": 4160, "toxic_reward": 4.128625917434692 }, { "clip_ratio": 0.0, "completion_length": 46.425, "epoch": 0.9853497164461248, "format_reward": 0.0, "grad_norm": 0.5803432464599609, "image_reward": 0.257720947265625, "kl": 1.2115541946142911, "learning_rate": 5e-06, "loss": -0.0251, "reward": 0.26483882069587705, "reward_std": 0.8841663489118219, "rewards/reward_func": 0.26483882069587705, "step": 4170, "toxic_reward": 3.953411507606506 }, { "clip_ratio": 0.0, "completion_length": 43.05, "epoch": 0.9877126654064272, "format_reward": -0.25, "grad_norm": 1.0321141481399536, "image_reward": 0.2662984222173691, "kl": 12.658018402941526, "learning_rate": 5e-06, "loss": 0.0031, "reward": 0.850147670507431, "reward_std": 1.0917948484420776, "rewards/reward_func": 0.850147670507431, "step": 4180, "toxic_reward": 4.424872517585754 }, { "clip_ratio": 0.0, "completion_length": 33.775, "epoch": 0.9900756143667296, "format_reward": -0.25, "grad_norm": 2.934152603149414, "image_reward": 0.28038330078125, "kl": 0.22834131643176078, "learning_rate": 5e-06, "loss": -0.12, "reward": -0.05527897924184799, "reward_std": 1.090353344194591, "rewards/reward_func": -0.05527897924184799, "step": 4190, "toxic_reward": 4.094451707601547 }, { "clip_ratio": 0.0, "completion_length": 47.175, "epoch": 0.9924385633270322, "format_reward": -0.5, "grad_norm": 1.2237070798873901, "image_reward": 0.28351847380399703, "kl": 11.847508652508258, "learning_rate": 5e-06, "loss": -0.078, "reward": -0.27686416208744047, "reward_std": 1.4433475863188505, "rewards/reward_func": -0.27686416208744047, "step": 4200, "toxic_reward": 4.032291853427887 }, { "clip_ratio": 0.0, "completion_length": 40.3, "epoch": 0.9948015122873346, "format_reward": 0.0, "grad_norm": 1.00357985496521, "image_reward": 0.26925506591796877, "kl": 6.287641528248787, "learning_rate": 5e-06, "loss": -0.0509, "reward": 0.26841793656349183, "reward_std": 0.7431968785822392, "rewards/reward_func": 0.26841793656349183, "step": 4210, "toxic_reward": 3.797722101211548 }, { "clip_ratio": 0.0, "completion_length": 54.4, "epoch": 0.997164461247637, "format_reward": -0.5, "grad_norm": 1.412477731704712, "image_reward": 0.2857859283685684, "kl": 0.20840035788714886, "learning_rate": 5e-06, "loss": -0.01, "reward": 0.49851550459861754, "reward_std": 1.4509758695960044, "rewards/reward_func": 0.49851550459861754, "step": 4220, "toxic_reward": 4.4275671482086185 }, { "clip_ratio": 0.0, "completion_length": 51.575, "epoch": 0.9995274102079396, "format_reward": 0.0, "grad_norm": 1.3644284009933472, "image_reward": 0.2857421875, "kl": 14.352099673077465, "learning_rate": 5e-06, "loss": -0.1247, "reward": 0.5606966435909271, "reward_std": 0.5899959981441498, "rewards/reward_func": 0.5606966435909271, "step": 4230, "toxic_reward": 4.210422110557556 }, { "clip_ratio": 0.0, "completion_length": 45.975, "epoch": 1.001890359168242, "format_reward": -0.25, "grad_norm": 4.210316181182861, "image_reward": 0.28163655698299406, "kl": 6.0347686521708965, "learning_rate": 5e-06, "loss": -0.0289, "reward": 0.11643823981285095, "reward_std": 1.107480544038117, "rewards/reward_func": 0.11643823981285095, "step": 4240, "toxic_reward": 3.994339680671692 }, { "clip_ratio": 0.0, "completion_length": 54.3, "epoch": 1.0042533081285445, "format_reward": 0.0, "grad_norm": 3.516270637512207, "image_reward": 0.2680999755859375, "kl": 1.3430524323135615, "learning_rate": 5e-06, "loss": 0.0258, "reward": 1.2558865308761598, "reward_std": 0.9449932537972927, "rewards/reward_func": 1.2558865308761598, "step": 4250, "toxic_reward": 4.3164361953735355 }, { "clip_ratio": 0.0, "completion_length": 53.1, "epoch": 1.0066162570888468, "format_reward": 0.0, "grad_norm": 2.1102194786071777, "image_reward": 0.25366058349609377, "kl": 0.149768141284585, "learning_rate": 5e-06, "loss": -0.0742, "reward": 0.06790508627891541, "reward_std": 0.6080379813909531, "rewards/reward_func": 0.06790508627891541, "step": 4260, "toxic_reward": 4.386375617980957 }, { "clip_ratio": 0.0, "completion_length": 52.325, "epoch": 1.0089792060491494, "format_reward": -0.25, "grad_norm": 4.862875461578369, "image_reward": 0.2832529693841934, "kl": 2.053416795656085, "learning_rate": 5e-06, "loss": -0.0907, "reward": 0.30691148042678834, "reward_std": 1.6981020882725715, "rewards/reward_func": 0.30691148042678834, "step": 4270, "toxic_reward": 3.5000792026519774 }, { "clip_ratio": 0.0, "completion_length": 49.05, "epoch": 1.011342155009452, "format_reward": -0.25, "grad_norm": 3.5172159671783447, "image_reward": 0.2263885498046875, "kl": 0.14171482473611832, "learning_rate": 5e-06, "loss": -0.0082, "reward": 0.5368665099143982, "reward_std": 1.4538173630833626, "rewards/reward_func": 0.5368665099143982, "step": 4280, "toxic_reward": 4.501732063293457 }, { "clip_ratio": 0.0, "completion_length": 50.1, "epoch": 1.0137051039697542, "format_reward": 0.0, "grad_norm": 0.6869735717773438, "image_reward": 0.266131591796875, "kl": 17.507234007120132, "learning_rate": 5e-06, "loss": -0.0165, "reward": 0.595378065109253, "reward_std": 0.6132703861221671, "rewards/reward_func": 0.595378065109253, "step": 4290, "toxic_reward": 4.223924076557159 }, { "clip_ratio": 0.0, "completion_length": 51.8, "epoch": 1.0160680529300568, "format_reward": -0.5, "grad_norm": 7.046621322631836, "image_reward": 0.2617146819829941, "kl": 0.1885729007422924, "learning_rate": 5e-06, "loss": -0.0513, "reward": 0.02859283685684204, "reward_std": 1.7149874530732632, "rewards/reward_func": 0.02859283685684204, "step": 4300, "toxic_reward": 4.371392369270325 }, { "clip_ratio": 0.0, "completion_length": 40.6, "epoch": 1.018431001890359, "format_reward": -0.25, "grad_norm": 4.381049156188965, "image_reward": 0.2654388427734375, "kl": 1.2927639432251454, "learning_rate": 5e-06, "loss": -0.0052, "reward": -0.0640803337097168, "reward_std": 1.4367546334862709, "rewards/reward_func": -0.0640803337097168, "step": 4310, "toxic_reward": 4.212549781799316 }, { "clip_ratio": 0.0, "completion_length": 51.9, "epoch": 1.0207939508506616, "format_reward": 0.0, "grad_norm": 1.0743227005004883, "image_reward": 0.255714924633503, "kl": 16.378241488710046, "learning_rate": 5e-06, "loss": -0.1016, "reward": 0.3336408376693726, "reward_std": 0.6735909695737063, "rewards/reward_func": 0.3336408376693726, "step": 4320, "toxic_reward": 4.5154483914375305 }, { "clip_ratio": 0.0, "completion_length": 49.4, "epoch": 1.0231568998109641, "format_reward": -0.25, "grad_norm": 1.6660760641098022, "image_reward": 0.24780476838350296, "kl": 11.463303370773792, "learning_rate": 5e-06, "loss": -0.0995, "reward": -0.0025091707706451417, "reward_std": 1.6661910176277162, "rewards/reward_func": -0.0025091707706451417, "step": 4330, "toxic_reward": 3.852264070510864 }, { "clip_ratio": 0.0, "completion_length": 42.4, "epoch": 1.0255198487712665, "format_reward": 0.0, "grad_norm": 4.4377121925354, "image_reward": 0.2486114501953125, "kl": 0.12634929567575454, "learning_rate": 5e-06, "loss": 0.0148, "reward": 0.518048095703125, "reward_std": 1.0762871712446214, "rewards/reward_func": 0.518048095703125, "step": 4340, "toxic_reward": 4.010181951522827 }, { "clip_ratio": 0.0, "completion_length": 47.875, "epoch": 1.027882797731569, "format_reward": -0.25, "grad_norm": 1.9050147533416748, "image_reward": 0.2529998779296875, "kl": 0.20360449738800526, "learning_rate": 5e-06, "loss": 0.0104, "reward": -0.05591415464878082, "reward_std": 1.2363329231739044, "rewards/reward_func": -0.05591415464878082, "step": 4350, "toxic_reward": 4.434300184249878 }, { "clip_ratio": 0.0, "completion_length": 48.9, "epoch": 1.0302457466918715, "format_reward": -0.25, "grad_norm": 2.783447742462158, "image_reward": 0.25158691257238386, "kl": 0.14820914287120104, "learning_rate": 5e-06, "loss": 0.0709, "reward": 0.2038910448551178, "reward_std": 1.113127877563238, "rewards/reward_func": 0.2038910448551178, "step": 4360, "toxic_reward": 4.226523244380951 }, { "clip_ratio": 0.0, "completion_length": 58.85, "epoch": 1.0326086956521738, "format_reward": -0.25, "grad_norm": 0.6769667863845825, "image_reward": 0.2698781341314316, "kl": 1.560221792012453, "learning_rate": 5e-06, "loss": 0.0453, "reward": -0.25547429323196413, "reward_std": 1.2353890612721443, "rewards/reward_func": -0.25547429323196413, "step": 4370, "toxic_reward": 4.451306319236755 }, { "clip_ratio": 0.0, "completion_length": 45.325, "epoch": 1.0349716446124764, "format_reward": -0.5, "grad_norm": 1.5327138900756836, "image_reward": 0.25219675749540327, "kl": 8.099627137556672, "learning_rate": 5e-06, "loss": -0.0101, "reward": -0.2395196735858917, "reward_std": 1.6747881084680558, "rewards/reward_func": -0.2395196735858917, "step": 4380, "toxic_reward": 4.392533135414124 }, { "clip_ratio": 0.0, "completion_length": 55.875, "epoch": 1.037334593572779, "format_reward": 0.0, "grad_norm": 1.2079089879989624, "image_reward": 0.2677134186029434, "kl": 20.563808789849283, "learning_rate": 5e-06, "loss": 0.0416, "reward": 0.5019214197993278, "reward_std": 1.3155438639223576, "rewards/reward_func": 0.5019214197993278, "step": 4390, "toxic_reward": 3.744106537103653 }, { "clip_ratio": 0.0, "completion_length": 45.575, "epoch": 1.0396975425330812, "format_reward": 0.0, "grad_norm": 12.300169944763184, "image_reward": 0.266143798828125, "kl": 0.33298523649573325, "learning_rate": 5e-06, "loss": 0.0325, "reward": 0.3428509056568146, "reward_std": 0.6832939319312572, "rewards/reward_func": 0.3428509056568146, "step": 4400, "toxic_reward": 3.938971757888794 }, { "clip_ratio": 0.0, "completion_length": 41.575, "epoch": 1.0420604914933838, "format_reward": 0.0, "grad_norm": 13.788394927978516, "image_reward": 0.2602081298828125, "kl": 0.14277449063956738, "learning_rate": 5e-06, "loss": 0.0799, "reward": 0.5224148035049438, "reward_std": 0.5329875692725181, "rewards/reward_func": 0.5224148035049438, "step": 4410, "toxic_reward": 4.6161150455474855 }, { "clip_ratio": 0.0, "completion_length": 52.45, "epoch": 1.0444234404536863, "format_reward": 0.0, "grad_norm": 9.009355545043945, "image_reward": 0.2745361328125, "kl": 0.1908944919705391, "learning_rate": 5e-06, "loss": 0.0428, "reward": 0.5507814303040505, "reward_std": 0.7364906007423997, "rewards/reward_func": 0.5507814303040505, "step": 4420, "toxic_reward": 3.738240921497345 }, { "clip_ratio": 0.0, "completion_length": 53.375, "epoch": 1.0467863894139886, "format_reward": -0.25, "grad_norm": 0.9596161842346191, "image_reward": 0.2600412994623184, "kl": 0.2493920259177685, "learning_rate": 5e-06, "loss": 0.0546, "reward": 0.3896596789360046, "reward_std": 1.1463438659906386, "rewards/reward_func": 0.3896596789360046, "step": 4430, "toxic_reward": 4.299828362464905 }, { "clip_ratio": 0.0, "completion_length": 45.6, "epoch": 1.0491493383742911, "format_reward": -0.5, "grad_norm": 16.468900680541992, "image_reward": 0.2660593673586845, "kl": 1.7080695651471616, "learning_rate": 5e-06, "loss": 0.0381, "reward": 0.23900684118270873, "reward_std": 1.4474023096263409, "rewards/reward_func": 0.23900684118270873, "step": 4440, "toxic_reward": 4.323360848426819 }, { "clip_ratio": 0.0, "completion_length": 43.175, "epoch": 1.0515122873345937, "format_reward": 0.0, "grad_norm": 9.211010932922363, "image_reward": 0.23970438539981842, "kl": 2.5637484416365623, "learning_rate": 5e-06, "loss": 0.1362, "reward": 1.0162243604660035, "reward_std": 0.6415727452374995, "rewards/reward_func": 1.0162243604660035, "step": 4450, "toxic_reward": 4.387140679359436 }, { "clip_ratio": 0.0, "completion_length": 52.875, "epoch": 1.053875236294896, "format_reward": -0.25, "grad_norm": 7.96478796005249, "image_reward": 0.279913330078125, "kl": 1.5326927796006202, "learning_rate": 5e-06, "loss": 0.135, "reward": -0.0030475854873657227, "reward_std": 1.1649701196700335, "rewards/reward_func": -0.0030475854873657227, "step": 4460, "toxic_reward": 4.197524422407151 }, { "clip_ratio": 0.0, "completion_length": 50.475, "epoch": 1.0562381852551985, "format_reward": 0.0, "grad_norm": 18.094940185546875, "image_reward": 0.25467529296875, "kl": 0.3941259577870369, "learning_rate": 5e-06, "loss": -0.0321, "reward": 0.24867143034934996, "reward_std": 0.6323847549967467, "rewards/reward_func": 0.24867143034934996, "step": 4470, "toxic_reward": 4.357023143768311 }, { "clip_ratio": 0.0, "completion_length": 48.6, "epoch": 1.0586011342155008, "format_reward": 0.0, "grad_norm": 6.004316329956055, "image_reward": 0.249798583984375, "kl": 0.6100661933422089, "learning_rate": 5e-06, "loss": 0.054, "reward": 0.3194288432598114, "reward_std": 1.00972272567451, "rewards/reward_func": 0.3194288432598114, "step": 4480, "toxic_reward": 3.6232224822044374 }, { "clip_ratio": 0.0, "completion_length": 41.275, "epoch": 1.0609640831758034, "format_reward": 0.0, "grad_norm": 16.553150177001953, "image_reward": 0.2777862548828125, "kl": 1.2566918075084685, "learning_rate": 5e-06, "loss": 0.0033, "reward": 0.3464995056390762, "reward_std": 1.0610926449298859, "rewards/reward_func": 0.3464995056390762, "step": 4490, "toxic_reward": 3.4315811157226563 }, { "clip_ratio": 0.0, "completion_length": 46.5, "epoch": 1.063327032136106, "format_reward": 0.0, "grad_norm": 21.91239356994629, "image_reward": 0.26402740478515624, "kl": 2.7657025068998338, "learning_rate": 5e-06, "loss": -0.0088, "reward": -0.017888635396957397, "reward_std": 0.36575160175561905, "rewards/reward_func": -0.017888635396957397, "step": 4500, "toxic_reward": 4.492252993583679 }, { "clip_ratio": 0.0, "completion_length": 52.175, "epoch": 1.0656899810964082, "format_reward": 0.0, "grad_norm": 1.70862877368927, "image_reward": 0.2537200927734375, "kl": 0.9243695795536041, "learning_rate": 5e-06, "loss": -0.148, "reward": 0.2666252374649048, "reward_std": 0.8290498301386833, "rewards/reward_func": 0.2666252374649048, "step": 4510, "toxic_reward": 4.025203084945678 }, { "clip_ratio": 0.0, "completion_length": 58.625, "epoch": 1.0680529300567108, "format_reward": 0.0, "grad_norm": 4.76298189163208, "image_reward": 0.2544342041015625, "kl": 1.2314461708068847, "learning_rate": 5e-06, "loss": 0.0379, "reward": 0.1297641634941101, "reward_std": 0.7050925550982357, "rewards/reward_func": 0.1297641634941101, "step": 4520, "toxic_reward": 4.568165302276611 }, { "clip_ratio": 0.0, "completion_length": 36.825, "epoch": 1.0704158790170133, "format_reward": 0.0, "grad_norm": 8.698065757751465, "image_reward": 0.274078369140625, "kl": 1.325176051259041, "learning_rate": 5e-06, "loss": -0.0398, "reward": 0.10026351213455201, "reward_std": 0.812692479044199, "rewards/reward_func": 0.10026351213455201, "step": 4530, "toxic_reward": 3.9686192631721497 }, { "clip_ratio": 0.0, "completion_length": 48.3, "epoch": 1.0727788279773156, "format_reward": 0.0, "grad_norm": 19.877777099609375, "image_reward": 0.2792388916015625, "kl": 0.9205800026655198, "learning_rate": 5e-06, "loss": 0.1066, "reward": 0.5690743923187256, "reward_std": 0.6653784658759833, "rewards/reward_func": 0.5690743923187256, "step": 4540, "toxic_reward": 3.964191234111786 }, { "clip_ratio": 0.0, "completion_length": 42.525, "epoch": 1.0751417769376181, "format_reward": -0.25, "grad_norm": 2.515148878097534, "image_reward": 0.23823343813419343, "kl": 2.8265303134918214, "learning_rate": 5e-06, "loss": -0.1035, "reward": 0.6135944664478302, "reward_std": 1.5456651039421558, "rewards/reward_func": 0.6135944664478302, "step": 4550, "toxic_reward": 4.405286359786987 }, { "clip_ratio": 0.0, "completion_length": 39.85, "epoch": 1.0775047258979207, "format_reward": 0.0, "grad_norm": 16.40328025817871, "image_reward": 0.26103515625, "kl": 2.2788069248199463, "learning_rate": 5e-06, "loss": 0.0091, "reward": 0.10576534271240234, "reward_std": 0.39959471523761747, "rewards/reward_func": 0.10576534271240234, "step": 4560, "toxic_reward": 4.633650445938111 }, { "clip_ratio": 0.0, "completion_length": 44.7, "epoch": 1.079867674858223, "format_reward": -0.5, "grad_norm": 7.5780229568481445, "image_reward": 0.26860554963350297, "kl": 4.05745484828949, "learning_rate": 5e-06, "loss": 0.0631, "reward": 0.6903072118759155, "reward_std": 1.8377123966813087, "rewards/reward_func": 0.6903072118759155, "step": 4570, "toxic_reward": 4.404474878311158 }, { "clip_ratio": 0.0, "completion_length": 69.25, "epoch": 1.0822306238185255, "format_reward": -0.25, "grad_norm": 2.605886936187744, "image_reward": 0.26128031462430956, "kl": 1.1529910147190094, "learning_rate": 5e-06, "loss": 0.0061, "reward": -0.4064223051071167, "reward_std": 1.0936089092865586, "rewards/reward_func": -0.4064223051071167, "step": 4580, "toxic_reward": 4.17680971622467 }, { "clip_ratio": 0.0, "completion_length": 56.225, "epoch": 1.084593572778828, "format_reward": 0.0, "grad_norm": 2.201918601989746, "image_reward": 0.26739501953125, "kl": 0.7634936004877091, "learning_rate": 5e-06, "loss": 0.0832, "reward": 0.2786406099796295, "reward_std": 0.7699430305510759, "rewards/reward_func": 0.2786406099796295, "step": 4590, "toxic_reward": 4.203878152370453 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 1.0869565217391304, "format_reward": 0.0, "grad_norm": 4.329306125640869, "image_reward": 0.2691864013671875, "kl": 1.415566897392273, "learning_rate": 5e-06, "loss": 0.0091, "reward": 0.031065577268600465, "reward_std": 0.9241972327232361, "rewards/reward_func": 0.031065577268600465, "step": 4600, "toxic_reward": 3.4664461970329286 }, { "clip_ratio": 0.0, "completion_length": 41.825, "epoch": 1.089319470699433, "format_reward": 0.0, "grad_norm": 15.773272514343262, "image_reward": 0.23636678010225295, "kl": 3.1093257188797, "learning_rate": 5e-06, "loss": -0.0048, "reward": 0.6077887773513794, "reward_std": 0.9942519944161177, "rewards/reward_func": 0.6077887773513794, "step": 4610, "toxic_reward": 4.182659006118774 }, { "clip_ratio": 0.0, "completion_length": 45.6, "epoch": 1.0916824196597354, "format_reward": -0.25, "grad_norm": 4.842249870300293, "image_reward": 0.2884033203125, "kl": 0.8512112647294998, "learning_rate": 5e-06, "loss": 0.0029, "reward": 0.2840136528015137, "reward_std": 1.34358575232327, "rewards/reward_func": 0.2840136528015137, "step": 4620, "toxic_reward": 4.202986550331116 }, { "clip_ratio": 0.0, "completion_length": 49.0, "epoch": 1.0940453686200378, "format_reward": 0.0, "grad_norm": 12.111598014831543, "image_reward": 0.26724853515625, "kl": 0.5160227678716183, "learning_rate": 5e-06, "loss": -0.1139, "reward": 0.41757542341947557, "reward_std": 0.6193137221038342, "rewards/reward_func": 0.41757542341947557, "step": 4630, "toxic_reward": 3.6429463922977448 }, { "clip_ratio": 0.0, "completion_length": 56.475, "epoch": 1.0964083175803403, "format_reward": 0.0, "grad_norm": 5.654786586761475, "image_reward": 0.2576507568359375, "kl": 0.6431491911411286, "learning_rate": 5e-06, "loss": -0.0209, "reward": 0.4444656491279602, "reward_std": 0.8271868824958801, "rewards/reward_func": 0.4444656491279602, "step": 4640, "toxic_reward": 3.751771080493927 }, { "clip_ratio": 0.0, "completion_length": 45.225, "epoch": 1.0987712665406426, "format_reward": 0.0, "grad_norm": 3.850701332092285, "image_reward": 0.25439249724149704, "kl": 3.444407519698143, "learning_rate": 5e-06, "loss": 0.0854, "reward": 0.643358188867569, "reward_std": 0.8931491523981094, "rewards/reward_func": 0.643358188867569, "step": 4650, "toxic_reward": 4.321841323375702 }, { "clip_ratio": 0.0, "completion_length": 41.575, "epoch": 1.1011342155009451, "format_reward": 0.0, "grad_norm": 6.3553853034973145, "image_reward": 0.2528656005859375, "kl": 0.5195316299796104, "learning_rate": 5e-06, "loss": -0.0275, "reward": 0.4554763913154602, "reward_std": 0.8332011103630066, "rewards/reward_func": 0.4554763913154602, "step": 4660, "toxic_reward": 4.327652913331986 }, { "clip_ratio": 0.0, "completion_length": 37.925, "epoch": 1.1034971644612477, "format_reward": 0.0, "grad_norm": 7.020429611206055, "image_reward": 0.24530232697725296, "kl": 1.074008372426033, "learning_rate": 5e-06, "loss": 0.1332, "reward": 0.08963438272476196, "reward_std": 1.1591505765914918, "rewards/reward_func": 0.08963438272476196, "step": 4670, "toxic_reward": 3.8221271514892576 }, { "clip_ratio": 0.0, "completion_length": 45.675, "epoch": 1.10586011342155, "format_reward": 0.0, "grad_norm": 1.8656316995620728, "image_reward": 0.23163909912109376, "kl": 1.6141413852572442, "learning_rate": 5e-06, "loss": 0.0496, "reward": 0.4311521232128143, "reward_std": 0.39210873320698736, "rewards/reward_func": 0.4311521232128143, "step": 4680, "toxic_reward": 4.517966604232788 }, { "clip_ratio": 0.0, "completion_length": 54.2, "epoch": 1.1082230623818525, "format_reward": 0.0, "grad_norm": 13.881125450134277, "image_reward": 0.2380462646484375, "kl": 0.6565065160393715, "learning_rate": 5e-06, "loss": -0.0278, "reward": 0.3582367777824402, "reward_std": 1.0096068516373635, "rewards/reward_func": 0.3582367777824402, "step": 4690, "toxic_reward": 3.9490260004997255 }, { "clip_ratio": 0.0, "completion_length": 37.2, "epoch": 1.110586011342155, "format_reward": -0.25, "grad_norm": 6.552460193634033, "image_reward": 0.24650166779756547, "kl": 2.4836434960365295, "learning_rate": 5e-06, "loss": -0.055, "reward": 0.850802743434906, "reward_std": 1.5017553605139256, "rewards/reward_func": 0.850802743434906, "step": 4700, "toxic_reward": 4.000320458412171 }, { "clip_ratio": 0.0, "completion_length": 43.35, "epoch": 1.1129489603024574, "format_reward": 0.0, "grad_norm": 10.16213607788086, "image_reward": 0.2479156494140625, "kl": 9.667583072185517, "learning_rate": 5e-06, "loss": 0.0035, "reward": 0.7445069432258606, "reward_std": 0.7123569492250681, "rewards/reward_func": 0.7445069432258606, "step": 4710, "toxic_reward": 4.515477871894836 }, { "clip_ratio": 0.0, "completion_length": 44.825, "epoch": 1.11531190926276, "format_reward": 0.0, "grad_norm": 11.000924110412598, "image_reward": 0.2601796478033066, "kl": 1.5108904749155045, "learning_rate": 5e-06, "loss": 0.0823, "reward": 0.5056971669197082, "reward_std": 0.6825690733268857, "rewards/reward_func": 0.5056971669197082, "step": 4720, "toxic_reward": 4.039038109779358 }, { "clip_ratio": 0.0, "completion_length": 44.9, "epoch": 1.1176748582230625, "format_reward": -0.25, "grad_norm": 10.222740173339844, "image_reward": 0.25692138671875, "kl": 0.35422504395246507, "learning_rate": 5e-06, "loss": 0.0093, "reward": 0.08535944372415542, "reward_std": 1.423003512620926, "rewards/reward_func": 0.08535944372415542, "step": 4730, "toxic_reward": 3.4993788480758665 }, { "clip_ratio": 0.0, "completion_length": 41.55, "epoch": 1.1200378071833648, "format_reward": -0.25, "grad_norm": 2.3486738204956055, "image_reward": 0.24800923615694045, "kl": 0.38842023983597757, "learning_rate": 5e-06, "loss": -0.0107, "reward": 0.1211450919508934, "reward_std": 1.3706756496801973, "rewards/reward_func": 0.1211450919508934, "step": 4740, "toxic_reward": 3.2782628774642943 }, { "clip_ratio": 0.0, "completion_length": 51.325, "epoch": 1.1224007561436673, "format_reward": 0.0, "grad_norm": 4.218822479248047, "image_reward": 0.251934814453125, "kl": 1.4006462961435318, "learning_rate": 5e-06, "loss": 0.0701, "reward": 0.9292663365602494, "reward_std": 0.874046965315938, "rewards/reward_func": 0.9292663365602494, "step": 4750, "toxic_reward": 4.4716246843338014 }, { "clip_ratio": 0.0, "completion_length": 53.45, "epoch": 1.1247637051039698, "format_reward": 0.0, "grad_norm": 13.548481941223145, "image_reward": 0.276861572265625, "kl": 0.5785035833716392, "learning_rate": 5e-06, "loss": -0.0177, "reward": 0.1986662968993187, "reward_std": 0.7839731447398662, "rewards/reward_func": 0.1986662968993187, "step": 4760, "toxic_reward": 4.128668719530106 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 1.1271266540642721, "format_reward": 0.0, "grad_norm": 10.347504615783691, "image_reward": 0.2632904052734375, "kl": 0.28924584165215494, "learning_rate": 5e-06, "loss": -0.0536, "reward": 0.40365022569894793, "reward_std": 0.6283778937533497, "rewards/reward_func": 0.40365022569894793, "step": 4770, "toxic_reward": 3.78736280053854 }, { "clip_ratio": 0.0, "completion_length": 51.775, "epoch": 1.1294896030245747, "format_reward": -0.75, "grad_norm": 29.38702964782715, "image_reward": 0.27183634638786314, "kl": 8.101900951564312, "learning_rate": 5e-06, "loss": -0.0473, "reward": -0.11305050253868103, "reward_std": 2.1815814077854156, "rewards/reward_func": -0.11305050253868103, "step": 4780, "toxic_reward": 3.949468755722046 }, { "clip_ratio": 0.0, "completion_length": 48.7, "epoch": 1.1318525519848772, "format_reward": -0.75, "grad_norm": 8.19861125946045, "image_reward": 0.2731597885489464, "kl": 5.514032608270645, "learning_rate": 5e-06, "loss": 0.0809, "reward": -0.5878833532333374, "reward_std": 1.700104326196015, "rewards/reward_func": -0.5878833532333374, "step": 4790, "toxic_reward": 4.362279486656189 }, { "clip_ratio": 0.0, "completion_length": 55.575, "epoch": 1.1342155009451795, "format_reward": 0.0, "grad_norm": 25.879568099975586, "image_reward": 0.272625732421875, "kl": 0.41564694195985796, "learning_rate": 5e-06, "loss": 0.0466, "reward": 0.5246647775173188, "reward_std": 0.5603986160829664, "rewards/reward_func": 0.5246647775173188, "step": 4800, "toxic_reward": 4.4845054864883425 }, { "clip_ratio": 0.0, "completion_length": 38.175, "epoch": 1.136578449905482, "format_reward": -0.25, "grad_norm": 6.490880966186523, "image_reward": 0.27078043669462204, "kl": 0.39700448513031006, "learning_rate": 5e-06, "loss": 0.0442, "reward": 0.10371096134185791, "reward_std": 1.3051490228623153, "rewards/reward_func": 0.10371096134185791, "step": 4810, "toxic_reward": 4.362593126296997 }, { "clip_ratio": 0.0, "completion_length": 42.025, "epoch": 1.1389413988657844, "format_reward": 0.0, "grad_norm": 10.680285453796387, "image_reward": 0.2447296142578125, "kl": 0.43964013159275056, "learning_rate": 5e-06, "loss": -0.1088, "reward": 0.43211621046066284, "reward_std": 0.5677682287991047, "rewards/reward_func": 0.43211621046066284, "step": 4820, "toxic_reward": 4.520205068588257 }, { "clip_ratio": 0.0, "completion_length": 34.225, "epoch": 1.141304347826087, "format_reward": -0.5, "grad_norm": 7.846988201141357, "image_reward": 0.24936320036649703, "kl": 0.3737114042043686, "learning_rate": 5e-06, "loss": 0.1324, "reward": -0.08488219976425171, "reward_std": 1.6377468653023244, "rewards/reward_func": -0.08488219976425171, "step": 4830, "toxic_reward": 3.979211616516113 }, { "clip_ratio": 0.0, "completion_length": 45.95, "epoch": 1.1436672967863895, "format_reward": -0.25, "grad_norm": 13.332221031188965, "image_reward": 0.27100830078125, "kl": 0.39419813454151154, "learning_rate": 5e-06, "loss": -0.0752, "reward": 0.9029350757598877, "reward_std": 1.455178501456976, "rewards/reward_func": 0.9029350757598877, "step": 4840, "toxic_reward": 3.6434609413146974 }, { "clip_ratio": 0.0, "completion_length": 45.125, "epoch": 1.146030245746692, "format_reward": 0.0, "grad_norm": 3.2025651931762695, "image_reward": 0.281640625, "kl": 7.567617936432361, "learning_rate": 5e-06, "loss": -0.0926, "reward": 0.7164658069610595, "reward_std": 0.6624833345413208, "rewards/reward_func": 0.7164658069610595, "step": 4850, "toxic_reward": 3.8413574934005736 }, { "clip_ratio": 0.0, "completion_length": 48.975, "epoch": 1.1483931947069943, "format_reward": 0.0, "grad_norm": 9.695226669311523, "image_reward": 0.24990997314453126, "kl": 0.28165399581193923, "learning_rate": 5e-06, "loss": 0.0188, "reward": -0.12237508296966552, "reward_std": 0.6198875203728675, "rewards/reward_func": -0.12237508296966552, "step": 4860, "toxic_reward": 4.485203766822815 }, { "clip_ratio": 0.0, "completion_length": 40.45, "epoch": 1.1507561436672968, "format_reward": 0.0, "grad_norm": 2.5677099227905273, "image_reward": 0.2783660888671875, "kl": 0.29035804942250254, "learning_rate": 5e-06, "loss": -0.0618, "reward": 0.050651901960372926, "reward_std": 1.2044988840818405, "rewards/reward_func": 0.050651901960372926, "step": 4870, "toxic_reward": 3.6688124537467957 }, { "clip_ratio": 0.0, "completion_length": 51.9, "epoch": 1.1531190926275992, "format_reward": 0.0, "grad_norm": 4.83213996887207, "image_reward": 0.2756866455078125, "kl": 5.533606587722898, "learning_rate": 5e-06, "loss": -0.1798, "reward": 0.5692965686321259, "reward_std": 1.0450827227905393, "rewards/reward_func": 0.5692965686321259, "step": 4880, "toxic_reward": 3.439787745475769 }, { "clip_ratio": 0.0, "completion_length": 42.375, "epoch": 1.1554820415879017, "format_reward": 0.0, "grad_norm": 1.0500257015228271, "image_reward": 0.258331298828125, "kl": 0.11663263067603111, "learning_rate": 5e-06, "loss": 0.0311, "reward": 0.4637997090816498, "reward_std": 0.9648044936358928, "rewards/reward_func": 0.4637997090816498, "step": 4890, "toxic_reward": 4.382505106925964 }, { "clip_ratio": 0.0, "completion_length": 46.5, "epoch": 1.1578449905482042, "format_reward": 0.0, "grad_norm": 7.189860820770264, "image_reward": 0.2593902587890625, "kl": 0.15962190218269826, "learning_rate": 5e-06, "loss": 0.0056, "reward": 0.5262487173080445, "reward_std": 1.052651860564947, "rewards/reward_func": 0.5262487173080445, "step": 4900, "toxic_reward": 4.166365385055542 }, { "clip_ratio": 0.0, "completion_length": 48.575, "epoch": 1.1602079395085065, "format_reward": 0.0, "grad_norm": 1.5136041641235352, "image_reward": 0.277166748046875, "kl": 0.14849806036800145, "learning_rate": 5e-06, "loss": -0.0034, "reward": 0.22453336119651796, "reward_std": 0.5165121786296367, "rewards/reward_func": 0.22453336119651796, "step": 4910, "toxic_reward": 3.900139307975769 }, { "clip_ratio": 0.0, "completion_length": 47.55, "epoch": 1.162570888468809, "format_reward": 0.0, "grad_norm": 0.6905107498168945, "image_reward": 0.26757049560546875, "kl": 0.17389641776680947, "learning_rate": 5e-06, "loss": 0.0273, "reward": 0.2769235372543335, "reward_std": 0.8026977114379406, "rewards/reward_func": 0.2769235372543335, "step": 4920, "toxic_reward": 4.421657228469849 }, { "clip_ratio": 0.0, "completion_length": 41.9, "epoch": 1.1649338374291116, "format_reward": 0.0, "grad_norm": 0.7038688063621521, "image_reward": 0.23498077392578126, "kl": 0.1468098048120737, "learning_rate": 5e-06, "loss": -0.0593, "reward": 0.6022326171398162, "reward_std": 0.8370201224461198, "rewards/reward_func": 0.6022326171398162, "step": 4930, "toxic_reward": 4.272796273231506 }, { "clip_ratio": 0.0, "completion_length": 43.575, "epoch": 1.167296786389414, "format_reward": 0.0, "grad_norm": 3.3997626304626465, "image_reward": 0.2218317672610283, "kl": 10.733999550715088, "learning_rate": 5e-06, "loss": -0.0239, "reward": 0.5406073331832886, "reward_std": 1.1294488459825516, "rewards/reward_func": 0.5406073331832886, "step": 4940, "toxic_reward": 4.010496520996094 }, { "clip_ratio": 0.0, "completion_length": 42.75, "epoch": 1.1696597353497165, "format_reward": 0.0, "grad_norm": 13.437244415283203, "image_reward": 0.260540771484375, "kl": 0.4533839326351881, "learning_rate": 5e-06, "loss": 0.0308, "reward": 0.6349693357944488, "reward_std": 0.9300125196576119, "rewards/reward_func": 0.6349693357944488, "step": 4950, "toxic_reward": 3.9825836658477782 }, { "clip_ratio": 0.0, "completion_length": 48.275, "epoch": 1.172022684310019, "format_reward": 0.0, "grad_norm": 5.643482208251953, "image_reward": 0.25181121826171876, "kl": 0.7526591405272484, "learning_rate": 5e-06, "loss": 0.0702, "reward": 0.2168402910232544, "reward_std": 0.8874317653477192, "rewards/reward_func": 0.2168402910232544, "step": 4960, "toxic_reward": 4.36525526046753 }, { "clip_ratio": 0.0, "completion_length": 52.125, "epoch": 1.1743856332703213, "format_reward": -0.25, "grad_norm": 2.1135120391845703, "image_reward": 0.27721354067325593, "kl": 1.7936469875276089, "learning_rate": 5e-06, "loss": 0.0616, "reward": -0.168658310174942, "reward_std": 1.076946148276329, "rewards/reward_func": -0.168658310174942, "step": 4970, "toxic_reward": 4.423034191131592 }, { "clip_ratio": 0.0, "completion_length": 41.2, "epoch": 1.1767485822306238, "format_reward": 0.0, "grad_norm": 1.0600641965866089, "image_reward": 0.22789459228515624, "kl": 3.3630725659430025, "learning_rate": 5e-06, "loss": 0.0228, "reward": 0.7056062936782836, "reward_std": 0.9683291807770729, "rewards/reward_func": 0.7056062936782836, "step": 4980, "toxic_reward": 4.235305881500244 }, { "clip_ratio": 0.0, "completion_length": 42.025, "epoch": 1.1791115311909262, "format_reward": -0.25, "grad_norm": 1.4251501560211182, "image_reward": 0.25722147673368456, "kl": 0.2127727370709181, "learning_rate": 5e-06, "loss": -0.0369, "reward": 0.7460228025913238, "reward_std": 1.3902123406529427, "rewards/reward_func": 0.7460228025913238, "step": 4990, "toxic_reward": 4.187235593795776 }, { "clip_ratio": 0.0, "completion_length": 43.85, "epoch": 1.1814744801512287, "format_reward": 0.0, "grad_norm": 0.9059237837791443, "image_reward": 0.275982666015625, "kl": 0.1094449780881405, "learning_rate": 5e-06, "loss": 0.0122, "reward": -0.013343071937561036, "reward_std": 0.8927877993322909, "rewards/reward_func": -0.013343071937561036, "step": 5000, "toxic_reward": 4.172649383544922 }, { "clip_ratio": 0.0, "completion_length": 54.475, "epoch": 1.1838374291115312, "format_reward": -0.25, "grad_norm": 0.676426887512207, "image_reward": 0.25437113344669343, "kl": 0.306893527135253, "learning_rate": 5e-06, "loss": 0.0597, "reward": 0.5460815012454987, "reward_std": 1.3148551121354104, "rewards/reward_func": 0.5460815012454987, "step": 5010, "toxic_reward": 4.169591236114502 }, { "clip_ratio": 0.0, "completion_length": 41.5, "epoch": 1.1862003780718338, "format_reward": -0.25, "grad_norm": 1.0359044075012207, "image_reward": 0.25831960141658783, "kl": 0.11788953803479671, "learning_rate": 5e-06, "loss": 0.0199, "reward": 0.008247452974319457, "reward_std": 1.6603192906826734, "rewards/reward_func": 0.008247452974319457, "step": 5020, "toxic_reward": 4.008079314231873 }, { "clip_ratio": 0.0, "completion_length": 41.7, "epoch": 1.188563327032136, "format_reward": -0.25, "grad_norm": 4.407492637634277, "image_reward": 0.25005086213350297, "kl": 0.16296980381011963, "learning_rate": 5e-06, "loss": 0.013, "reward": 0.45398043394088744, "reward_std": 1.4666540574282407, "rewards/reward_func": 0.45398043394088744, "step": 5030, "toxic_reward": 3.9480291843414306 }, { "clip_ratio": 0.0, "completion_length": 45.05, "epoch": 1.1909262759924386, "format_reward": -0.25, "grad_norm": 1.3405718803405762, "image_reward": 0.26083475798368455, "kl": 1.2570629265159368, "learning_rate": 5e-06, "loss": -0.0608, "reward": 0.24062097072601318, "reward_std": 1.1102397807873785, "rewards/reward_func": 0.24062097072601318, "step": 5040, "toxic_reward": 4.282037019729614 }, { "clip_ratio": 0.0, "completion_length": 52.475, "epoch": 1.193289224952741, "format_reward": -0.25, "grad_norm": 1.0421810150146484, "image_reward": 0.2481842041015625, "kl": 0.4886137153953314, "learning_rate": 5e-06, "loss": 0.0154, "reward": 0.35102577805519103, "reward_std": 1.4176109634339809, "rewards/reward_func": 0.35102577805519103, "step": 5050, "toxic_reward": 4.56660737991333 }, { "clip_ratio": 0.0, "completion_length": 49.775, "epoch": 1.1956521739130435, "format_reward": 0.0, "grad_norm": 2.4514474868774414, "image_reward": 0.2724589020013809, "kl": 17.261842382885515, "learning_rate": 5e-06, "loss": 0.0799, "reward": 0.2634397208690643, "reward_std": 0.6655941482633352, "rewards/reward_func": 0.2634397208690643, "step": 5060, "toxic_reward": 4.50599045753479 }, { "clip_ratio": 0.0, "completion_length": 42.675, "epoch": 1.198015122873346, "format_reward": -0.25, "grad_norm": 4.386458396911621, "image_reward": 0.2706837967038155, "kl": 0.41296282410621643, "learning_rate": 5e-06, "loss": -0.0407, "reward": 0.3763133823871613, "reward_std": 1.3990098256617785, "rewards/reward_func": 0.3763133823871613, "step": 5070, "toxic_reward": 3.8180208444595336 }, { "clip_ratio": 0.0, "completion_length": 43.55, "epoch": 1.2003780718336483, "format_reward": 0.0, "grad_norm": 3.708019495010376, "image_reward": 0.2523040771484375, "kl": 0.13607071787118913, "learning_rate": 5e-06, "loss": -0.0338, "reward": 0.09913046360015869, "reward_std": 0.64256557286717, "rewards/reward_func": 0.09913046360015869, "step": 5080, "toxic_reward": 4.473843407630921 }, { "clip_ratio": 0.0, "completion_length": 47.85, "epoch": 1.2027410207939508, "format_reward": 0.0, "grad_norm": 0.9288604855537415, "image_reward": 0.24779205322265624, "kl": 0.20878240577876567, "learning_rate": 5e-06, "loss": 0.041, "reward": 0.5819396436214447, "reward_std": 0.7615427184849978, "rewards/reward_func": 0.5819396436214447, "step": 5090, "toxic_reward": 4.673109149932861 }, { "clip_ratio": 0.0, "completion_length": 46.825, "epoch": 1.2051039697542534, "format_reward": -0.75, "grad_norm": 1.3514373302459717, "image_reward": 0.24433186948299407, "kl": 0.2980830356478691, "learning_rate": 5e-06, "loss": -0.0794, "reward": -0.4275161147117615, "reward_std": 2.30497971996665, "rewards/reward_func": -0.4275161147117615, "step": 5100, "toxic_reward": 4.221112084388733 }, { "clip_ratio": 0.0, "completion_length": 34.775, "epoch": 1.2074669187145557, "format_reward": 0.0, "grad_norm": 1.3809269666671753, "image_reward": 0.2739410400390625, "kl": 4.718816532939672, "learning_rate": 5e-06, "loss": -0.1045, "reward": 0.23022666573524475, "reward_std": 0.9735932052135468, "rewards/reward_func": 0.23022666573524475, "step": 5110, "toxic_reward": 3.8542242765426638 }, { "clip_ratio": 0.0, "completion_length": 51.125, "epoch": 1.2098298676748582, "format_reward": -0.75, "grad_norm": 1.045753836631775, "image_reward": 0.243878173828125, "kl": 0.15604666136205197, "learning_rate": 5e-06, "loss": 0.0351, "reward": -0.7461395561695099, "reward_std": 2.103620085120201, "rewards/reward_func": -0.7461395561695099, "step": 5120, "toxic_reward": 4.052614498138428 }, { "clip_ratio": 0.0, "completion_length": 37.3, "epoch": 1.2121928166351608, "format_reward": 0.0, "grad_norm": 2.4709815979003906, "image_reward": 0.25801239013671873, "kl": 0.1505513045936823, "learning_rate": 5e-06, "loss": -0.073, "reward": 0.789186455309391, "reward_std": 1.0413845662027597, "rewards/reward_func": 0.789186455309391, "step": 5130, "toxic_reward": 3.886520874500275 }, { "clip_ratio": 0.0, "completion_length": 38.575, "epoch": 1.214555765595463, "format_reward": 0.0, "grad_norm": 3.3710083961486816, "image_reward": 0.2646331787109375, "kl": 0.1121824998408556, "learning_rate": 5e-06, "loss": 0.1077, "reward": 0.6361587151885033, "reward_std": 0.6423972092568875, "rewards/reward_func": 0.6361587151885033, "step": 5140, "toxic_reward": 4.181642347574234 }, { "clip_ratio": 0.0, "completion_length": 51.625, "epoch": 1.2169187145557656, "format_reward": 0.0, "grad_norm": 1.4757941961288452, "image_reward": 0.2700469970703125, "kl": 0.808637504093349, "learning_rate": 5e-06, "loss": 0.0188, "reward": 0.2734032437205315, "reward_std": 0.8411962412297725, "rewards/reward_func": 0.2734032437205315, "step": 5150, "toxic_reward": 3.6458971202373505 }, { "clip_ratio": 0.0, "completion_length": 51.725, "epoch": 1.2192816635160681, "format_reward": -0.25, "grad_norm": 0.66521817445755, "image_reward": 0.2667388916015625, "kl": 0.4425561033189297, "learning_rate": 5e-06, "loss": -0.0435, "reward": 0.35035309493541716, "reward_std": 1.6248657763004304, "rewards/reward_func": 0.35035309493541716, "step": 5160, "toxic_reward": 3.7190463662147524 }, { "clip_ratio": 0.0, "completion_length": 53.475, "epoch": 1.2216446124763705, "format_reward": 0.0, "grad_norm": 8.705077171325684, "image_reward": 0.259429931640625, "kl": 0.7811562133952975, "learning_rate": 5e-06, "loss": 0.0244, "reward": 0.3771729826927185, "reward_std": 1.326733610033989, "rewards/reward_func": 0.3771729826927185, "step": 5170, "toxic_reward": 3.6760028123855593 }, { "clip_ratio": 0.0, "completion_length": 46.975, "epoch": 1.224007561436673, "format_reward": 0.0, "grad_norm": 14.75157356262207, "image_reward": 0.22822214663028717, "kl": 45.12481062971055, "learning_rate": 5e-06, "loss": -0.0908, "reward": 0.6647323310375214, "reward_std": 1.0134072445333004, "rewards/reward_func": 0.6647323310375214, "step": 5180, "toxic_reward": 4.210642290115357 }, { "clip_ratio": 0.0, "completion_length": 50.4, "epoch": 1.2263705103969755, "format_reward": -0.25, "grad_norm": 13.71264934539795, "image_reward": 0.23838348388671876, "kl": 0.2953592788428068, "learning_rate": 5e-06, "loss": -0.0801, "reward": 0.4181412994861603, "reward_std": 1.2768891528248787, "rewards/reward_func": 0.4181412994861603, "step": 5190, "toxic_reward": 4.502067589759827 }, { "clip_ratio": 0.0, "completion_length": 55.625, "epoch": 1.2287334593572778, "format_reward": 0.0, "grad_norm": 7.2092390060424805, "image_reward": 0.2596160888671875, "kl": 0.15071408227086067, "learning_rate": 5e-06, "loss": 0.0359, "reward": 0.46884081363677976, "reward_std": 0.9004301078617573, "rewards/reward_func": 0.46884081363677976, "step": 5200, "toxic_reward": 4.537938523292541 }, { "clip_ratio": 0.0, "completion_length": 37.55, "epoch": 1.2310964083175804, "format_reward": 0.0, "grad_norm": 1.4807243347167969, "image_reward": 0.25061492919921874, "kl": 0.39518592432141303, "learning_rate": 5e-06, "loss": 0.0062, "reward": 0.7565897464752197, "reward_std": 0.6514241144061088, "rewards/reward_func": 0.7565897464752197, "step": 5210, "toxic_reward": 4.779706335067749 }, { "clip_ratio": 0.0, "completion_length": 37.15, "epoch": 1.2334593572778827, "format_reward": 0.0, "grad_norm": 12.918940544128418, "image_reward": 0.266143798828125, "kl": 1.6246791556477547, "learning_rate": 5e-06, "loss": -0.0669, "reward": 1.0145411103963853, "reward_std": 0.7731746513396501, "rewards/reward_func": 1.0145411103963853, "step": 5220, "toxic_reward": 3.9364122271537783 }, { "clip_ratio": 0.0, "completion_length": 47.8, "epoch": 1.2358223062381852, "format_reward": -0.5, "grad_norm": 8.1648530960083, "image_reward": 0.256298828125, "kl": 0.3491713672876358, "learning_rate": 5e-06, "loss": -0.0294, "reward": 0.18980904817581176, "reward_std": 1.4395622819662095, "rewards/reward_func": 0.18980904817581176, "step": 5230, "toxic_reward": 4.15175496339798 }, { "clip_ratio": 0.0, "completion_length": 43.675, "epoch": 1.2381852551984878, "format_reward": -0.25, "grad_norm": 7.493502140045166, "image_reward": 0.2956329345703125, "kl": 3.9262495055794715, "learning_rate": 5e-06, "loss": 0.0129, "reward": 0.19967559576034546, "reward_std": 1.4724704299122096, "rewards/reward_func": 0.19967559576034546, "step": 5240, "toxic_reward": 3.676086974143982 }, { "clip_ratio": 0.0, "completion_length": 47.95, "epoch": 1.24054820415879, "format_reward": 0.0, "grad_norm": 7.836026668548584, "image_reward": 0.262060546875, "kl": 0.5677594847977161, "learning_rate": 5e-06, "loss": -0.0137, "reward": 1.0836671590805054, "reward_std": 0.9185017041862011, "rewards/reward_func": 1.0836671590805054, "step": 5250, "toxic_reward": 4.442173409461975 }, { "clip_ratio": 0.0, "completion_length": 35.925, "epoch": 1.2429111531190926, "format_reward": -0.25, "grad_norm": 17.290130615234375, "image_reward": 0.256195068359375, "kl": 0.3261503577232361, "learning_rate": 5e-06, "loss": -0.0665, "reward": 0.4270883619785309, "reward_std": 1.5899662226438522, "rewards/reward_func": 0.4270883619785309, "step": 5260, "toxic_reward": 3.6384164452552796 }, { "clip_ratio": 0.0, "completion_length": 39.425, "epoch": 1.2452741020793952, "format_reward": 0.0, "grad_norm": 19.655460357666016, "image_reward": 0.271282958984375, "kl": 1.0409250572323798, "learning_rate": 5e-06, "loss": -0.0411, "reward": 0.7604422211647034, "reward_std": 0.6456888254731894, "rewards/reward_func": 0.7604422211647034, "step": 5270, "toxic_reward": 3.7677977979183197 }, { "clip_ratio": 0.0, "completion_length": 59.775, "epoch": 1.2476370510396975, "format_reward": -0.25, "grad_norm": 0.5878366827964783, "image_reward": 0.2589070647954941, "kl": 0.16051149740815163, "learning_rate": 5e-06, "loss": 0.0852, "reward": 0.39556344896554946, "reward_std": 1.1551922081038355, "rewards/reward_func": 0.39556344896554946, "step": 5280, "toxic_reward": 3.390644001960754 }, { "clip_ratio": 0.0, "completion_length": 42.275, "epoch": 1.25, "format_reward": 0.0, "grad_norm": 8.206055641174316, "image_reward": 0.275860595703125, "kl": 0.4969006285071373, "learning_rate": 5e-06, "loss": -0.115, "reward": 0.4857667863368988, "reward_std": 0.8739027962088585, "rewards/reward_func": 0.4857667863368988, "step": 5290, "toxic_reward": 4.016790902614593 }, { "clip_ratio": 0.0, "completion_length": 48.7, "epoch": 1.2523629489603025, "format_reward": 0.0, "grad_norm": 3.513704299926758, "image_reward": 0.24937744140625, "kl": 0.14417755380272865, "learning_rate": 5e-06, "loss": 0.0317, "reward": 0.48732776641845704, "reward_std": 0.8942459903657436, "rewards/reward_func": 0.48732776641845704, "step": 5300, "toxic_reward": 4.074605274200439 }, { "clip_ratio": 0.0, "completion_length": 46.7, "epoch": 1.2547258979206048, "format_reward": 0.0, "grad_norm": 3.694108724594116, "image_reward": 0.2640960693359375, "kl": 0.21989786028862, "learning_rate": 5e-06, "loss": 0.0552, "reward": 0.20011116266250611, "reward_std": 0.9783342686016112, "rewards/reward_func": 0.20011116266250611, "step": 5310, "toxic_reward": 3.337161436676979 }, { "clip_ratio": 0.0, "completion_length": 61.15, "epoch": 1.2570888468809074, "format_reward": -0.25, "grad_norm": 1.8417941331863403, "image_reward": 0.23155619353055953, "kl": 4.248336365818977, "learning_rate": 5e-06, "loss": 0.0877, "reward": 0.23556498885154725, "reward_std": 0.9007356996648014, "rewards/reward_func": 0.23556498885154725, "step": 5320, "toxic_reward": 4.5890906810760494 }, { "clip_ratio": 0.0, "completion_length": 50.475, "epoch": 1.2594517958412097, "format_reward": 0.0, "grad_norm": 4.823044300079346, "image_reward": 0.2513310745358467, "kl": 4.037289990484714, "learning_rate": 5e-06, "loss": 0.082, "reward": 0.8450765609741211, "reward_std": 0.8255521267652511, "rewards/reward_func": 0.8450765609741211, "step": 5330, "toxic_reward": 4.287392568588257 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 1.2618147448015122, "format_reward": 0.0, "grad_norm": 4.315946578979492, "image_reward": 0.2459075927734375, "kl": 0.3113373316824436, "learning_rate": 5e-06, "loss": 0.0475, "reward": 0.645756970345974, "reward_std": 0.7255122657865286, "rewards/reward_func": 0.645756970345974, "step": 5340, "toxic_reward": 4.189401495456695 }, { "clip_ratio": 0.0, "completion_length": 39.7, "epoch": 1.2641776937618148, "format_reward": 0.0, "grad_norm": 3.4027810096740723, "image_reward": 0.2551483154296875, "kl": 0.4323126286268234, "learning_rate": 5e-06, "loss": -0.0048, "reward": 0.5660954803228379, "reward_std": 0.6791210256516933, "rewards/reward_func": 0.5660954803228379, "step": 5350, "toxic_reward": 3.2965795576572416 }, { "clip_ratio": 0.0, "completion_length": 35.55, "epoch": 1.2665406427221173, "format_reward": 0.0, "grad_norm": 1.9737337827682495, "image_reward": 0.2604766845703125, "kl": 0.8922965943813324, "learning_rate": 5e-06, "loss": 0.0886, "reward": 0.37445068359375, "reward_std": 0.7902419693768025, "rewards/reward_func": 0.37445068359375, "step": 5360, "toxic_reward": 3.6073597192764284 }, { "clip_ratio": 0.0, "completion_length": 50.975, "epoch": 1.2689035916824196, "format_reward": -0.25, "grad_norm": 5.368748188018799, "image_reward": 0.2573964446783066, "kl": 0.8937133550643921, "learning_rate": 5e-06, "loss": 0.0345, "reward": 1.1729332506656647, "reward_std": 1.3139135614037514, "rewards/reward_func": 1.1729332506656647, "step": 5370, "toxic_reward": 4.428536581993103 }, { "clip_ratio": 0.0, "completion_length": 47.375, "epoch": 1.2712665406427222, "format_reward": 0.0, "grad_norm": 2.3669607639312744, "image_reward": 0.242498779296875, "kl": 0.6131832510232925, "learning_rate": 5e-06, "loss": -0.1065, "reward": 0.3124019861221313, "reward_std": 0.8398781210184098, "rewards/reward_func": 0.3124019861221313, "step": 5380, "toxic_reward": 3.9513532400131224 }, { "clip_ratio": 0.0, "completion_length": 62.325, "epoch": 1.2736294896030245, "format_reward": 0.0, "grad_norm": 3.6428773403167725, "image_reward": 0.2564788818359375, "kl": 0.983223095536232, "learning_rate": 5e-06, "loss": -0.0332, "reward": 0.8021515548229218, "reward_std": 0.8680705142207443, "rewards/reward_func": 0.8021515548229218, "step": 5390, "toxic_reward": 3.7623249292373657 }, { "clip_ratio": 0.0, "completion_length": 41.375, "epoch": 1.275992438563327, "format_reward": 0.0, "grad_norm": 22.341930389404297, "image_reward": 0.25401458740234373, "kl": 0.7686945527791977, "learning_rate": 5e-06, "loss": 0.021, "reward": 0.18261390328407287, "reward_std": 0.39404432671144607, "rewards/reward_func": 0.18261390328407287, "step": 5400, "toxic_reward": 3.986022639274597 }, { "clip_ratio": 0.0, "completion_length": 43.075, "epoch": 1.2783553875236295, "format_reward": -0.5, "grad_norm": 11.878053665161133, "image_reward": 0.24981587678194045, "kl": 0.4886711150407791, "learning_rate": 5e-06, "loss": -0.0214, "reward": 0.06913218498229981, "reward_std": 1.5105109971016646, "rewards/reward_func": 0.06913218498229981, "step": 5410, "toxic_reward": 4.118505048751831 }, { "clip_ratio": 0.0, "completion_length": 43.65, "epoch": 1.280718336483932, "format_reward": -0.25, "grad_norm": 7.851999759674072, "image_reward": 0.25881449431180953, "kl": 0.6457854598760605, "learning_rate": 5e-06, "loss": 0.0958, "reward": 0.12992151379585265, "reward_std": 1.303325356543064, "rewards/reward_func": 0.12992151379585265, "step": 5420, "toxic_reward": 4.377404046058655 }, { "clip_ratio": 0.0, "completion_length": 39.9, "epoch": 1.2830812854442344, "format_reward": -0.25, "grad_norm": 25.7547550201416, "image_reward": 0.2694793701171875, "kl": 1.677524197101593, "learning_rate": 5e-06, "loss": 0.0859, "reward": 0.34250465631484983, "reward_std": 1.0538076907396317, "rewards/reward_func": 0.34250465631484983, "step": 5430, "toxic_reward": 4.271343016624451 }, { "clip_ratio": 0.0, "completion_length": 46.875, "epoch": 1.285444234404537, "format_reward": 0.0, "grad_norm": 1.9158964157104492, "image_reward": 0.2666290283203125, "kl": 0.5966441169381141, "learning_rate": 5e-06, "loss": -0.0566, "reward": 0.4593892157077789, "reward_std": 0.6576637156307698, "rewards/reward_func": 0.4593892157077789, "step": 5440, "toxic_reward": 4.204905700683594 }, { "clip_ratio": 0.0, "completion_length": 47.525, "epoch": 1.2878071833648392, "format_reward": 0.0, "grad_norm": 1.6007134914398193, "image_reward": 0.2621429443359375, "kl": 1.1984394997358323, "learning_rate": 5e-06, "loss": -0.0486, "reward": 0.25084500312805175, "reward_std": 1.5825427711009978, "rewards/reward_func": 0.25084500312805175, "step": 5450, "toxic_reward": 3.685545027256012 }, { "clip_ratio": 0.0, "completion_length": 38.45, "epoch": 1.2901701323251418, "format_reward": -0.25, "grad_norm": 25.30818748474121, "image_reward": 0.2647552490234375, "kl": 6.056701734662056, "learning_rate": 5e-06, "loss": 0.0599, "reward": 0.25077282190322875, "reward_std": 1.234234382212162, "rewards/reward_func": 0.25077282190322875, "step": 5460, "toxic_reward": 4.593598937988281 }, { "clip_ratio": 0.0, "completion_length": 43.825, "epoch": 1.2925330812854443, "format_reward": 0.0, "grad_norm": 16.34868812561035, "image_reward": 0.2645416259765625, "kl": 1.6588621526956557, "learning_rate": 5e-06, "loss": 0.0054, "reward": 0.5463581264019013, "reward_std": 0.7020838841795921, "rewards/reward_func": 0.5463581264019013, "step": 5470, "toxic_reward": 4.367759561538696 }, { "clip_ratio": 0.0, "completion_length": 57.5, "epoch": 1.2948960302457466, "format_reward": 0.0, "grad_norm": 19.92365264892578, "image_reward": 0.235223388671875, "kl": 4.412905436754227, "learning_rate": 5e-06, "loss": -0.0277, "reward": 0.301082968711853, "reward_std": 0.5573954021558165, "rewards/reward_func": 0.301082968711853, "step": 5480, "toxic_reward": 4.563822269439697 }, { "clip_ratio": 0.0, "completion_length": 48.45, "epoch": 1.2972589792060492, "format_reward": -0.25, "grad_norm": 15.039924621582031, "image_reward": 0.2686960846185684, "kl": 1.0792655169963836, "learning_rate": 5e-06, "loss": -0.0051, "reward": -0.10340776294469833, "reward_std": 1.0451893661171199, "rewards/reward_func": -0.10340776294469833, "step": 5490, "toxic_reward": 3.9914595246315003 }, { "clip_ratio": 0.0, "completion_length": 49.675, "epoch": 1.2996219281663515, "format_reward": 0.0, "grad_norm": 20.686878204345703, "image_reward": 0.236834716796875, "kl": 0.6705092936754227, "learning_rate": 5e-06, "loss": -0.0657, "reward": 0.7118561029434204, "reward_std": 0.6682203419506549, "rewards/reward_func": 0.7118561029434204, "step": 5500, "toxic_reward": 4.56234884262085 }, { "clip_ratio": 0.0, "completion_length": 45.45, "epoch": 1.301984877126654, "format_reward": -0.25, "grad_norm": 9.817633628845215, "image_reward": 0.2553232818841934, "kl": 0.9995080977678299, "learning_rate": 5e-06, "loss": -0.1257, "reward": 0.27432467341423034, "reward_std": 1.3118387231603266, "rewards/reward_func": 0.27432467341423034, "step": 5510, "toxic_reward": 4.407331418991089 }, { "clip_ratio": 0.0, "completion_length": 51.6, "epoch": 1.3043478260869565, "format_reward": 0.0, "grad_norm": 11.929862976074219, "image_reward": 0.2562835693359375, "kl": 11.621018621325494, "learning_rate": 5e-06, "loss": 0.0747, "reward": 0.29978330433368683, "reward_std": 0.5768878096714616, "rewards/reward_func": 0.29978330433368683, "step": 5520, "toxic_reward": 3.9843369722366333 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 1.306710775047259, "format_reward": -0.5, "grad_norm": 54.56308364868164, "image_reward": 0.25370279848575594, "kl": 15.30106150507927, "learning_rate": 5e-06, "loss": 0.059, "reward": -0.3965187072753906, "reward_std": 1.5167736381292343, "rewards/reward_func": -0.3965187072753906, "step": 5530, "toxic_reward": 4.435292959213257 }, { "clip_ratio": 0.0, "completion_length": 50.35, "epoch": 1.3090737240075614, "format_reward": 0.0, "grad_norm": 3.027195930480957, "image_reward": 0.30064697265625, "kl": 1.0911591410636903, "learning_rate": 5e-06, "loss": -0.0513, "reward": 0.3132080137729645, "reward_std": 0.5429811116307974, "rewards/reward_func": 0.3132080137729645, "step": 5540, "toxic_reward": 4.454129576683044 }, { "clip_ratio": 0.0, "completion_length": 42.2, "epoch": 1.311436672967864, "format_reward": 0.0, "grad_norm": 14.916865348815918, "image_reward": 0.2515268951654434, "kl": 5.700361841917038, "learning_rate": 5e-06, "loss": 0.0014, "reward": 0.014350098371505738, "reward_std": 0.48063138537108896, "rewards/reward_func": 0.014350098371505738, "step": 5550, "toxic_reward": 4.399230480194092 }, { "clip_ratio": 0.0, "completion_length": 43.975, "epoch": 1.3137996219281662, "format_reward": -0.25, "grad_norm": 17.978458404541016, "image_reward": 0.24458109587430954, "kl": 4.160827812552452, "learning_rate": 5e-06, "loss": -0.0048, "reward": 0.29164408445358275, "reward_std": 0.9224476981908083, "rewards/reward_func": 0.29164408445358275, "step": 5560, "toxic_reward": 4.361080431938172 }, { "clip_ratio": 0.0, "completion_length": 42.7, "epoch": 1.3161625708884688, "format_reward": 0.0, "grad_norm": 2.0734810829162598, "image_reward": 0.248583984375, "kl": 2.431508493423462, "learning_rate": 5e-06, "loss": -0.0346, "reward": 0.49899758100509645, "reward_std": 0.9045591181144118, "rewards/reward_func": 0.49899758100509645, "step": 5570, "toxic_reward": 4.267354512214661 }, { "clip_ratio": 0.0, "completion_length": 51.675, "epoch": 1.3185255198487713, "format_reward": -0.25, "grad_norm": 7.6361165046691895, "image_reward": 0.27701314240694047, "kl": 1.473704105615616, "learning_rate": 5e-06, "loss": 0.0156, "reward": 0.5548757612705231, "reward_std": 0.9885425483807921, "rewards/reward_func": 0.5548757612705231, "step": 5580, "toxic_reward": 4.615298962593078 }, { "clip_ratio": 0.0, "completion_length": 45.35, "epoch": 1.3208884688090738, "format_reward": 0.0, "grad_norm": 8.7377290725708, "image_reward": 0.2538330078125, "kl": 197.4472616136074, "learning_rate": 5e-06, "loss": 0.1633, "reward": 0.6491668224334717, "reward_std": 0.6371353514492512, "rewards/reward_func": 0.6491668224334717, "step": 5590, "toxic_reward": 4.438857316970825 }, { "clip_ratio": 0.0, "completion_length": 47.575, "epoch": 1.3232514177693762, "format_reward": 0.0, "grad_norm": 16.95717430114746, "image_reward": 0.23577117919921875, "kl": 3.680394399166107, "learning_rate": 5e-06, "loss": -0.0084, "reward": 0.1440478801727295, "reward_std": 0.4425256311893463, "rewards/reward_func": 0.1440478801727295, "step": 5600, "toxic_reward": 4.603517079353333 }, { "clip_ratio": 0.0, "completion_length": 55.25, "epoch": 1.3256143667296787, "format_reward": 0.0, "grad_norm": 12.732531547546387, "image_reward": 0.2568023681640625, "kl": 2.256978714466095, "learning_rate": 5e-06, "loss": 0.0219, "reward": 0.040336894989013675, "reward_std": 0.6101976454257965, "rewards/reward_func": 0.040336894989013675, "step": 5610, "toxic_reward": 4.473554587364196 }, { "clip_ratio": 0.0, "completion_length": 50.75, "epoch": 1.327977315689981, "format_reward": 0.0, "grad_norm": 14.701716423034668, "image_reward": 0.252197265625, "kl": 8.072559344768525, "learning_rate": 5e-06, "loss": 0.0803, "reward": 0.8639614999294281, "reward_std": 1.0928052112460136, "rewards/reward_func": 0.8639614999294281, "step": 5620, "toxic_reward": 3.9843992233276366 }, { "clip_ratio": 0.0, "completion_length": 46.8, "epoch": 1.3303402646502835, "format_reward": 0.0, "grad_norm": 10.12986946105957, "image_reward": 0.27263641357421875, "kl": 3.3592694640159606, "learning_rate": 5e-06, "loss": 0.0652, "reward": 0.7886571228504181, "reward_std": 1.0737986475229264, "rewards/reward_func": 0.7886571228504181, "step": 5630, "toxic_reward": 3.668324041366577 }, { "clip_ratio": 0.0, "completion_length": 37.95, "epoch": 1.332703213610586, "format_reward": 0.0, "grad_norm": 23.82711410522461, "image_reward": 0.2702301025390625, "kl": 12.466990399360657, "learning_rate": 5e-06, "loss": -0.0401, "reward": 0.7557259559631347, "reward_std": 0.9376067817211151, "rewards/reward_func": 0.7557259559631347, "step": 5640, "toxic_reward": 4.100273895263672 }, { "clip_ratio": 0.0, "completion_length": 46.625, "epoch": 1.3350661625708884, "format_reward": 0.0, "grad_norm": 7.783689975738525, "image_reward": 0.2736572265625, "kl": 9.604325413703918, "learning_rate": 5e-06, "loss": 0.0135, "reward": 0.21887901425361633, "reward_std": 0.41371094444766643, "rewards/reward_func": 0.21887901425361633, "step": 5650, "toxic_reward": 4.314438569545746 }, { "clip_ratio": 0.0, "completion_length": 54.125, "epoch": 1.337429111531191, "format_reward": 0.0, "grad_norm": 8.773420333862305, "image_reward": 0.2513946533203125, "kl": 9.296885073184967, "learning_rate": 5e-06, "loss": -0.095, "reward": 1.1378837168216704, "reward_std": 0.818750386312604, "rewards/reward_func": 1.1378837168216704, "step": 5660, "toxic_reward": 4.522894716262817 }, { "clip_ratio": 0.0, "completion_length": 50.9, "epoch": 1.3397920604914935, "format_reward": -0.25, "grad_norm": 14.245692253112793, "image_reward": 0.24806925505399705, "kl": 6.753875517845154, "learning_rate": 5e-06, "loss": 0.021, "reward": 0.6894584268331527, "reward_std": 1.542804090678692, "rewards/reward_func": 0.6894584268331527, "step": 5670, "toxic_reward": 3.9723486423492433 }, { "clip_ratio": 0.0, "completion_length": 45.075, "epoch": 1.3421550094517958, "format_reward": -0.25, "grad_norm": 24.408653259277344, "image_reward": 0.25640462189912794, "kl": 11.201071047782898, "learning_rate": 5e-06, "loss": -0.0492, "reward": -0.00045427381992340087, "reward_std": 1.027926566079259, "rewards/reward_func": -0.00045427381992340087, "step": 5680, "toxic_reward": 3.891611325740814 }, { "clip_ratio": 0.0, "completion_length": 44.65, "epoch": 1.3445179584120983, "format_reward": 0.0, "grad_norm": 1.8934930562973022, "image_reward": 0.25365397036075593, "kl": 4.947464096546173, "learning_rate": 5e-06, "loss": -0.0784, "reward": 0.5487861603498458, "reward_std": 0.7702463563531637, "rewards/reward_func": 0.5487861603498458, "step": 5690, "toxic_reward": 3.977373069524765 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 1.3468809073724008, "format_reward": -0.25, "grad_norm": 9.322084426879883, "image_reward": 0.27449544221162797, "kl": 2.6174604117870333, "learning_rate": 5e-06, "loss": -0.0094, "reward": 0.39208410382270814, "reward_std": 1.61805320084095, "rewards/reward_func": 0.39208410382270814, "step": 5700, "toxic_reward": 4.174729800224304 }, { "clip_ratio": 0.0, "completion_length": 37.25, "epoch": 1.3492438563327032, "format_reward": 0.0, "grad_norm": 12.4689302444458, "image_reward": 0.2571044921875, "kl": 3.102731728553772, "learning_rate": 5e-06, "loss": -0.0306, "reward": 0.24812114238739014, "reward_std": 0.6699782099574805, "rewards/reward_func": 0.24812114238739014, "step": 5710, "toxic_reward": 3.692942750453949 }, { "clip_ratio": 0.0, "completion_length": 42.425, "epoch": 1.3516068052930057, "format_reward": 0.0, "grad_norm": 16.464384078979492, "image_reward": 0.2592987060546875, "kl": 41.42341262102127, "learning_rate": 5e-06, "loss": -0.1787, "reward": 0.9784101039171219, "reward_std": 1.2197245783172548, "rewards/reward_func": 0.9784101039171219, "step": 5720, "toxic_reward": 3.5939176797866823 }, { "clip_ratio": 0.0, "completion_length": 41.775, "epoch": 1.353969754253308, "format_reward": 0.0, "grad_norm": 14.272177696228027, "image_reward": 0.24337158203125, "kl": 3.5139986366033553, "learning_rate": 5e-06, "loss": -0.0502, "reward": 0.3250808596611023, "reward_std": 0.6109479434788228, "rewards/reward_func": 0.3250808596611023, "step": 5730, "toxic_reward": 4.485757279396057 }, { "clip_ratio": 0.0, "completion_length": 48.6, "epoch": 1.3563327032136105, "format_reward": -0.25, "grad_norm": 8.131665229797363, "image_reward": 0.2514506012201309, "kl": 5.592804127931595, "learning_rate": 5e-06, "loss": -0.015, "reward": 0.3052162408828735, "reward_std": 1.201428510248661, "rewards/reward_func": 0.3052162408828735, "step": 5740, "toxic_reward": 4.2779217004776005 }, { "clip_ratio": 0.0, "completion_length": 44.275, "epoch": 1.358695652173913, "format_reward": -0.5, "grad_norm": 15.648195266723633, "image_reward": 0.266064453125, "kl": 1.6513773769140243, "learning_rate": 5e-06, "loss": 0.0092, "reward": -0.20032901763916017, "reward_std": 1.7222102746367454, "rewards/reward_func": -0.20032901763916017, "step": 5750, "toxic_reward": 4.259865856170654 }, { "clip_ratio": 0.0, "completion_length": 56.0, "epoch": 1.3610586011342156, "format_reward": 0.0, "grad_norm": 10.893685340881348, "image_reward": 0.2588653564453125, "kl": 4.073341834545135, "learning_rate": 5e-06, "loss": 0.013, "reward": 0.916795802116394, "reward_std": 0.8524092853069305, "rewards/reward_func": 0.916795802116394, "step": 5760, "toxic_reward": 4.560049152374267 }, { "clip_ratio": 0.0, "completion_length": 52.75, "epoch": 1.363421550094518, "format_reward": 0.0, "grad_norm": 3.932856798171997, "image_reward": 0.2459381103515625, "kl": 2.5305844336748122, "learning_rate": 5e-06, "loss": 0.0338, "reward": 0.5017880856990814, "reward_std": 0.7364757396280766, "rewards/reward_func": 0.5017880856990814, "step": 5770, "toxic_reward": 4.69781801700592 }, { "clip_ratio": 0.0, "completion_length": 50.45, "epoch": 1.3657844990548205, "format_reward": 0.0, "grad_norm": 1.3704707622528076, "image_reward": 0.2677764892578125, "kl": 1.8369466960430145, "learning_rate": 5e-06, "loss": 0.107, "reward": 0.7046410620212555, "reward_std": 0.9321951523423195, "rewards/reward_func": 0.7046410620212555, "step": 5780, "toxic_reward": 4.073530220985413 }, { "clip_ratio": 0.0, "completion_length": 45.8, "epoch": 1.3681474480151228, "format_reward": -0.25, "grad_norm": 2.7950003147125244, "image_reward": 0.268048095703125, "kl": 3.3737578272819517, "learning_rate": 5e-06, "loss": 0.0251, "reward": 0.1121946096420288, "reward_std": 1.2333336278796196, "rewards/reward_func": 0.1121946096420288, "step": 5790, "toxic_reward": 4.301294279098511 }, { "clip_ratio": 0.0, "completion_length": 44.9, "epoch": 1.3705103969754253, "format_reward": -0.25, "grad_norm": 10.600517272949219, "image_reward": 0.24301045686006545, "kl": 5.24166065454483, "learning_rate": 5e-06, "loss": -0.0188, "reward": -0.04525191783905029, "reward_std": 1.1580330106429755, "rewards/reward_func": -0.04525191783905029, "step": 5800, "toxic_reward": 3.95846186876297 }, { "clip_ratio": 0.0, "completion_length": 52.6, "epoch": 1.3728733459357279, "format_reward": -0.5, "grad_norm": 22.423450469970703, "image_reward": 0.25951487123966216, "kl": 12.250067234039307, "learning_rate": 5e-06, "loss": 0.0091, "reward": 0.0036635279655456545, "reward_std": 1.6421116095036268, "rewards/reward_func": 0.0036635279655456545, "step": 5810, "toxic_reward": 4.407265400886535 }, { "clip_ratio": 0.0, "completion_length": 45.825, "epoch": 1.3752362948960302, "format_reward": 0.0, "grad_norm": 3.5839083194732666, "image_reward": 0.2497711181640625, "kl": 7.638963532447815, "learning_rate": 5e-06, "loss": -0.0623, "reward": 0.36217689514160156, "reward_std": 1.057050895690918, "rewards/reward_func": 0.36217689514160156, "step": 5820, "toxic_reward": 3.8359474897384644 }, { "clip_ratio": 0.0, "completion_length": 42.075, "epoch": 1.3775992438563327, "format_reward": 0.0, "grad_norm": 18.257360458374023, "image_reward": 0.242034912109375, "kl": 1406.6461040258407, "learning_rate": 5e-06, "loss": 0.3409, "reward": 0.35478733479976654, "reward_std": 0.5706452172249555, "rewards/reward_func": 0.35478733479976654, "step": 5830, "toxic_reward": 3.5973093271255494 }, { "clip_ratio": 0.0, "completion_length": 39.55, "epoch": 1.3799621928166352, "format_reward": 0.0, "grad_norm": 4.36010217666626, "image_reward": 0.24814300537109374, "kl": 117.68144319057464, "learning_rate": 5e-06, "loss": 0.0637, "reward": 0.3609376668930054, "reward_std": 0.6294937739614397, "rewards/reward_func": 0.3609376668930054, "step": 5840, "toxic_reward": 4.1664423704147335 }, { "clip_ratio": 0.0, "completion_length": 38.65, "epoch": 1.3823251417769375, "format_reward": 0.0, "grad_norm": 14.234587669372559, "image_reward": 0.254736328125, "kl": 4.648911118507385, "learning_rate": 5e-06, "loss": -0.0132, "reward": -0.4629164904356003, "reward_std": 0.8635658169165253, "rewards/reward_func": -0.4629164904356003, "step": 5850, "toxic_reward": 3.804247868061066 }, { "clip_ratio": 0.0, "completion_length": 41.825, "epoch": 1.38468809073724, "format_reward": -0.25, "grad_norm": 9.249091148376465, "image_reward": 0.25889790803194046, "kl": 8.51909922361374, "learning_rate": 5e-06, "loss": 0.0656, "reward": 0.20021165013313294, "reward_std": 1.1463583020493389, "rewards/reward_func": 0.20021165013313294, "step": 5860, "toxic_reward": 4.298932027816773 }, { "clip_ratio": 0.0, "completion_length": 47.2, "epoch": 1.3870510396975426, "format_reward": 0.0, "grad_norm": 2.6423728466033936, "image_reward": 0.2569976806640625, "kl": 114.30452468395234, "learning_rate": 5e-06, "loss": 0.0235, "reward": 0.7682538509368897, "reward_std": 0.9905061937868596, "rewards/reward_func": 0.7682538509368897, "step": 5870, "toxic_reward": 4.355731654167175 }, { "clip_ratio": 0.0, "completion_length": 57.75, "epoch": 1.389413988657845, "format_reward": 0.0, "grad_norm": 6.193624496459961, "image_reward": 0.244573974609375, "kl": 27.17574143409729, "learning_rate": 5e-06, "loss": 0.0466, "reward": 0.4181258499622345, "reward_std": 0.7019964678213, "rewards/reward_func": 0.4181258499622345, "step": 5880, "toxic_reward": 4.30544638633728 }, { "clip_ratio": 0.0, "completion_length": 51.875, "epoch": 1.3917769376181475, "format_reward": 0.0, "grad_norm": 14.119263648986816, "image_reward": 0.24458719789981842, "kl": 22.515019488334655, "learning_rate": 5e-06, "loss": 0.022, "reward": 0.2917088523507118, "reward_std": 0.7304708318784833, "rewards/reward_func": 0.2917088523507118, "step": 5890, "toxic_reward": 4.011698079109192 }, { "clip_ratio": 0.0, "completion_length": 49.95, "epoch": 1.3941398865784498, "format_reward": 0.0, "grad_norm": 203.53358459472656, "image_reward": 0.266705322265625, "kl": 130.7643344759941, "learning_rate": 5e-06, "loss": -0.0925, "reward": 0.8561399459838868, "reward_std": 0.7673989269882441, "rewards/reward_func": 0.8561399459838868, "step": 5900, "toxic_reward": 4.3426886081695555 }, { "clip_ratio": 0.0, "completion_length": 45.25, "epoch": 1.3965028355387523, "format_reward": 0.0, "grad_norm": 1.8555150032043457, "image_reward": 0.26331787109375, "kl": 25.51781210899353, "learning_rate": 5e-06, "loss": -0.0759, "reward": 0.14676390141248702, "reward_std": 0.31099242605268956, "rewards/reward_func": 0.14676390141248702, "step": 5910, "toxic_reward": 4.393804085254669 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 1.3988657844990549, "format_reward": 0.0, "grad_norm": 1.794382095336914, "image_reward": 0.2638519287109375, "kl": 75.26498790383339, "learning_rate": 5e-06, "loss": -0.0105, "reward": 0.0749910295009613, "reward_std": 0.9545040905475617, "rewards/reward_func": 0.0749910295009613, "step": 5920, "toxic_reward": 4.043283843994141 }, { "clip_ratio": 0.0, "completion_length": 43.6, "epoch": 1.4012287334593574, "format_reward": 0.0, "grad_norm": 10.70461654663086, "image_reward": 0.26204325407743456, "kl": 6.424372181296349, "learning_rate": 5e-06, "loss": -0.0192, "reward": 0.3424019992351532, "reward_std": 0.8532586313784123, "rewards/reward_func": 0.3424019992351532, "step": 5930, "toxic_reward": 3.697358027100563 }, { "clip_ratio": 0.0, "completion_length": 47.7, "epoch": 1.4035916824196597, "format_reward": 0.0, "grad_norm": 15.443364143371582, "image_reward": 0.25388997346162795, "kl": 2.7157889783382414, "learning_rate": 5e-06, "loss": 0.0194, "reward": 0.8944644808769227, "reward_std": 0.907353313267231, "rewards/reward_func": 0.8944644808769227, "step": 5940, "toxic_reward": 4.2960577487945555 }, { "clip_ratio": 0.0, "completion_length": 46.625, "epoch": 1.4059546313799622, "format_reward": 0.0, "grad_norm": 9.80057144165039, "image_reward": 0.2643798828125, "kl": 3.2323968172073365, "learning_rate": 5e-06, "loss": -0.0714, "reward": 0.7592375218868256, "reward_std": 1.0486908692866563, "rewards/reward_func": 0.7592375218868256, "step": 5950, "toxic_reward": 4.236204934120178 }, { "clip_ratio": 0.0, "completion_length": 47.45, "epoch": 1.4083175803402646, "format_reward": 0.0, "grad_norm": 32.608253479003906, "image_reward": 0.2897979736328125, "kl": 0.868326199054718, "learning_rate": 5e-06, "loss": 0.0072, "reward": 0.13819260597229005, "reward_std": 0.9927060969173909, "rewards/reward_func": 0.13819260597229005, "step": 5960, "toxic_reward": 4.137164163589477 }, { "clip_ratio": 0.0, "completion_length": 49.875, "epoch": 1.410680529300567, "format_reward": -0.5, "grad_norm": 3.4970862865448, "image_reward": 0.254620361328125, "kl": 1.8707860291004181, "learning_rate": 5e-06, "loss": -0.0251, "reward": -0.12371634542942048, "reward_std": 1.6602010667324065, "rewards/reward_func": -0.12371634542942048, "step": 5970, "toxic_reward": 4.560637950897217 }, { "clip_ratio": 0.0, "completion_length": 47.675, "epoch": 1.4130434782608696, "format_reward": 0.0, "grad_norm": 18.951919555664062, "image_reward": 0.258807373046875, "kl": 1.7996377795934677, "learning_rate": 5e-06, "loss": 0.012, "reward": 0.7075730919837951, "reward_std": 0.9400279764086008, "rewards/reward_func": 0.7075730919837951, "step": 5980, "toxic_reward": 3.758779287338257 }, { "clip_ratio": 0.0, "completion_length": 44.025, "epoch": 1.4154064272211722, "format_reward": 0.0, "grad_norm": 5.1872663497924805, "image_reward": 0.2968048095703125, "kl": 4.290337887406349, "learning_rate": 5e-06, "loss": -0.0076, "reward": 0.3689495801925659, "reward_std": 0.5776140118017793, "rewards/reward_func": 0.3689495801925659, "step": 5990, "toxic_reward": 4.157499670982361 }, { "clip_ratio": 0.0, "completion_length": 50.225, "epoch": 1.4177693761814745, "format_reward": 0.0, "grad_norm": 4.382224082946777, "image_reward": 0.24061279296875, "kl": 3.217728292942047, "learning_rate": 5e-06, "loss": -0.0161, "reward": 0.37828874588012695, "reward_std": 0.3327252045273781, "rewards/reward_func": 0.37828874588012695, "step": 6000, "toxic_reward": 4.625493478775025 }, { "clip_ratio": 0.0, "completion_length": 40.625, "epoch": 1.420132325141777, "format_reward": 0.0, "grad_norm": 17.742074966430664, "image_reward": 0.260565185546875, "kl": 3.087987443804741, "learning_rate": 5e-06, "loss": -0.0098, "reward": 0.22000501453876495, "reward_std": 0.6759357416536659, "rewards/reward_func": 0.22000501453876495, "step": 6010, "toxic_reward": 4.1144504189491276 }, { "clip_ratio": 0.0, "completion_length": 51.725, "epoch": 1.4224952741020793, "format_reward": -0.25, "grad_norm": 23.140647888183594, "image_reward": 0.2770843505859375, "kl": 1.3970532178878785, "learning_rate": 5e-06, "loss": -0.0249, "reward": -0.045973950624465944, "reward_std": 1.367066621594131, "rewards/reward_func": -0.045973950624465944, "step": 6020, "toxic_reward": 4.268449664115906 }, { "clip_ratio": 0.0, "completion_length": 52.125, "epoch": 1.4248582230623819, "format_reward": -0.5, "grad_norm": 63.63026428222656, "image_reward": 0.2500905364751816, "kl": 1.5568452209234238, "learning_rate": 5e-06, "loss": 0.032, "reward": 0.36329651772975924, "reward_std": 2.2665354389697314, "rewards/reward_func": 0.36329651772975924, "step": 6030, "toxic_reward": 3.9466104745864867 }, { "clip_ratio": 0.0, "completion_length": 52.15, "epoch": 1.4272211720226844, "format_reward": -0.25, "grad_norm": 3.4662349224090576, "image_reward": 0.2569427490234375, "kl": 2.127922511100769, "learning_rate": 5e-06, "loss": 0.0478, "reward": 0.4287997782230377, "reward_std": 1.335706689953804, "rewards/reward_func": 0.4287997782230377, "step": 6040, "toxic_reward": 4.417151093482971 }, { "clip_ratio": 0.0, "completion_length": 51.775, "epoch": 1.4295841209829867, "format_reward": 0.0, "grad_norm": 6.905588626861572, "image_reward": 0.2496734619140625, "kl": 2.1467004269361496, "learning_rate": 5e-06, "loss": -0.1361, "reward": 0.6416638314723968, "reward_std": 0.6212250446900726, "rewards/reward_func": 0.6416638314723968, "step": 6050, "toxic_reward": 4.481846666336059 }, { "clip_ratio": 0.0, "completion_length": 44.45, "epoch": 1.4319470699432892, "format_reward": 0.0, "grad_norm": 7.034083843231201, "image_reward": 0.2545166015625, "kl": 1.5230970159173012, "learning_rate": 5e-06, "loss": -0.0407, "reward": 0.29611208438873293, "reward_std": 0.7949410590808839, "rewards/reward_func": 0.29611208438873293, "step": 6060, "toxic_reward": 4.335282778739929 }, { "clip_ratio": 0.0, "completion_length": 45.75, "epoch": 1.4343100189035916, "format_reward": 0.0, "grad_norm": 3.7928450107574463, "image_reward": 0.24527740478515625, "kl": 0.8901469498872757, "learning_rate": 5e-06, "loss": -0.0367, "reward": 0.26578280329704285, "reward_std": 1.3428313750773668, "rewards/reward_func": 0.26578280329704285, "step": 6070, "toxic_reward": 3.388633108139038 }, { "clip_ratio": 0.0, "completion_length": 69.625, "epoch": 1.436672967863894, "format_reward": -0.25, "grad_norm": 19.84122085571289, "image_reward": 0.2548517853021622, "kl": 1.0234291791915893, "learning_rate": 5e-06, "loss": 0.1725, "reward": 0.31002968549728394, "reward_std": 1.8640546321868896, "rewards/reward_func": 0.31002968549728394, "step": 6080, "toxic_reward": 3.8593465805053713 }, { "clip_ratio": 0.0, "completion_length": 50.6, "epoch": 1.4390359168241966, "format_reward": -0.25, "grad_norm": 10.626410484313965, "image_reward": 0.2676523834466934, "kl": 5.208069609105587, "learning_rate": 5e-06, "loss": 0.0794, "reward": 0.2428498387336731, "reward_std": 1.3197494292631746, "rewards/reward_func": 0.2428498387336731, "step": 6090, "toxic_reward": 4.562512469291687 }, { "clip_ratio": 0.0, "completion_length": 43.8, "epoch": 1.4413988657844992, "format_reward": 0.0, "grad_norm": 11.22333812713623, "image_reward": 0.2790537506341934, "kl": 4.089116859436035, "learning_rate": 5e-06, "loss": 0.0053, "reward": 1.0973919004201889, "reward_std": 0.9867459360510111, "rewards/reward_func": 1.0973919004201889, "step": 6100, "toxic_reward": 4.121850895881653 }, { "clip_ratio": 0.0, "completion_length": 40.675, "epoch": 1.4437618147448015, "format_reward": 0.0, "grad_norm": 2.594348907470703, "image_reward": 0.24527740478515625, "kl": 2.355099043250084, "learning_rate": 5e-06, "loss": 0.0302, "reward": 0.2879053592681885, "reward_std": 1.3369514867663383, "rewards/reward_func": 0.2879053592681885, "step": 6110, "toxic_reward": 3.482189404964447 }, { "clip_ratio": 0.0, "completion_length": 43.625, "epoch": 1.446124763705104, "format_reward": 0.0, "grad_norm": 2.740832805633545, "image_reward": 0.27071533203125, "kl": 1.8061291784048081, "learning_rate": 5e-06, "loss": 0.098, "reward": 0.3818982481956482, "reward_std": 0.8427915960550308, "rewards/reward_func": 0.3818982481956482, "step": 6120, "toxic_reward": 4.095608282089233 }, { "clip_ratio": 0.0, "completion_length": 45.05, "epoch": 1.4484877126654063, "format_reward": 0.0, "grad_norm": 16.700410842895508, "image_reward": 0.2543426513671875, "kl": 1.4419916868209839, "learning_rate": 5e-06, "loss": -0.056, "reward": 0.8265678405761718, "reward_std": 0.835081409662962, "rewards/reward_func": 0.8265678405761718, "step": 6130, "toxic_reward": 4.317971038818359 }, { "clip_ratio": 0.0, "completion_length": 39.75, "epoch": 1.4508506616257089, "format_reward": 0.0, "grad_norm": 5.467940330505371, "image_reward": 0.253302001953125, "kl": 1.128901758790016, "learning_rate": 5e-06, "loss": -0.0416, "reward": 0.22405808568000793, "reward_std": 0.430261270259507, "rewards/reward_func": 0.22405808568000793, "step": 6140, "toxic_reward": 4.605859112739563 }, { "clip_ratio": 0.0, "completion_length": 43.7, "epoch": 1.4532136105860114, "format_reward": 0.0, "grad_norm": 15.90230941772461, "image_reward": 0.261627197265625, "kl": 0.6474134013056755, "learning_rate": 5e-06, "loss": -0.0453, "reward": 0.23209627866744995, "reward_std": 0.9918515108525753, "rewards/reward_func": 0.23209627866744995, "step": 6150, "toxic_reward": 4.06419689655304 }, { "clip_ratio": 0.0, "completion_length": 52.85, "epoch": 1.455576559546314, "format_reward": 0.0, "grad_norm": 14.443000793457031, "image_reward": 0.2465087890625, "kl": 0.6866413161158562, "learning_rate": 5e-06, "loss": -0.0889, "reward": 0.37001847475767136, "reward_std": 0.7742633601650596, "rewards/reward_func": 0.37001847475767136, "step": 6160, "toxic_reward": 4.076632690429688 }, { "clip_ratio": 0.0, "completion_length": 46.775, "epoch": 1.4579395085066162, "format_reward": -0.25, "grad_norm": 16.315828323364258, "image_reward": 0.27055562287569046, "kl": 1.8430037647485733, "learning_rate": 5e-06, "loss": -0.005, "reward": 0.37416398525238037, "reward_std": 1.238390678167343, "rewards/reward_func": 0.37416398525238037, "step": 6170, "toxic_reward": 3.7685179471969605 }, { "clip_ratio": 0.0, "completion_length": 51.725, "epoch": 1.4603024574669188, "format_reward": -0.25, "grad_norm": 9.808785438537598, "image_reward": 0.26315511018037796, "kl": 3.501179130375385, "learning_rate": 5e-06, "loss": -0.0757, "reward": 0.4839250385761261, "reward_std": 1.0200102254748344, "rewards/reward_func": 0.4839250385761261, "step": 6180, "toxic_reward": 4.458306789398193 }, { "clip_ratio": 0.0, "completion_length": 45.1, "epoch": 1.462665406427221, "format_reward": 0.0, "grad_norm": 6.254196643829346, "image_reward": 0.2431121826171875, "kl": 0.6629411533474923, "learning_rate": 5e-06, "loss": -0.0243, "reward": 1.0291070997714997, "reward_std": 0.6561918726190925, "rewards/reward_func": 1.0291070997714997, "step": 6190, "toxic_reward": 4.277180218696595 }, { "clip_ratio": 0.0, "completion_length": 41.075, "epoch": 1.4650283553875236, "format_reward": 0.0, "grad_norm": 4.544678211212158, "image_reward": 0.288299560546875, "kl": 0.5533515185117721, "learning_rate": 5e-06, "loss": 0.0114, "reward": 0.09159567654132843, "reward_std": 0.6166084105148911, "rewards/reward_func": 0.09159567654132843, "step": 6200, "toxic_reward": 4.60030083656311 }, { "clip_ratio": 0.0, "completion_length": 49.75, "epoch": 1.4673913043478262, "format_reward": 0.0, "grad_norm": 2.3730123043060303, "image_reward": 0.2605743408203125, "kl": 0.9568765789270401, "learning_rate": 5e-06, "loss": 0.0904, "reward": 0.7965957373380661, "reward_std": 0.7220977865159511, "rewards/reward_func": 0.7965957373380661, "step": 6210, "toxic_reward": 3.7931410372257233 }, { "clip_ratio": 0.0, "completion_length": 43.05, "epoch": 1.4697542533081285, "format_reward": 0.0, "grad_norm": 9.046610832214355, "image_reward": 0.25125885009765625, "kl": 0.966689832508564, "learning_rate": 5e-06, "loss": 0.006, "reward": 0.42708381414413454, "reward_std": 0.8918632004410029, "rewards/reward_func": 0.42708381414413454, "step": 6220, "toxic_reward": 3.811506199836731 }, { "clip_ratio": 0.0, "completion_length": 41.225, "epoch": 1.472117202268431, "format_reward": -0.25, "grad_norm": 11.783949851989746, "image_reward": 0.2802093505859375, "kl": 1.5058857083320618, "learning_rate": 5e-06, "loss": 0.1229, "reward": -0.2064610540866852, "reward_std": 1.2439106579869985, "rewards/reward_func": -0.2064610540866852, "step": 6230, "toxic_reward": 3.776739251613617 }, { "clip_ratio": 0.0, "completion_length": 51.55, "epoch": 1.4744801512287333, "format_reward": -0.25, "grad_norm": 19.82095718383789, "image_reward": 0.2695220947265625, "kl": 4.01448056101799, "learning_rate": 5e-06, "loss": 0.0224, "reward": 0.12153833210468293, "reward_std": 1.4096519321203231, "rewards/reward_func": 0.12153833210468293, "step": 6240, "toxic_reward": 3.5601022720336912 }, { "clip_ratio": 0.0, "completion_length": 53.575, "epoch": 1.4768431001890359, "format_reward": 0.0, "grad_norm": 3.002202272415161, "image_reward": 0.2589111328125, "kl": 6.391136825084686, "learning_rate": 5e-06, "loss": 0.0246, "reward": 0.8984602272510529, "reward_std": 0.8823632273823023, "rewards/reward_func": 0.8984602272510529, "step": 6250, "toxic_reward": 4.186536240577698 }, { "clip_ratio": 0.0, "completion_length": 41.95, "epoch": 1.4792060491493384, "format_reward": -0.25, "grad_norm": 6.6309990882873535, "image_reward": 0.25638376772403715, "kl": 5.093664228916168, "learning_rate": 5e-06, "loss": -0.1168, "reward": 0.7955404102802277, "reward_std": 1.139578907750547, "rewards/reward_func": 0.7955404102802277, "step": 6260, "toxic_reward": 4.489496183395386 }, { "clip_ratio": 0.0, "completion_length": 36.475, "epoch": 1.481568998109641, "format_reward": -0.25, "grad_norm": 6.48809289932251, "image_reward": 0.24294535368680953, "kl": 2.03928547501564, "learning_rate": 5e-06, "loss": -0.0415, "reward": 0.3108412384986877, "reward_std": 1.2936087466776371, "rewards/reward_func": 0.3108412384986877, "step": 6270, "toxic_reward": 3.8573724269866942 }, { "clip_ratio": 0.0, "completion_length": 44.025, "epoch": 1.4839319470699432, "format_reward": 0.0, "grad_norm": 12.869200706481934, "image_reward": 0.269140625, "kl": 2.209321880340576, "learning_rate": 5e-06, "loss": 0.0065, "reward": 1.0746480822563171, "reward_std": 0.9140975341200829, "rewards/reward_func": 1.0746480822563171, "step": 6280, "toxic_reward": 4.498701477050782 }, { "clip_ratio": 0.0, "completion_length": 52.6, "epoch": 1.4862948960302458, "format_reward": -0.25, "grad_norm": 14.994149208068848, "image_reward": 0.2699289947748184, "kl": 1.982865560054779, "learning_rate": 5e-06, "loss": -0.0155, "reward": -0.04395916759967804, "reward_std": 1.351198247075081, "rewards/reward_func": -0.04395916759967804, "step": 6290, "toxic_reward": 4.100183129310608 }, { "clip_ratio": 0.0, "completion_length": 48.525, "epoch": 1.488657844990548, "format_reward": 0.0, "grad_norm": 13.48897647857666, "image_reward": 0.2738677978515625, "kl": 73.90320363342762, "learning_rate": 5e-06, "loss": 0.0515, "reward": 0.2645439743995667, "reward_std": 0.8113596703857183, "rewards/reward_func": 0.2645439743995667, "step": 6300, "toxic_reward": 4.189095830917358 }, { "clip_ratio": 0.0, "completion_length": 39.85, "epoch": 1.4910207939508506, "format_reward": -0.25, "grad_norm": 3.184455394744873, "image_reward": 0.2667582184076309, "kl": 1.9667763262987137, "learning_rate": 5e-06, "loss": -0.0049, "reward": 0.582335239648819, "reward_std": 1.003530977293849, "rewards/reward_func": 0.582335239648819, "step": 6310, "toxic_reward": 4.428107571601868 }, { "clip_ratio": 0.0, "completion_length": 43.475, "epoch": 1.4933837429111532, "format_reward": 0.0, "grad_norm": 16.498891830444336, "image_reward": 0.26070556640625, "kl": 2.5642897844314576, "learning_rate": 5e-06, "loss": -0.0706, "reward": 0.8335122138261795, "reward_std": 0.8066307563334704, "rewards/reward_func": 0.8335122138261795, "step": 6320, "toxic_reward": 3.9580556988716125 }, { "clip_ratio": 0.0, "completion_length": 49.875, "epoch": 1.4957466918714557, "format_reward": -0.25, "grad_norm": 10.699289321899414, "image_reward": 0.2347137451171875, "kl": 3.54896736741066, "learning_rate": 5e-06, "loss": 0.004, "reward": -0.006625699996948242, "reward_std": 1.1276695830747485, "rewards/reward_func": -0.006625699996948242, "step": 6330, "toxic_reward": 4.5597028732299805 }, { "clip_ratio": 0.0, "completion_length": 44.2, "epoch": 1.498109640831758, "format_reward": 0.0, "grad_norm": 25.74764633178711, "image_reward": 0.2565399169921875, "kl": 3.312129205465317, "learning_rate": 5e-06, "loss": -0.0109, "reward": 0.974502682685852, "reward_std": 0.8662806877866387, "rewards/reward_func": 0.974502682685852, "step": 6340, "toxic_reward": 4.394499397277832 }, { "clip_ratio": 0.0, "completion_length": 40.75, "epoch": 1.5004725897920603, "format_reward": -0.25, "grad_norm": 7.920969009399414, "image_reward": 0.25830586850643156, "kl": 4.049802941083908, "learning_rate": 5e-06, "loss": 0.1152, "reward": 0.04905744194984436, "reward_std": 1.192653514072299, "rewards/reward_func": 0.04905744194984436, "step": 6350, "toxic_reward": 4.2746446371078495 }, { "clip_ratio": 0.0, "completion_length": 54.45, "epoch": 1.5028355387523629, "format_reward": 0.0, "grad_norm": 10.33079719543457, "image_reward": 0.26002349853515627, "kl": 5.124299117922783, "learning_rate": 5e-06, "loss": 0.0575, "reward": 0.20074379444122314, "reward_std": 0.825444309413433, "rewards/reward_func": 0.20074379444122314, "step": 6360, "toxic_reward": 3.7550126791000364 }, { "clip_ratio": 0.0, "completion_length": 45.8, "epoch": 1.5051984877126654, "format_reward": 0.0, "grad_norm": 8.266423225402832, "image_reward": 0.262652587890625, "kl": 13.647933864593506, "learning_rate": 5e-06, "loss": -0.0262, "reward": 0.6701415419578552, "reward_std": 0.6827380709350109, "rewards/reward_func": 0.6701415419578552, "step": 6370, "toxic_reward": 4.507234740257263 }, { "clip_ratio": 0.0, "completion_length": 52.55, "epoch": 1.507561436672968, "format_reward": -0.25, "grad_norm": 16.820531845092773, "image_reward": 0.2648305267095566, "kl": 2943.323862874508, "learning_rate": 5e-06, "loss": 0.3168, "reward": -0.18676466941833497, "reward_std": 1.4769982114434241, "rewards/reward_func": -0.18676466941833497, "step": 6380, "toxic_reward": 3.6400262832641603 }, { "clip_ratio": 0.0, "completion_length": 45.85, "epoch": 1.5099243856332705, "format_reward": 0.0, "grad_norm": 22.587871551513672, "image_reward": 0.2433380126953125, "kl": 1.810417714715004, "learning_rate": 5e-06, "loss": 0.0137, "reward": 0.35521286725997925, "reward_std": 0.5707202635705471, "rewards/reward_func": 0.35521286725997925, "step": 6390, "toxic_reward": 4.4419690608978275 }, { "clip_ratio": 0.0, "completion_length": 45.85, "epoch": 1.5122873345935728, "format_reward": 0.0, "grad_norm": 7.745183944702148, "image_reward": 0.253375244140625, "kl": 1.7831827580928803, "learning_rate": 5e-06, "loss": 0.0178, "reward": -0.09840984344482422, "reward_std": 0.7997165352106095, "rewards/reward_func": -0.09840984344482422, "step": 6400, "toxic_reward": 3.674038052558899 }, { "clip_ratio": 0.0, "completion_length": 46.025, "epoch": 1.514650283553875, "format_reward": 0.0, "grad_norm": 6.026752948760986, "image_reward": 0.249884033203125, "kl": 4.073290675878525, "learning_rate": 5e-06, "loss": 0.0349, "reward": 0.7357653975486755, "reward_std": 1.102572639286518, "rewards/reward_func": 0.7357653975486755, "step": 6410, "toxic_reward": 4.154096102714538 }, { "clip_ratio": 0.0, "completion_length": 42.5, "epoch": 1.5170132325141776, "format_reward": 0.0, "grad_norm": 2.0699822902679443, "image_reward": 0.2804595947265625, "kl": 1.0494691252708435, "learning_rate": 5e-06, "loss": -0.0105, "reward": 0.3559255480766296, "reward_std": 0.9502544086426497, "rewards/reward_func": 0.3559255480766296, "step": 6420, "toxic_reward": 3.829944038391113 }, { "clip_ratio": 0.0, "completion_length": 50.725, "epoch": 1.5193761814744802, "format_reward": -0.25, "grad_norm": 2.6939737796783447, "image_reward": 0.24329833984375, "kl": 2.4543985188007356, "learning_rate": 5e-06, "loss": 0.0488, "reward": -0.22206905484199524, "reward_std": 1.1714405838400126, "rewards/reward_func": -0.22206905484199524, "step": 6430, "toxic_reward": 4.02121376991272 }, { "clip_ratio": 0.0, "completion_length": 41.8, "epoch": 1.5217391304347827, "format_reward": 0.0, "grad_norm": 7.930452823638916, "image_reward": 0.270928955078125, "kl": 2.3063239082694054, "learning_rate": 5e-06, "loss": 0.0354, "reward": -0.090572190284729, "reward_std": 0.8380892558023334, "rewards/reward_func": -0.090572190284729, "step": 6440, "toxic_reward": 4.0341674268245695 }, { "clip_ratio": 0.0, "completion_length": 45.675, "epoch": 1.524102079395085, "format_reward": -0.25, "grad_norm": 11.180920600891113, "image_reward": 0.26758829653263094, "kl": 1.2753668040037156, "learning_rate": 5e-06, "loss": -0.0044, "reward": 0.8226781934499741, "reward_std": 1.3871233612298965, "rewards/reward_func": 0.8226781934499741, "step": 6450, "toxic_reward": 3.5244659066200255 }, { "clip_ratio": 0.0, "completion_length": 49.75, "epoch": 1.5264650283553876, "format_reward": -0.25, "grad_norm": 21.079256057739258, "image_reward": 0.24825642853975297, "kl": 0.9987513780593872, "learning_rate": 5e-06, "loss": -0.0776, "reward": 0.38041144609451294, "reward_std": 1.5831992760300637, "rewards/reward_func": 0.38041144609451294, "step": 6460, "toxic_reward": 4.319032979011536 }, { "clip_ratio": 0.0, "completion_length": 38.5, "epoch": 1.5288279773156899, "format_reward": -0.25, "grad_norm": 6.294722557067871, "image_reward": 0.23506622314453124, "kl": 1.3047454893589019, "learning_rate": 5e-06, "loss": -0.037, "reward": 0.483357185125351, "reward_std": 1.4818070188164711, "rewards/reward_func": 0.483357185125351, "step": 6470, "toxic_reward": 4.28910231590271 }, { "clip_ratio": 0.0, "completion_length": 44.35, "epoch": 1.5311909262759924, "format_reward": 0.0, "grad_norm": 25.183557510375977, "image_reward": 0.2621653228998184, "kl": 5.334516155719757, "learning_rate": 5e-06, "loss": 0.184, "reward": 0.5027847826480866, "reward_std": 0.6526452742516995, "rewards/reward_func": 0.5027847826480866, "step": 6480, "toxic_reward": 4.249637746810913 }, { "clip_ratio": 0.0, "completion_length": 37.825, "epoch": 1.533553875236295, "format_reward": 0.0, "grad_norm": 10.60496997833252, "image_reward": 0.26144917905330656, "kl": 3.185924381017685, "learning_rate": 5e-06, "loss": -0.0162, "reward": 0.6137366682291031, "reward_std": 0.5949534647166729, "rewards/reward_func": 0.6137366682291031, "step": 6490, "toxic_reward": 4.377641320228577 }, { "clip_ratio": 0.0, "completion_length": 36.975, "epoch": 1.5359168241965975, "format_reward": 0.0, "grad_norm": 5.117782115936279, "image_reward": 0.2869781494140625, "kl": 0.5223678901791573, "learning_rate": 5e-06, "loss": 0.0356, "reward": 0.20925453603267669, "reward_std": 0.5696444906294346, "rewards/reward_func": 0.20925453603267669, "step": 6500, "toxic_reward": 4.292421555519104 }, { "clip_ratio": 0.0, "completion_length": 47.7, "epoch": 1.5382797731568998, "format_reward": 0.0, "grad_norm": 2.3820912837982178, "image_reward": 0.250634765625, "kl": 1.4595504850149155, "learning_rate": 5e-06, "loss": 0.02, "reward": 0.719105675816536, "reward_std": 0.9932258397340774, "rewards/reward_func": 0.719105675816536, "step": 6510, "toxic_reward": 4.147652292251587 }, { "clip_ratio": 0.0, "completion_length": 49.65, "epoch": 1.5406427221172023, "format_reward": 0.0, "grad_norm": 10.115738868713379, "image_reward": 0.2735626220703125, "kl": 2.502386949956417, "learning_rate": 5e-06, "loss": 0.0075, "reward": 0.229107666015625, "reward_std": 0.8675182597711683, "rewards/reward_func": 0.229107666015625, "step": 6520, "toxic_reward": 3.437857782840729 }, { "clip_ratio": 0.0, "completion_length": 39.975, "epoch": 1.5430056710775046, "format_reward": 0.0, "grad_norm": 12.080828666687012, "image_reward": 0.26392364501953125, "kl": 1.0831295281648636, "learning_rate": 5e-06, "loss": 0.0655, "reward": 0.5672924667596817, "reward_std": 1.0283904120326042, "rewards/reward_func": 0.5672924667596817, "step": 6530, "toxic_reward": 4.013876247406006 }, { "clip_ratio": 0.0, "completion_length": 52.825, "epoch": 1.5453686200378072, "format_reward": -0.25, "grad_norm": 4.358407497406006, "image_reward": 0.2578287750482559, "kl": 0.8826712548732758, "learning_rate": 5e-06, "loss": 0.0534, "reward": 0.1970734715461731, "reward_std": 1.158833772689104, "rewards/reward_func": 0.1970734715461731, "step": 6540, "toxic_reward": 4.346637082099915 }, { "clip_ratio": 0.0, "completion_length": 39.85, "epoch": 1.5477315689981097, "format_reward": 0.0, "grad_norm": 1.7384082078933716, "image_reward": 0.245843505859375, "kl": 1.1669968128204347, "learning_rate": 5e-06, "loss": 0.1047, "reward": 0.4790124922990799, "reward_std": 0.8679840985685587, "rewards/reward_func": 0.4790124922990799, "step": 6550, "toxic_reward": 4.215170729160309 }, { "clip_ratio": 0.0, "completion_length": 57.025, "epoch": 1.5500945179584122, "format_reward": -0.25, "grad_norm": 19.237064361572266, "image_reward": 0.2605519607663155, "kl": 22.883435368537903, "learning_rate": 5e-06, "loss": 0.0861, "reward": 0.3663973331451416, "reward_std": 1.3235621018335224, "rewards/reward_func": 0.3663973331451416, "step": 6560, "toxic_reward": 4.399778747558594 }, { "clip_ratio": 0.0, "completion_length": 44.5, "epoch": 1.5524574669187146, "format_reward": 0.0, "grad_norm": 12.169953346252441, "image_reward": 0.240032958984375, "kl": 2.908788651227951, "learning_rate": 5e-06, "loss": -0.0407, "reward": 0.6662415623664856, "reward_std": 0.9049512568861247, "rewards/reward_func": 0.6662415623664856, "step": 6570, "toxic_reward": 4.262545752525329 }, { "clip_ratio": 0.0, "completion_length": 47.225, "epoch": 1.5548204158790169, "format_reward": 0.0, "grad_norm": 14.397954940795898, "image_reward": 0.2709014892578125, "kl": 1.8459113836288452, "learning_rate": 5e-06, "loss": 0.0948, "reward": 0.4346597075462341, "reward_std": 1.024691704288125, "rewards/reward_func": 0.4346597075462341, "step": 6580, "toxic_reward": 3.837217903137207 }, { "clip_ratio": 0.0, "completion_length": 50.25, "epoch": 1.5571833648393194, "format_reward": 0.0, "grad_norm": 5.823775768280029, "image_reward": 0.2421417236328125, "kl": 361.87849075496194, "learning_rate": 5e-06, "loss": 0.1249, "reward": 0.7188747763633728, "reward_std": 0.7972227469086647, "rewards/reward_func": 0.7188747763633728, "step": 6590, "toxic_reward": 4.341918230056763 }, { "clip_ratio": 0.0, "completion_length": 47.025, "epoch": 1.559546313799622, "format_reward": 0.0, "grad_norm": 10.157111167907715, "image_reward": 0.2904388427734375, "kl": 2.8333058834075926, "learning_rate": 5e-06, "loss": -0.0125, "reward": 0.1938968062400818, "reward_std": 0.5627395014278591, "rewards/reward_func": 0.1938968062400818, "step": 6600, "toxic_reward": 4.045072281360627 }, { "clip_ratio": 0.0, "completion_length": 39.875, "epoch": 1.5619092627599245, "format_reward": -0.5, "grad_norm": 7.558686256408691, "image_reward": 0.28143310397863386, "kl": 5.3599341928958895, "learning_rate": 5e-06, "loss": -0.0333, "reward": 0.3363319247961044, "reward_std": 1.8769858199171723, "rewards/reward_func": 0.3363319247961044, "step": 6610, "toxic_reward": 3.4191954016685484 }, { "clip_ratio": 0.0, "completion_length": 36.3, "epoch": 1.5642722117202268, "format_reward": 0.0, "grad_norm": 6.938777446746826, "image_reward": 0.2576054885983467, "kl": 3.9582558915019037, "learning_rate": 5e-06, "loss": -0.0771, "reward": 0.785160881280899, "reward_std": 1.3131701787933707, "rewards/reward_func": 0.785160881280899, "step": 6620, "toxic_reward": 3.997890996932983 }, { "clip_ratio": 0.0, "completion_length": 52.075, "epoch": 1.5666351606805293, "format_reward": 0.0, "grad_norm": 7.617331027984619, "image_reward": 0.2779083251953125, "kl": 2.7307909965515136, "learning_rate": 5e-06, "loss": 0.052, "reward": 0.6236707329750061, "reward_std": 0.8986939422786235, "rewards/reward_func": 0.6236707329750061, "step": 6630, "toxic_reward": 4.061939382553101 }, { "clip_ratio": 0.0, "completion_length": 51.1, "epoch": 1.5689981096408316, "format_reward": -0.5, "grad_norm": 11.394710540771484, "image_reward": 0.27598063051700594, "kl": 4.903402748703956, "learning_rate": 5e-06, "loss": 0.0366, "reward": 0.095058873295784, "reward_std": 1.6800902128219604, "rewards/reward_func": 0.095058873295784, "step": 6640, "toxic_reward": 3.813139808177948 }, { "clip_ratio": 0.0, "completion_length": 59.8, "epoch": 1.5713610586011342, "format_reward": 0.0, "grad_norm": 3.759659767150879, "image_reward": 0.23873443603515626, "kl": 7.589515461027622, "learning_rate": 5e-06, "loss": -0.0834, "reward": 0.44450428485870364, "reward_std": 0.5481153151020408, "rewards/reward_func": 0.44450428485870364, "step": 6650, "toxic_reward": 3.8413244128227233 }, { "clip_ratio": 0.0, "completion_length": 51.3, "epoch": 1.5737240075614367, "format_reward": 0.0, "grad_norm": 13.018331527709961, "image_reward": 0.2600331619381905, "kl": 3.1137637734413146, "learning_rate": 5e-06, "loss": 0.0214, "reward": 0.7498049587011337, "reward_std": 0.7316715233027935, "rewards/reward_func": 0.7498049587011337, "step": 6660, "toxic_reward": 4.240022134780884 }, { "clip_ratio": 0.0, "completion_length": 49.575, "epoch": 1.5760869565217392, "format_reward": 0.0, "grad_norm": 7.951615333557129, "image_reward": 0.2636627197265625, "kl": 0.875077161192894, "learning_rate": 5e-06, "loss": -0.1023, "reward": 0.6656131267547607, "reward_std": 0.6067664973437786, "rewards/reward_func": 0.6656131267547607, "step": 6670, "toxic_reward": 4.546977305412293 }, { "clip_ratio": 0.0, "completion_length": 40.725, "epoch": 1.5784499054820416, "format_reward": 0.0, "grad_norm": 4.168910026550293, "image_reward": 0.24169108122587205, "kl": 4.073548844456672, "learning_rate": 5e-06, "loss": -0.0498, "reward": 0.5430697202682495, "reward_std": 1.2346604462713002, "rewards/reward_func": 0.5430697202682495, "step": 6680, "toxic_reward": 3.895166778564453 }, { "clip_ratio": 0.0, "completion_length": 53.275, "epoch": 1.580812854442344, "format_reward": 0.0, "grad_norm": 2.9899535179138184, "image_reward": 0.23625640869140624, "kl": 6.596899893879891, "learning_rate": 5e-06, "loss": 0.0148, "reward": 0.3857423186302185, "reward_std": 0.7419607482850552, "rewards/reward_func": 0.3857423186302185, "step": 6690, "toxic_reward": 4.416726422309876 }, { "clip_ratio": 0.0, "completion_length": 40.475, "epoch": 1.5831758034026464, "format_reward": 0.0, "grad_norm": 17.505062103271484, "image_reward": 0.24977264404296876, "kl": 130.44692096710205, "learning_rate": 5e-06, "loss": 0.0063, "reward": 0.8495797365903854, "reward_std": 1.0700383991003037, "rewards/reward_func": 0.8495797365903854, "step": 6700, "toxic_reward": 3.7804057955741883 }, { "clip_ratio": 0.0, "completion_length": 46.25, "epoch": 1.585538752362949, "format_reward": -0.25, "grad_norm": 9.865876197814941, "image_reward": 0.2743357330560684, "kl": 1.342175406217575, "learning_rate": 5e-06, "loss": 0.0144, "reward": -0.01569686532020569, "reward_std": 1.0583332434296608, "rewards/reward_func": -0.01569686532020569, "step": 6710, "toxic_reward": 4.605420160293579 }, { "clip_ratio": 0.0, "completion_length": 46.65, "epoch": 1.5879017013232515, "format_reward": -0.25, "grad_norm": 1.4694126844406128, "image_reward": 0.26331074982881547, "kl": 1.179875871539116, "learning_rate": 5e-06, "loss": 0.0439, "reward": 0.19556427299976348, "reward_std": 1.060202201642096, "rewards/reward_func": 0.19556427299976348, "step": 6720, "toxic_reward": 4.195696997642517 }, { "clip_ratio": 0.0, "completion_length": 41.75, "epoch": 1.590264650283554, "format_reward": 0.0, "grad_norm": 14.831318855285645, "image_reward": 0.26193084716796877, "kl": 3.8945027977228164, "learning_rate": 5e-06, "loss": -0.0405, "reward": 0.8553763270378113, "reward_std": 0.7129356294870377, "rewards/reward_func": 0.8553763270378113, "step": 6730, "toxic_reward": 4.33803927898407 }, { "clip_ratio": 0.0, "completion_length": 56.475, "epoch": 1.5926275992438563, "format_reward": -0.25, "grad_norm": 19.41834831237793, "image_reward": 0.25118509978055953, "kl": 3.402638703584671, "learning_rate": 5e-06, "loss": 0.0046, "reward": 0.5174610838294029, "reward_std": 1.2720857471227647, "rewards/reward_func": 0.5174610838294029, "step": 6740, "toxic_reward": 3.870224565267563 }, { "clip_ratio": 0.0, "completion_length": 48.025, "epoch": 1.5949905482041586, "format_reward": 0.0, "grad_norm": 6.40997838973999, "image_reward": 0.258154296875, "kl": 1.6460766345262527, "learning_rate": 5e-06, "loss": 0.0986, "reward": 1.0767779767513275, "reward_std": 1.5294719189405441, "rewards/reward_func": 1.0767779767513275, "step": 6750, "toxic_reward": 3.531558334827423 }, { "clip_ratio": 0.0, "completion_length": 41.7, "epoch": 1.5973534971644612, "format_reward": -0.5, "grad_norm": 30.405113220214844, "image_reward": 0.25986429750919343, "kl": 3.0854232251644134, "learning_rate": 5e-06, "loss": -0.0433, "reward": 0.2282954216003418, "reward_std": 2.2074968218803406, "rewards/reward_func": 0.2282954216003418, "step": 6760, "toxic_reward": 4.195013093948364 }, { "clip_ratio": 0.0, "completion_length": 36.3, "epoch": 1.5997164461247637, "format_reward": -0.25, "grad_norm": 7.383143424987793, "image_reward": 0.26789347380399703, "kl": 3.230149340629578, "learning_rate": 5e-06, "loss": -0.0221, "reward": 0.44291332364082336, "reward_std": 1.4428741056472063, "rewards/reward_func": 0.44291332364082336, "step": 6770, "toxic_reward": 3.8504308581352236 }, { "clip_ratio": 0.0, "completion_length": 49.925, "epoch": 1.6020793950850662, "format_reward": 0.0, "grad_norm": 6.7697906494140625, "image_reward": 0.2436859130859375, "kl": 3.0731608659029006, "learning_rate": 5e-06, "loss": -0.0282, "reward": 0.9663064420223236, "reward_std": 0.8580235980451107, "rewards/reward_func": 0.9663064420223236, "step": 6780, "toxic_reward": 3.754929578304291 }, { "clip_ratio": 0.0, "completion_length": 43.725, "epoch": 1.6044423440453688, "format_reward": 0.0, "grad_norm": 3.5446696281433105, "image_reward": 0.260052490234375, "kl": 763.1712962627411, "learning_rate": 5e-06, "loss": 0.0939, "reward": 0.7344351947307587, "reward_std": 0.6753151521086693, "rewards/reward_func": 0.7344351947307587, "step": 6790, "toxic_reward": 4.139233088493347 }, { "clip_ratio": 0.0, "completion_length": 44.6, "epoch": 1.606805293005671, "format_reward": 0.0, "grad_norm": 21.1129093170166, "image_reward": 0.27585601806640625, "kl": 2.870541882514954, "learning_rate": 5e-06, "loss": -0.0466, "reward": 0.5618703544139863, "reward_std": 0.8203244937583805, "rewards/reward_func": 0.5618703544139863, "step": 6800, "toxic_reward": 4.3937297582626345 }, { "clip_ratio": 0.0, "completion_length": 57.125, "epoch": 1.6091682419659734, "format_reward": 0.0, "grad_norm": 5.104684352874756, "image_reward": 0.280615234375, "kl": 3.3239043831825255, "learning_rate": 5e-06, "loss": 0.045, "reward": 0.7852797448635102, "reward_std": 0.610455094370991, "rewards/reward_func": 0.7852797448635102, "step": 6810, "toxic_reward": 4.593313884735108 }, { "clip_ratio": 0.0, "completion_length": 45.725, "epoch": 1.611531190926276, "format_reward": 0.0, "grad_norm": 2.496898889541626, "image_reward": 0.2752532958984375, "kl": 2.6775636196136476, "learning_rate": 5e-06, "loss": -0.012, "reward": 0.7639135122299194, "reward_std": 0.9162261974066496, "rewards/reward_func": 0.7639135122299194, "step": 6820, "toxic_reward": 4.146224117279052 }, { "clip_ratio": 0.0, "completion_length": 42.075, "epoch": 1.6138941398865785, "format_reward": 0.0, "grad_norm": 6.608152866363525, "image_reward": 0.267572021484375, "kl": 3.533026337623596, "learning_rate": 5e-06, "loss": 0.0572, "reward": 0.44893051087856295, "reward_std": 0.9419144628569484, "rewards/reward_func": 0.44893051087856295, "step": 6830, "toxic_reward": 4.1230400681495665 }, { "clip_ratio": 0.0, "completion_length": 54.925, "epoch": 1.616257088846881, "format_reward": 0.0, "grad_norm": 11.28681755065918, "image_reward": 0.25560302734375, "kl": 7.369207835197448, "learning_rate": 5e-06, "loss": 0.0152, "reward": 0.5668485701084137, "reward_std": 0.749977857619524, "rewards/reward_func": 0.5668485701084137, "step": 6840, "toxic_reward": 4.343744564056396 }, { "clip_ratio": 0.0, "completion_length": 45.9, "epoch": 1.6186200378071833, "format_reward": -0.5, "grad_norm": 28.541820526123047, "image_reward": 0.2723876953125, "kl": 5.971449375152588, "learning_rate": 5e-06, "loss": -0.0574, "reward": -0.5758611798286438, "reward_std": 1.3836607769131661, "rewards/reward_func": -0.5758611798286438, "step": 6850, "toxic_reward": 4.467629170417785 }, { "clip_ratio": 0.0, "completion_length": 37.1, "epoch": 1.6209829867674859, "format_reward": 0.0, "grad_norm": 12.005922317504883, "image_reward": 0.2653228759765625, "kl": 6.38155357837677, "learning_rate": 5e-06, "loss": 0.0537, "reward": 0.6645367026329041, "reward_std": 0.6022280365228653, "rewards/reward_func": 0.6645367026329041, "step": 6860, "toxic_reward": 4.12990357875824 }, { "clip_ratio": 0.0, "completion_length": 50.175, "epoch": 1.6233459357277882, "format_reward": 0.0, "grad_norm": 4.00104284286499, "image_reward": 0.26532745361328125, "kl": 6.140709114074707, "learning_rate": 5e-06, "loss": 0.0866, "reward": 0.5954837799072266, "reward_std": 0.9484383892267942, "rewards/reward_func": 0.5954837799072266, "step": 6870, "toxic_reward": 4.030814599990845 }, { "clip_ratio": 0.0, "completion_length": 53.2, "epoch": 1.6257088846880907, "format_reward": 0.0, "grad_norm": 7.69809627532959, "image_reward": 0.2781280517578125, "kl": 8.641590279340743, "learning_rate": 5e-06, "loss": 0.0299, "reward": 0.35074634552001954, "reward_std": 0.5954089154489338, "rewards/reward_func": 0.35074634552001954, "step": 6880, "toxic_reward": 4.168118977546692 }, { "clip_ratio": 0.0, "completion_length": 47.6, "epoch": 1.6280718336483933, "format_reward": -0.25, "grad_norm": 7.882171154022217, "image_reward": 0.25705363005399706, "kl": 5.1897116780281065, "learning_rate": 5e-06, "loss": 0.0134, "reward": -0.19078816771507262, "reward_std": 1.1844907969236373, "rewards/reward_func": -0.19078816771507262, "step": 6890, "toxic_reward": 4.343024659156799 }, { "clip_ratio": 0.0, "completion_length": 45.925, "epoch": 1.6304347826086958, "format_reward": -0.25, "grad_norm": 13.869507789611816, "image_reward": 0.2716888427734375, "kl": 6.5070923328399655, "learning_rate": 5e-06, "loss": -0.0103, "reward": 0.15193371772766112, "reward_std": 1.270319462940097, "rewards/reward_func": 0.15193371772766112, "step": 6900, "toxic_reward": 4.229231309890747 }, { "clip_ratio": 0.0, "completion_length": 61.3, "epoch": 1.632797731568998, "format_reward": -0.25, "grad_norm": 6.519335746765137, "image_reward": 0.268505859375, "kl": 83.58075475692749, "learning_rate": 5e-06, "loss": 0.0011, "reward": 0.7866749823093414, "reward_std": 1.1198090038727968, "rewards/reward_func": 0.7866749823093414, "step": 6910, "toxic_reward": 4.473654842376709 }, { "clip_ratio": 0.0, "completion_length": 47.875, "epoch": 1.6351606805293004, "format_reward": 0.0, "grad_norm": 5.124833583831787, "image_reward": 0.2610076904296875, "kl": 2.196775460243225, "learning_rate": 5e-06, "loss": 0.015, "reward": 1.0072305798530579, "reward_std": 1.1389783814549446, "rewards/reward_func": 1.0072305798530579, "step": 6920, "toxic_reward": 4.3077033996582035 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 1.637523629489603, "format_reward": -0.5, "grad_norm": 3.5923500061035156, "image_reward": 0.2612147033214569, "kl": 4.726305472850799, "learning_rate": 5e-06, "loss": 0.0617, "reward": 0.24457889199256896, "reward_std": 1.512747337669134, "rewards/reward_func": 0.24457889199256896, "step": 6930, "toxic_reward": 4.234147024154663 }, { "clip_ratio": 0.0, "completion_length": 50.425, "epoch": 1.6398865784499055, "format_reward": 0.0, "grad_norm": 7.177937030792236, "image_reward": 0.2646942138671875, "kl": 5.225604176521301, "learning_rate": 5e-06, "loss": -0.0462, "reward": 0.3636160969734192, "reward_std": 0.6955469690263272, "rewards/reward_func": 0.3636160969734192, "step": 6940, "toxic_reward": 4.153347599506378 }, { "clip_ratio": 0.0, "completion_length": 47.25, "epoch": 1.642249527410208, "format_reward": 0.0, "grad_norm": 10.053350448608398, "image_reward": 0.253045654296875, "kl": 5.69408215880394, "learning_rate": 5e-06, "loss": 0.0027, "reward": 0.612578509747982, "reward_std": 0.6395491607487201, "rewards/reward_func": 0.612578509747982, "step": 6950, "toxic_reward": 3.9997507095336915 }, { "clip_ratio": 0.0, "completion_length": 47.025, "epoch": 1.6446124763705106, "format_reward": 0.0, "grad_norm": 43.84629440307617, "image_reward": 0.2462066650390625, "kl": 391.4977917432785, "learning_rate": 5e-06, "loss": 0.0408, "reward": 0.610540634393692, "reward_std": 1.4011766005307436, "rewards/reward_func": 0.610540634393692, "step": 6960, "toxic_reward": 3.513826107978821 }, { "clip_ratio": 0.0, "completion_length": 45.7, "epoch": 1.6469754253308129, "format_reward": 0.0, "grad_norm": 2.5391600131988525, "image_reward": 0.263494873046875, "kl": 26.704900431632996, "learning_rate": 5e-06, "loss": -0.0209, "reward": 0.6357394754886627, "reward_std": 0.9666919514536858, "rewards/reward_func": 0.6357394754886627, "step": 6970, "toxic_reward": 4.3967194080352785 }, { "clip_ratio": 0.0, "completion_length": 49.275, "epoch": 1.6493383742911152, "format_reward": 0.0, "grad_norm": 1.4004714488983154, "image_reward": 0.252850341796875, "kl": 5.729834485054016, "learning_rate": 5e-06, "loss": 0.0379, "reward": 0.47990578413009644, "reward_std": 0.5631790950894355, "rewards/reward_func": 0.47990578413009644, "step": 6980, "toxic_reward": 4.208314228057861 }, { "clip_ratio": 0.0, "completion_length": 57.5, "epoch": 1.6517013232514177, "format_reward": 0.0, "grad_norm": 5.806096076965332, "image_reward": 0.256463623046875, "kl": 6.840502554178238, "learning_rate": 5e-06, "loss": -0.0496, "reward": 0.16656889617443085, "reward_std": 0.9250041805207729, "rewards/reward_func": 0.16656889617443085, "step": 6990, "toxic_reward": 3.7445754587650297 }, { "clip_ratio": 0.0, "completion_length": 45.475, "epoch": 1.6540642722117203, "format_reward": -0.25, "grad_norm": 1.5651546716690063, "image_reward": 0.2714019775390625, "kl": 4.0309244930744175, "learning_rate": 5e-06, "loss": -0.0266, "reward": 0.3252350568771362, "reward_std": 1.123583555780351, "rewards/reward_func": 0.3252350568771362, "step": 7000, "toxic_reward": 4.437454390525818 }, { "clip_ratio": 0.0, "completion_length": 45.225, "epoch": 1.6564272211720228, "format_reward": -0.25, "grad_norm": 8.255953788757324, "image_reward": 0.2734893798828125, "kl": 13.629169458150864, "learning_rate": 5e-06, "loss": 0.0185, "reward": 0.5219172418117524, "reward_std": 1.4155076075345279, "rewards/reward_func": 0.5219172418117524, "step": 7010, "toxic_reward": 3.9341206908226014 }, { "clip_ratio": 0.0, "completion_length": 44.525, "epoch": 1.658790170132325, "format_reward": 0.0, "grad_norm": 7.765683650970459, "image_reward": 0.2684661865234375, "kl": 4.484488549828529, "learning_rate": 5e-06, "loss": 0.0198, "reward": 0.41308672428131105, "reward_std": 0.6728175904601812, "rewards/reward_func": 0.41308672428131105, "step": 7020, "toxic_reward": 4.680417871475219 }, { "clip_ratio": 0.0, "completion_length": 43.425, "epoch": 1.6611531190926276, "format_reward": 0.0, "grad_norm": 6.050631523132324, "image_reward": 0.2446624755859375, "kl": 5.303134024143219, "learning_rate": 5e-06, "loss": -0.0023, "reward": 0.7305093944072724, "reward_std": 0.8056725425645709, "rewards/reward_func": 0.7305093944072724, "step": 7030, "toxic_reward": 4.371766519546509 }, { "clip_ratio": 0.0, "completion_length": 50.725, "epoch": 1.66351606805293, "format_reward": -0.25, "grad_norm": 36.67766189575195, "image_reward": 0.2738332122564316, "kl": 3.6509076714515687, "learning_rate": 5e-06, "loss": 0.0104, "reward": 0.1336117923259735, "reward_std": 1.408228962123394, "rewards/reward_func": 0.1336117923259735, "step": 7040, "toxic_reward": 3.868592691421509 }, { "clip_ratio": 0.0, "completion_length": 45.95, "epoch": 1.6658790170132325, "format_reward": -0.25, "grad_norm": 3.2863216400146484, "image_reward": 0.2496002197265625, "kl": 47.7468825340271, "learning_rate": 5e-06, "loss": -0.0431, "reward": -0.1912323772907257, "reward_std": 1.0762672819197179, "rewards/reward_func": -0.1912323772907257, "step": 7050, "toxic_reward": 4.286359405517578 }, { "clip_ratio": 0.0, "completion_length": 50.8, "epoch": 1.668241965973535, "format_reward": -0.75, "grad_norm": 15.117477416992188, "image_reward": 0.2343353286385536, "kl": 12.957087469100951, "learning_rate": 5e-06, "loss": -0.0128, "reward": -0.5250297307968139, "reward_std": 2.128708484955132, "rewards/reward_func": -0.5250297307968139, "step": 7060, "toxic_reward": 4.045247128605842 }, { "clip_ratio": 0.0, "completion_length": 49.15, "epoch": 1.6706049149338376, "format_reward": -0.25, "grad_norm": 6.236794471740723, "image_reward": 0.2446756988763809, "kl": 3.7372434973716735, "learning_rate": 5e-06, "loss": -0.0289, "reward": 0.1568456247448921, "reward_std": 1.3125899083912373, "rewards/reward_func": 0.1568456247448921, "step": 7070, "toxic_reward": 4.527845191955566 }, { "clip_ratio": 0.0, "completion_length": 47.0, "epoch": 1.6729678638941399, "format_reward": 0.0, "grad_norm": 1.8276275396347046, "image_reward": 0.253338623046875, "kl": 18.982901883125304, "learning_rate": 5e-06, "loss": -0.0591, "reward": 0.5796426713466645, "reward_std": 0.8607377586886287, "rewards/reward_func": 0.5796426713466645, "step": 7080, "toxic_reward": 4.365747809410095 }, { "clip_ratio": 0.0, "completion_length": 52.0, "epoch": 1.6753308128544422, "format_reward": 0.0, "grad_norm": 5.115592956542969, "image_reward": 0.25368804931640626, "kl": 8.384132671356202, "learning_rate": 5e-06, "loss": -0.0358, "reward": 0.4894866108894348, "reward_std": 0.82001001983881, "rewards/reward_func": 0.4894866108894348, "step": 7090, "toxic_reward": 4.161544275283814 }, { "clip_ratio": 0.0, "completion_length": 55.45, "epoch": 1.6776937618147447, "format_reward": -0.25, "grad_norm": 4.421766757965088, "image_reward": 0.257010905444622, "kl": 1.5507995724678039, "learning_rate": 5e-06, "loss": 0.0479, "reward": -0.053127193450927736, "reward_std": 0.8562082014977932, "rewards/reward_func": -0.053127193450927736, "step": 7100, "toxic_reward": 4.494407868385315 }, { "clip_ratio": 0.0, "completion_length": 39.125, "epoch": 1.6800567107750473, "format_reward": 0.0, "grad_norm": 15.849198341369629, "image_reward": 0.254315185546875, "kl": 4.097546017169952, "learning_rate": 5e-06, "loss": -0.0412, "reward": 0.42282047867774963, "reward_std": 0.9609952576458454, "rewards/reward_func": 0.42282047867774963, "step": 7110, "toxic_reward": 4.278821682929992 }, { "clip_ratio": 0.0, "completion_length": 47.15, "epoch": 1.6824196597353498, "format_reward": -0.25, "grad_norm": 9.184070587158203, "image_reward": 0.2508982330560684, "kl": 1.9086317151784897, "learning_rate": 5e-06, "loss": -0.0676, "reward": -0.08927419185638427, "reward_std": 1.1106989961117506, "rewards/reward_func": -0.08927419185638427, "step": 7120, "toxic_reward": 4.535511326789856 }, { "clip_ratio": 0.0, "completion_length": 48.45, "epoch": 1.6847826086956523, "format_reward": 0.0, "grad_norm": 10.833540916442871, "image_reward": 0.2317718505859375, "kl": 1.5903507679700852, "learning_rate": 5e-06, "loss": -0.0026, "reward": 0.5086119592189788, "reward_std": 0.610715470276773, "rewards/reward_func": 0.5086119592189788, "step": 7130, "toxic_reward": 4.333575582504272 }, { "clip_ratio": 0.0, "completion_length": 49.75, "epoch": 1.6871455576559546, "format_reward": 0.0, "grad_norm": 5.887187480926514, "image_reward": 0.2499237060546875, "kl": 49.958091259002686, "learning_rate": 5e-06, "loss": -0.0703, "reward": 0.7590021967887879, "reward_std": 0.8256058894097805, "rewards/reward_func": 0.7590021967887879, "step": 7140, "toxic_reward": 4.225302958488465 }, { "clip_ratio": 0.0, "completion_length": 55.325, "epoch": 1.689508506616257, "format_reward": 0.0, "grad_norm": 12.265044212341309, "image_reward": 0.23530120849609376, "kl": 0.9182627111673355, "learning_rate": 5e-06, "loss": -0.0673, "reward": 0.3405183613300323, "reward_std": 0.7431152425706387, "rewards/reward_func": 0.3405183613300323, "step": 7150, "toxic_reward": 4.184520816802978 }, { "clip_ratio": 0.0, "completion_length": 52.175, "epoch": 1.6918714555765595, "format_reward": -0.25, "grad_norm": 3.966953754425049, "image_reward": 0.267620849609375, "kl": 1.64820496737957, "learning_rate": 5e-06, "loss": 0.1687, "reward": 0.01701483130455017, "reward_std": 1.4536124819889664, "rewards/reward_func": 0.01701483130455017, "step": 7160, "toxic_reward": 3.7502978086471557 }, { "clip_ratio": 0.0, "completion_length": 45.15, "epoch": 1.694234404536862, "format_reward": 0.0, "grad_norm": 5.9182000160217285, "image_reward": 0.25394287109375, "kl": 1.6780494809150697, "learning_rate": 5e-06, "loss": 0.0719, "reward": 0.8605155050754547, "reward_std": 0.9149322494864464, "rewards/reward_func": 0.8605155050754547, "step": 7170, "toxic_reward": 3.9694084405899046 }, { "clip_ratio": 0.0, "completion_length": 44.525, "epoch": 1.6965973534971646, "format_reward": 0.0, "grad_norm": 2.476659059524536, "image_reward": 0.272882080078125, "kl": 3.32854140996933, "learning_rate": 5e-06, "loss": -0.0742, "reward": 1.0633208215236665, "reward_std": 0.9789414823055267, "rewards/reward_func": 1.0633208215236665, "step": 7180, "toxic_reward": 4.391367101669312 }, { "clip_ratio": 0.0, "completion_length": 45.825, "epoch": 1.6989603024574669, "format_reward": 0.0, "grad_norm": 3.7560040950775146, "image_reward": 0.269110107421875, "kl": 1.4420736670494079, "learning_rate": 5e-06, "loss": 0.0345, "reward": 1.0360184490680695, "reward_std": 0.8136029925197363, "rewards/reward_func": 1.0360184490680695, "step": 7190, "toxic_reward": 3.99700380563736 }, { "clip_ratio": 0.0, "completion_length": 33.925, "epoch": 1.7013232514177694, "format_reward": 0.0, "grad_norm": 11.579362869262695, "image_reward": 0.2462371826171875, "kl": 0.9962957471609115, "learning_rate": 5e-06, "loss": -0.0255, "reward": 0.2942840725183487, "reward_std": 0.3486198179423809, "rewards/reward_func": 0.2942840725183487, "step": 7200, "toxic_reward": 3.8329622387886046 }, { "clip_ratio": 0.0, "completion_length": 48.825, "epoch": 1.7036862003780717, "format_reward": -0.5, "grad_norm": 15.74374008178711, "image_reward": 0.26438903957605364, "kl": 1.2382088035345078, "learning_rate": 5e-06, "loss": -0.0735, "reward": 0.2413632392883301, "reward_std": 1.651388045027852, "rewards/reward_func": 0.2413632392883301, "step": 7210, "toxic_reward": 4.516292905807495 }, { "clip_ratio": 0.0, "completion_length": 46.625, "epoch": 1.7060491493383743, "format_reward": 0.0, "grad_norm": 9.237770080566406, "image_reward": 0.2369354248046875, "kl": 1.5744222581386567, "learning_rate": 5e-06, "loss": 0.0256, "reward": 0.6944510787725449, "reward_std": 1.11760393679142, "rewards/reward_func": 0.6944510787725449, "step": 7220, "toxic_reward": 3.7596142530441283 }, { "clip_ratio": 0.0, "completion_length": 39.125, "epoch": 1.7084120982986768, "format_reward": 0.0, "grad_norm": 3.7665228843688965, "image_reward": 0.2577423095703125, "kl": 0.7259436190128327, "learning_rate": 5e-06, "loss": 0.0117, "reward": 0.5142745256423951, "reward_std": 0.6884998820722104, "rewards/reward_func": 0.5142745256423951, "step": 7230, "toxic_reward": 4.332102084159851 }, { "clip_ratio": 0.0, "completion_length": 48.475, "epoch": 1.7107750472589793, "format_reward": 0.0, "grad_norm": 4.795387268066406, "image_reward": 0.28794708251953127, "kl": 1.6049385368824005, "learning_rate": 5e-06, "loss": 0.0341, "reward": 0.308843332529068, "reward_std": 0.4225019045174122, "rewards/reward_func": 0.308843332529068, "step": 7240, "toxic_reward": 4.501336789131164 }, { "clip_ratio": 0.0, "completion_length": 55.0, "epoch": 1.7131379962192816, "format_reward": -0.25, "grad_norm": 11.164639472961426, "image_reward": 0.25756022036075593, "kl": 0.43412337452173233, "learning_rate": 5e-06, "loss": -0.026, "reward": 0.46165032386779786, "reward_std": 0.9854918915778399, "rewards/reward_func": 0.46165032386779786, "step": 7250, "toxic_reward": 4.23072258234024 }, { "clip_ratio": 0.0, "completion_length": 48.925, "epoch": 1.715500945179584, "format_reward": 0.0, "grad_norm": 26.601303100585938, "image_reward": 0.24893798828125, "kl": 3.482639339566231, "learning_rate": 5e-06, "loss": -0.0419, "reward": 0.5657954633235931, "reward_std": 1.2434701435267925, "rewards/reward_func": 0.5657954633235931, "step": 7260, "toxic_reward": 4.052207565307617 }, { "clip_ratio": 0.0, "completion_length": 46.85, "epoch": 1.7178638941398865, "format_reward": -0.25, "grad_norm": 19.468366622924805, "image_reward": 0.24361775815486908, "kl": 0.6207199424505234, "learning_rate": 5e-06, "loss": -0.1011, "reward": 0.2289634108543396, "reward_std": 1.1521323285996914, "rewards/reward_func": 0.2289634108543396, "step": 7270, "toxic_reward": 4.243139553070068 }, { "clip_ratio": 0.0, "completion_length": 44.475, "epoch": 1.720226843100189, "format_reward": 0.0, "grad_norm": 11.427348136901855, "image_reward": 0.24371236115694045, "kl": 1.2423572808504104, "learning_rate": 5e-06, "loss": -0.0055, "reward": 0.44162888526916505, "reward_std": 1.226283924281597, "rewards/reward_func": 0.44162888526916505, "step": 7280, "toxic_reward": 3.9729990482330324 }, { "clip_ratio": 0.0, "completion_length": 49.725, "epoch": 1.7225897920604916, "format_reward": 0.0, "grad_norm": 8.307239532470703, "image_reward": 0.2623565673828125, "kl": 0.795675303786993, "learning_rate": 5e-06, "loss": 0.005, "reward": 0.47723318338394166, "reward_std": 0.5881602220237255, "rewards/reward_func": 0.47723318338394166, "step": 7290, "toxic_reward": 4.611540603637695 }, { "clip_ratio": 0.0, "completion_length": 49.125, "epoch": 1.724952741020794, "format_reward": 0.0, "grad_norm": 7.304860591888428, "image_reward": 0.2538177490234375, "kl": 1.0193208366632462, "learning_rate": 5e-06, "loss": -0.023, "reward": 0.17551978230476378, "reward_std": 0.5646818313747645, "rewards/reward_func": 0.17551978230476378, "step": 7300, "toxic_reward": 4.499063897132873 }, { "clip_ratio": 0.0, "completion_length": 50.275, "epoch": 1.7273156899810964, "format_reward": 0.0, "grad_norm": 1.351771354675293, "image_reward": 0.2463592529296875, "kl": 1.9171950757503509, "learning_rate": 5e-06, "loss": 0.0213, "reward": 0.466388076543808, "reward_std": 0.8451812721788883, "rewards/reward_func": 0.466388076543808, "step": 7310, "toxic_reward": 4.565359354019165 }, { "clip_ratio": 0.0, "completion_length": 46.925, "epoch": 1.7296786389413987, "format_reward": -0.25, "grad_norm": 2.364166021347046, "image_reward": 0.25976969450712206, "kl": 0.5259292095899581, "learning_rate": 5e-06, "loss": -0.084, "reward": 0.45669102370738984, "reward_std": 1.098591622710228, "rewards/reward_func": 0.45669102370738984, "step": 7320, "toxic_reward": 4.627902317047119 }, { "clip_ratio": 0.0, "completion_length": 50.825, "epoch": 1.7320415879017013, "format_reward": 0.0, "grad_norm": 23.96133804321289, "image_reward": 0.2389556884765625, "kl": 1.1734901428222657, "learning_rate": 5e-06, "loss": 0.038, "reward": 0.7277517914772034, "reward_std": 0.8356013357639313, "rewards/reward_func": 0.7277517914772034, "step": 7330, "toxic_reward": 4.407384157180786 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 1.7344045368620038, "format_reward": 0.0, "grad_norm": 17.774612426757812, "image_reward": 0.2680206298828125, "kl": 4.040656617283821, "learning_rate": 5e-06, "loss": 0.0436, "reward": 0.2283779501914978, "reward_std": 0.34994165217503903, "rewards/reward_func": 0.2283779501914978, "step": 7340, "toxic_reward": 4.637366437911988 }, { "clip_ratio": 0.0, "completion_length": 42.25, "epoch": 1.7367674858223063, "format_reward": -0.25, "grad_norm": 12.662446022033691, "image_reward": 0.24230550229549408, "kl": 0.5235348105430603, "learning_rate": 5e-06, "loss": -0.042, "reward": 0.8077804684638977, "reward_std": 1.315062115341425, "rewards/reward_func": 0.8077804684638977, "step": 7350, "toxic_reward": 4.6036452293396 }, { "clip_ratio": 0.0, "completion_length": 49.75, "epoch": 1.7391304347826086, "format_reward": 0.0, "grad_norm": 2.7947723865509033, "image_reward": 0.259271240234375, "kl": 0.5023418068885803, "learning_rate": 5e-06, "loss": -0.0101, "reward": 0.47644210457801817, "reward_std": 0.6371240261942148, "rewards/reward_func": 0.47644210457801817, "step": 7360, "toxic_reward": 4.352305841445923 }, { "clip_ratio": 0.0, "completion_length": 53.975, "epoch": 1.7414933837429112, "format_reward": 0.0, "grad_norm": 6.967306137084961, "image_reward": 0.2540252685546875, "kl": 0.5360975474119186, "learning_rate": 5e-06, "loss": 0.0708, "reward": 0.5753240287303925, "reward_std": 0.8622719066217541, "rewards/reward_func": 0.5753240287303925, "step": 7370, "toxic_reward": 4.0306689739227295 }, { "clip_ratio": 0.0, "completion_length": 44.75, "epoch": 1.7438563327032135, "format_reward": -0.5, "grad_norm": 31.72753143310547, "image_reward": 0.22639973908662797, "kl": 0.5255977511405945, "learning_rate": 5e-06, "loss": 0.0039, "reward": -0.24822215884923934, "reward_std": 1.6855425260961057, "rewards/reward_func": -0.24822215884923934, "step": 7380, "toxic_reward": 3.752596640586853 }, { "clip_ratio": 0.0, "completion_length": 52.375, "epoch": 1.746219281663516, "format_reward": 0.0, "grad_norm": 7.6846818923950195, "image_reward": 0.2564056396484375, "kl": 3.591386225819588, "learning_rate": 5e-06, "loss": 0.0212, "reward": 0.12304354310035706, "reward_std": 0.8115306086838245, "rewards/reward_func": 0.12304354310035706, "step": 7390, "toxic_reward": 3.613353615999222 }, { "clip_ratio": 0.0, "completion_length": 49.65, "epoch": 1.7485822306238186, "format_reward": 0.0, "grad_norm": 2.726175308227539, "image_reward": 0.283404541015625, "kl": 0.9659576997160911, "learning_rate": 5e-06, "loss": 0.0252, "reward": 0.3961315780878067, "reward_std": 1.0492550559341907, "rewards/reward_func": 0.3961315780878067, "step": 7400, "toxic_reward": 3.501691198348999 }, { "clip_ratio": 0.0, "completion_length": 41.95, "epoch": 1.750945179584121, "format_reward": 0.0, "grad_norm": 3.0125391483306885, "image_reward": 0.2607086181640625, "kl": 0.6532519310712814, "learning_rate": 5e-06, "loss": 0.0276, "reward": 0.4769218623638153, "reward_std": 0.6247519843280316, "rewards/reward_func": 0.4769218623638153, "step": 7410, "toxic_reward": 4.560657954216003 }, { "clip_ratio": 0.0, "completion_length": 56.425, "epoch": 1.7533081285444234, "format_reward": 0.0, "grad_norm": 18.774812698364258, "image_reward": 0.24655609130859374, "kl": 1.901971572637558, "learning_rate": 5e-06, "loss": 0.0145, "reward": 0.6345466494560241, "reward_std": 1.1331901341676711, "rewards/reward_func": 0.6345466494560241, "step": 7420, "toxic_reward": 4.449591946601868 }, { "clip_ratio": 0.0, "completion_length": 52.375, "epoch": 1.755671077504726, "format_reward": 0.0, "grad_norm": 4.1103057861328125, "image_reward": 0.265411376953125, "kl": 1.7676091372966767, "learning_rate": 5e-06, "loss": 0.0034, "reward": 0.6921305894851685, "reward_std": 0.6238477535545826, "rewards/reward_func": 0.6921305894851685, "step": 7430, "toxic_reward": 3.859569197893143 }, { "clip_ratio": 0.0, "completion_length": 46.65, "epoch": 1.7580340264650283, "format_reward": 0.0, "grad_norm": 2.0048232078552246, "image_reward": 0.24298477172851562, "kl": 4.202221667766571, "learning_rate": 5e-06, "loss": 0.1367, "reward": 0.9155125916004181, "reward_std": 0.7328770853579044, "rewards/reward_func": 0.9155125916004181, "step": 7440, "toxic_reward": 4.531428098678589 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 1.7603969754253308, "format_reward": 0.0, "grad_norm": 20.737003326416016, "image_reward": 0.25689697265625, "kl": 16.54909121990204, "learning_rate": 5e-06, "loss": 0.037, "reward": 0.8588055372238159, "reward_std": 0.9005012600682676, "rewards/reward_func": 0.8588055372238159, "step": 7450, "toxic_reward": 4.513736462593078 }, { "clip_ratio": 0.0, "completion_length": 50.7, "epoch": 1.7627599243856333, "format_reward": 0.0, "grad_norm": 1.5940968990325928, "image_reward": 0.26309814453125, "kl": 2.317558985948563, "learning_rate": 5e-06, "loss": 0.0018, "reward": 0.20084644556045533, "reward_std": 0.7237232834100723, "rewards/reward_func": 0.20084644556045533, "step": 7460, "toxic_reward": 4.334891009330749 }, { "clip_ratio": 0.0, "completion_length": 44.325, "epoch": 1.7651228733459359, "format_reward": 0.0, "grad_norm": 10.07941722869873, "image_reward": 0.2528594970703125, "kl": 1.3212820410728454, "learning_rate": 5e-06, "loss": -0.0444, "reward": 1.2387877494096755, "reward_std": 0.8179315060377121, "rewards/reward_func": 1.2387877494096755, "step": 7470, "toxic_reward": 4.363593196868896 }, { "clip_ratio": 0.0, "completion_length": 49.825, "epoch": 1.7674858223062382, "format_reward": 0.0, "grad_norm": 28.392396926879883, "image_reward": 0.25749053955078127, "kl": 2.198029878735542, "learning_rate": 5e-06, "loss": 0.0322, "reward": 0.1901194632053375, "reward_std": 0.5339192871004343, "rewards/reward_func": 0.1901194632053375, "step": 7480, "toxic_reward": 4.514597225189209 }, { "clip_ratio": 0.0, "completion_length": 44.7, "epoch": 1.7698487712665405, "format_reward": 0.0, "grad_norm": 26.77941131591797, "image_reward": 0.241033935546875, "kl": 6.588536351919174, "learning_rate": 5e-06, "loss": 0.0218, "reward": 0.2174743801355362, "reward_std": 0.8413432762026787, "rewards/reward_func": 0.2174743801355362, "step": 7490, "toxic_reward": 4.284235906600952 }, { "clip_ratio": 0.0, "completion_length": 42.55, "epoch": 1.772211720226843, "format_reward": -0.25, "grad_norm": 18.408794403076172, "image_reward": 0.2581207275390625, "kl": 3.106099420785904, "learning_rate": 5e-06, "loss": 0.0257, "reward": 0.31152122020721434, "reward_std": 1.2936958684585989, "rewards/reward_func": 0.31152122020721434, "step": 7500, "toxic_reward": 4.310132288932801 }, { "clip_ratio": 0.0, "completion_length": 40.475, "epoch": 1.7745746691871456, "format_reward": 0.0, "grad_norm": 32.10823440551758, "image_reward": 0.232781982421875, "kl": 11.768871355056763, "learning_rate": 5e-06, "loss": -0.0201, "reward": 1.5193881750106812, "reward_std": 0.8748866233974695, "rewards/reward_func": 1.5193881750106812, "step": 7510, "toxic_reward": 4.612711477279663 }, { "clip_ratio": 0.0, "completion_length": 45.575, "epoch": 1.776937618147448, "format_reward": 0.0, "grad_norm": 10.912269592285156, "image_reward": 0.235693359375, "kl": 2.0526355147361754, "learning_rate": 5e-06, "loss": 0.1284, "reward": 1.3539286196231841, "reward_std": 0.9052736334502697, "rewards/reward_func": 1.3539286196231841, "step": 7520, "toxic_reward": 4.487947154045105 }, { "clip_ratio": 0.0, "completion_length": 46.575, "epoch": 1.7793005671077504, "format_reward": 0.0, "grad_norm": 13.928491592407227, "image_reward": 0.2556488037109375, "kl": 21.833010697364806, "learning_rate": 5e-06, "loss": -0.1174, "reward": 0.5344179272651672, "reward_std": 0.7245766028761864, "rewards/reward_func": 0.5344179272651672, "step": 7530, "toxic_reward": 4.373207831382752 }, { "clip_ratio": 0.0, "completion_length": 46.975, "epoch": 1.781663516068053, "format_reward": 0.0, "grad_norm": 8.675307273864746, "image_reward": 0.24088897705078124, "kl": 1.3107800006866455, "learning_rate": 5e-06, "loss": 0.0403, "reward": 0.04000200629234314, "reward_std": 1.0572677969932556, "rewards/reward_func": 0.04000200629234314, "step": 7540, "toxic_reward": 4.048869323730469 }, { "clip_ratio": 0.0, "completion_length": 40.975, "epoch": 1.7840264650283553, "format_reward": -0.25, "grad_norm": 3.656561851501465, "image_reward": 0.24349263608455657, "kl": 3.6083962321281433, "learning_rate": 5e-06, "loss": -0.0326, "reward": 0.10396124720573426, "reward_std": 1.0819443106651305, "rewards/reward_func": 0.10396124720573426, "step": 7550, "toxic_reward": 4.45411868095398 }, { "clip_ratio": 0.0, "completion_length": 40.6, "epoch": 1.7863894139886578, "format_reward": 0.0, "grad_norm": 2.0053718090057373, "image_reward": 0.280938720703125, "kl": 1.8616322338581086, "learning_rate": 5e-06, "loss": -0.0035, "reward": 0.602351513504982, "reward_std": 0.8774395015090704, "rewards/reward_func": 0.602351513504982, "step": 7560, "toxic_reward": 3.8221758723258974 }, { "clip_ratio": 0.0, "completion_length": 49.15, "epoch": 1.7887523629489603, "format_reward": 0.0, "grad_norm": 6.999305248260498, "image_reward": 0.24803619384765624, "kl": 1.7729626595973969, "learning_rate": 5e-06, "loss": -0.004, "reward": 0.33846797943115237, "reward_std": 0.587756198644638, "rewards/reward_func": 0.33846797943115237, "step": 7570, "toxic_reward": 4.2568159103393555 }, { "clip_ratio": 0.0, "completion_length": 38.375, "epoch": 1.7911153119092629, "format_reward": 0.0, "grad_norm": 12.467576026916504, "image_reward": 0.2431640625, "kl": 1.3180940926074982, "learning_rate": 5e-06, "loss": 0.0225, "reward": 0.6568324744701386, "reward_std": 0.5710492163896561, "rewards/reward_func": 0.6568324744701386, "step": 7580, "toxic_reward": 4.575870084762573 }, { "clip_ratio": 0.0, "completion_length": 48.675, "epoch": 1.7934782608695652, "format_reward": 0.0, "grad_norm": 41.636165618896484, "image_reward": 0.2553070068359375, "kl": 1.2196908950805665, "learning_rate": 5e-06, "loss": 0.0276, "reward": 0.9933471500873565, "reward_std": 0.8478576868772507, "rewards/reward_func": 0.9933471500873565, "step": 7590, "toxic_reward": 4.177789008617401 }, { "clip_ratio": 0.0, "completion_length": 36.3, "epoch": 1.7958412098298677, "format_reward": 0.0, "grad_norm": 8.115588188171387, "image_reward": 0.27226715087890624, "kl": 5.791901814937591, "learning_rate": 5e-06, "loss": 0.0022, "reward": 0.3163196682929993, "reward_std": 0.8629786409437656, "rewards/reward_func": 0.3163196682929993, "step": 7600, "toxic_reward": 3.73489425778389 }, { "clip_ratio": 0.0, "completion_length": 49.95, "epoch": 1.79820415879017, "format_reward": 0.0, "grad_norm": 2.3679516315460205, "image_reward": 0.2376190185546875, "kl": 4.311083900928497, "learning_rate": 5e-06, "loss": -0.0175, "reward": 0.6290358543395996, "reward_std": 1.0244077319279312, "rewards/reward_func": 0.6290358543395996, "step": 7610, "toxic_reward": 4.054656505584717 }, { "clip_ratio": 0.0, "completion_length": 49.35, "epoch": 1.8005671077504726, "format_reward": 0.0, "grad_norm": 2.1850380897521973, "image_reward": 0.247137451171875, "kl": 3.278796100616455, "learning_rate": 5e-06, "loss": 0.056, "reward": 1.2004601210355759, "reward_std": 0.7055684822611511, "rewards/reward_func": 1.2004601210355759, "step": 7620, "toxic_reward": 3.5256235122680666 }, { "clip_ratio": 0.0, "completion_length": 39.75, "epoch": 1.802930056710775, "format_reward": -0.25, "grad_norm": 3.8605425357818604, "image_reward": 0.24492238312959672, "kl": 7.6126263558864595, "learning_rate": 5e-06, "loss": -0.0959, "reward": 0.37777516841888426, "reward_std": 1.1775035494938493, "rewards/reward_func": 0.37777516841888426, "step": 7630, "toxic_reward": 4.522587513923645 }, { "clip_ratio": 0.0, "completion_length": 45.375, "epoch": 1.8052930056710776, "format_reward": -0.25, "grad_norm": 6.144404411315918, "image_reward": 0.241168212890625, "kl": 1.456436914205551, "learning_rate": 5e-06, "loss": -0.0231, "reward": 0.36865578293800355, "reward_std": 1.7164668783545494, "rewards/reward_func": 0.36865578293800355, "step": 7640, "toxic_reward": 3.9745461702346803 }, { "clip_ratio": 0.0, "completion_length": 47.775, "epoch": 1.80765595463138, "format_reward": 0.0, "grad_norm": 33.95363998413086, "image_reward": 0.2472625732421875, "kl": 2.2694355845451355, "learning_rate": 5e-06, "loss": -0.1037, "reward": 0.8588967323303223, "reward_std": 1.019287913478911, "rewards/reward_func": 0.8588967323303223, "step": 7650, "toxic_reward": 4.213714742660523 }, { "clip_ratio": 0.0, "completion_length": 50.35, "epoch": 1.8100189035916823, "format_reward": -0.25, "grad_norm": 6.865695953369141, "image_reward": 0.25559844970703127, "kl": 6.3857537567615505, "learning_rate": 5e-06, "loss": -0.0924, "reward": -0.07846069931983948, "reward_std": 1.1336833463981748, "rewards/reward_func": -0.07846069931983948, "step": 7660, "toxic_reward": 4.2933889627456665 }, { "clip_ratio": 0.0, "completion_length": 53.825, "epoch": 1.8123818525519848, "format_reward": 0.0, "grad_norm": 2.160090923309326, "image_reward": 0.2841064453125, "kl": 5.202520692348481, "learning_rate": 5e-06, "loss": 0.006, "reward": 1.153634887933731, "reward_std": 1.2888424217700958, "rewards/reward_func": 1.153634887933731, "step": 7670, "toxic_reward": 3.994613242149353 }, { "clip_ratio": 0.0, "completion_length": 50.975, "epoch": 1.8147448015122873, "format_reward": 0.0, "grad_norm": 3.903553009033203, "image_reward": 0.26880950927734376, "kl": 63.095464119315146, "learning_rate": 5e-06, "loss": 0.001, "reward": 1.0250155806541443, "reward_std": 0.7393251709640026, "rewards/reward_func": 1.0250155806541443, "step": 7680, "toxic_reward": 3.694820535182953 }, { "clip_ratio": 0.0, "completion_length": 43.15, "epoch": 1.8171077504725899, "format_reward": 0.0, "grad_norm": 15.986948013305664, "image_reward": 0.2573333740234375, "kl": 3.8690971970558166, "learning_rate": 5e-06, "loss": 0.0365, "reward": 0.9120604813098907, "reward_std": 0.8725108332931996, "rewards/reward_func": 0.9120604813098907, "step": 7690, "toxic_reward": 4.068342316150665 }, { "clip_ratio": 0.0, "completion_length": 46.65, "epoch": 1.8194706994328924, "format_reward": 0.0, "grad_norm": 2.521322727203369, "image_reward": 0.232586669921875, "kl": 1.3404993683099746, "learning_rate": 5e-06, "loss": -0.0105, "reward": 0.08191419243812562, "reward_std": 0.6063120868057013, "rewards/reward_func": 0.08191419243812562, "step": 7700, "toxic_reward": 4.285334658622742 }, { "clip_ratio": 0.0, "completion_length": 54.65, "epoch": 1.8218336483931947, "format_reward": 0.0, "grad_norm": 10.563508033752441, "image_reward": 0.251318359375, "kl": 4.375722473859787, "learning_rate": 5e-06, "loss": 0.0574, "reward": 0.7043181240558625, "reward_std": 0.5366579249501229, "rewards/reward_func": 0.7043181240558625, "step": 7710, "toxic_reward": 4.468820595741272 }, { "clip_ratio": 0.0, "completion_length": 47.175, "epoch": 1.824196597353497, "format_reward": -0.25, "grad_norm": 5.306228160858154, "image_reward": 0.25533854216337204, "kl": 1.2862511157989502, "learning_rate": 5e-06, "loss": 0.0064, "reward": 0.2715910017490387, "reward_std": 1.3802445553243161, "rewards/reward_func": 0.2715910017490387, "step": 7720, "toxic_reward": 4.128815948963165 }, { "clip_ratio": 0.0, "completion_length": 53.525, "epoch": 1.8265595463137996, "format_reward": 0.0, "grad_norm": 4.18682336807251, "image_reward": 0.22406005859375, "kl": 13.262214809656143, "learning_rate": 5e-06, "loss": -0.0483, "reward": 0.43178263306617737, "reward_std": 0.5340902636758983, "rewards/reward_func": 0.43178263306617737, "step": 7730, "toxic_reward": 4.167550274729729 }, { "clip_ratio": 0.0, "completion_length": 51.15, "epoch": 1.8289224952741021, "format_reward": 0.0, "grad_norm": 16.534120559692383, "image_reward": 0.2627288818359375, "kl": 4.888235807418823, "learning_rate": 5e-06, "loss": 0.0231, "reward": 0.46792620718479155, "reward_std": 0.6471607919782401, "rewards/reward_func": 0.46792620718479155, "step": 7740, "toxic_reward": 4.068465518951416 }, { "clip_ratio": 0.0, "completion_length": 45.225, "epoch": 1.8312854442344046, "format_reward": 0.0, "grad_norm": 10.179228782653809, "image_reward": 0.249444580078125, "kl": 3.951664477586746, "learning_rate": 5e-06, "loss": -0.0106, "reward": 1.0039419054985046, "reward_std": 0.8490265306085348, "rewards/reward_func": 1.0039419054985046, "step": 7750, "toxic_reward": 4.28922358751297 }, { "clip_ratio": 0.0, "completion_length": 43.0, "epoch": 1.833648393194707, "format_reward": 0.0, "grad_norm": 1.3173015117645264, "image_reward": 0.2619903564453125, "kl": 3.221765196323395, "learning_rate": 5e-06, "loss": -0.0057, "reward": 0.5499142289161683, "reward_std": 0.8114865634590387, "rewards/reward_func": 0.5499142289161683, "step": 7760, "toxic_reward": 4.202396821975708 }, { "clip_ratio": 0.0, "completion_length": 45.95, "epoch": 1.8360113421550095, "format_reward": -0.25, "grad_norm": 9.477835655212402, "image_reward": 0.275567626953125, "kl": 7.138307851552963, "learning_rate": 5e-06, "loss": -0.0182, "reward": 0.5576439201831818, "reward_std": 1.718572654016316, "rewards/reward_func": 0.5576439201831818, "step": 7770, "toxic_reward": 3.892837381362915 }, { "clip_ratio": 0.0, "completion_length": 55.325, "epoch": 1.8383742911153118, "format_reward": -0.25, "grad_norm": 15.364556312561035, "image_reward": 0.22845306396484374, "kl": 5.752876976132393, "learning_rate": 5e-06, "loss": 0.0724, "reward": 0.4619426131248474, "reward_std": 0.994839246571064, "rewards/reward_func": 0.4619426131248474, "step": 7780, "toxic_reward": 4.777106142044067 }, { "clip_ratio": 0.0, "completion_length": 62.75, "epoch": 1.8407372400756143, "format_reward": -0.25, "grad_norm": 13.07127571105957, "image_reward": 0.24772542268037795, "kl": 2.2956355273723603, "learning_rate": 5e-06, "loss": -0.0286, "reward": 0.23627470731735228, "reward_std": 1.3513500357046724, "rewards/reward_func": 0.23627470731735228, "step": 7790, "toxic_reward": 3.998799777030945 }, { "clip_ratio": 0.0, "completion_length": 54.675, "epoch": 1.8431001890359169, "format_reward": -0.25, "grad_norm": 1.4357541799545288, "image_reward": 0.2586761474609375, "kl": 2.018620651960373, "learning_rate": 5e-06, "loss": 0.0136, "reward": 0.5957891523838044, "reward_std": 1.3981972932815552, "rewards/reward_func": 0.5957891523838044, "step": 7800, "toxic_reward": 3.74977787733078 }, { "clip_ratio": 0.0, "completion_length": 50.9, "epoch": 1.8454631379962194, "format_reward": 0.0, "grad_norm": 12.382879257202148, "image_reward": 0.251861572265625, "kl": 2.0946659803390504, "learning_rate": 5e-06, "loss": -0.0109, "reward": 0.2852811634540558, "reward_std": 0.7155913963913918, "rewards/reward_func": 0.2852811634540558, "step": 7810, "toxic_reward": 4.4501420021057125 }, { "clip_ratio": 0.0, "completion_length": 48.125, "epoch": 1.8478260869565217, "format_reward": 0.0, "grad_norm": 4.438508987426758, "image_reward": 0.270166015625, "kl": 1.658120059967041, "learning_rate": 5e-06, "loss": 0.0217, "reward": 0.8978000760078431, "reward_std": 1.2586904138326644, "rewards/reward_func": 0.8978000760078431, "step": 7820, "toxic_reward": 4.126551675796509 }, { "clip_ratio": 0.0, "completion_length": 37.9, "epoch": 1.850189035916824, "format_reward": 0.0, "grad_norm": 1.4302005767822266, "image_reward": 0.2738861083984375, "kl": 1.6736175537109375, "learning_rate": 5e-06, "loss": -0.1503, "reward": 0.2234538435935974, "reward_std": 0.7356585245579481, "rewards/reward_func": 0.2234538435935974, "step": 7830, "toxic_reward": 3.9116656303405763 }, { "clip_ratio": 0.0, "completion_length": 46.125, "epoch": 1.8525519848771266, "format_reward": 0.0, "grad_norm": 5.846213340759277, "image_reward": 0.2658111572265625, "kl": 1.36658373773098, "learning_rate": 5e-06, "loss": -0.0164, "reward": -0.04418985247611999, "reward_std": 0.827529611485079, "rewards/reward_func": -0.04418985247611999, "step": 7840, "toxic_reward": 3.892432355880737 }, { "clip_ratio": 0.0, "completion_length": 51.475, "epoch": 1.8549149338374291, "format_reward": 0.0, "grad_norm": 5.060561656951904, "image_reward": 0.23461151123046875, "kl": 0.5276786342263222, "learning_rate": 5e-06, "loss": -0.003, "reward": 0.7852385342121124, "reward_std": 0.9399228170514107, "rewards/reward_func": 0.7852385342121124, "step": 7850, "toxic_reward": 3.641209203004837 }, { "clip_ratio": 0.0, "completion_length": 46.025, "epoch": 1.8572778827977316, "format_reward": 0.0, "grad_norm": 2.505263566970825, "image_reward": 0.259368896484375, "kl": 1.0133032470941543, "learning_rate": 5e-06, "loss": 0.0581, "reward": 0.8989585757255554, "reward_std": 0.917613423243165, "rewards/reward_func": 0.8989585757255554, "step": 7860, "toxic_reward": 4.13404905796051 }, { "clip_ratio": 0.0, "completion_length": 43.775, "epoch": 1.8596408317580342, "format_reward": 0.0, "grad_norm": 37.166500091552734, "image_reward": 0.25547637939453127, "kl": 2.1411855638027193, "learning_rate": 5e-06, "loss": 0.1396, "reward": 0.21031073927879335, "reward_std": 0.6978237416595221, "rewards/reward_func": 0.21031073927879335, "step": 7870, "toxic_reward": 4.086808681488037 }, { "clip_ratio": 0.0, "completion_length": 46.05, "epoch": 1.8620037807183365, "format_reward": 0.0, "grad_norm": 1.7183008193969727, "image_reward": 0.2409576416015625, "kl": 0.6873624622821808, "learning_rate": 5e-06, "loss": 0.033, "reward": 0.7567365884780883, "reward_std": 0.95932078063488, "rewards/reward_func": 0.7567365884780883, "step": 7880, "toxic_reward": 4.077835154533386 }, { "clip_ratio": 0.0, "completion_length": 47.95, "epoch": 1.8643667296786388, "format_reward": 0.0, "grad_norm": 1.375571846961975, "image_reward": 0.251434326171875, "kl": 1.8803806602954865, "learning_rate": 5e-06, "loss": -0.0674, "reward": 0.1468454658985138, "reward_std": 0.7339655995368958, "rewards/reward_func": 0.1468454658985138, "step": 7890, "toxic_reward": 4.2400289416313175 }, { "clip_ratio": 0.0, "completion_length": 38.625, "epoch": 1.8667296786389413, "format_reward": 0.0, "grad_norm": 2.778831720352173, "image_reward": 0.273419189453125, "kl": 12.759307652711868, "learning_rate": 5e-06, "loss": 0.091, "reward": 0.2764736473560333, "reward_std": 0.6703889116644859, "rewards/reward_func": 0.2764736473560333, "step": 7900, "toxic_reward": 4.633634448051453 }, { "clip_ratio": 0.0, "completion_length": 53.525, "epoch": 1.8690926275992439, "format_reward": 0.0, "grad_norm": 14.088724136352539, "image_reward": 0.23843803405761718, "kl": 2.752323019504547, "learning_rate": 5e-06, "loss": -0.0523, "reward": 0.44507230520248414, "reward_std": 0.8451843298971653, "rewards/reward_func": 0.44507230520248414, "step": 7910, "toxic_reward": 3.7819975137710573 }, { "clip_ratio": 0.0, "completion_length": 46.275, "epoch": 1.8714555765595464, "format_reward": 0.0, "grad_norm": 12.696130752563477, "image_reward": 0.25413665771484373, "kl": 2.022602713108063, "learning_rate": 5e-06, "loss": 0.0137, "reward": 0.6168730854988098, "reward_std": 1.198334063589573, "rewards/reward_func": 0.6168730854988098, "step": 7920, "toxic_reward": 3.9178677558898927 }, { "clip_ratio": 0.0, "completion_length": 51.75, "epoch": 1.8738185255198487, "format_reward": 0.0, "grad_norm": 6.441836357116699, "image_reward": 0.23479461669921875, "kl": 2.0815513670444488, "learning_rate": 5e-06, "loss": 0.099, "reward": 0.49921011328697207, "reward_std": 0.8878588248044252, "rewards/reward_func": 0.49921011328697207, "step": 7930, "toxic_reward": 4.21897873878479 }, { "clip_ratio": 0.0, "completion_length": 44.575, "epoch": 1.8761814744801513, "format_reward": -0.25, "grad_norm": 17.865110397338867, "image_reward": 0.2600982666015625, "kl": 2.150428944826126, "learning_rate": 5e-06, "loss": -0.0837, "reward": 0.48849809169769287, "reward_std": 1.4677658422850073, "rewards/reward_func": 0.48849809169769287, "step": 7940, "toxic_reward": 4.1003117799758915 }, { "clip_ratio": 0.0, "completion_length": 49.925, "epoch": 1.8785444234404536, "format_reward": 0.0, "grad_norm": 6.697957992553711, "image_reward": 0.25945892333984377, "kl": 2.223458543419838, "learning_rate": 5e-06, "loss": 0.0562, "reward": 0.20426468104124068, "reward_std": 0.5012361383065581, "rewards/reward_func": 0.20426468104124068, "step": 7950, "toxic_reward": 4.173866260051727 }, { "clip_ratio": 0.0, "completion_length": 54.075, "epoch": 1.8809073724007561, "format_reward": 0.0, "grad_norm": 5.58077335357666, "image_reward": 0.25069351196289064, "kl": 8.21664493083954, "learning_rate": 5e-06, "loss": -0.0792, "reward": 0.7380830064415932, "reward_std": 1.2196707382798195, "rewards/reward_func": 0.7380830064415932, "step": 7960, "toxic_reward": 3.5894944429397584 }, { "clip_ratio": 0.0, "completion_length": 43.475, "epoch": 1.8832703213610587, "format_reward": 0.0, "grad_norm": 8.384510040283203, "image_reward": 0.23923797607421876, "kl": 21.914335840940474, "learning_rate": 5e-06, "loss": -0.0204, "reward": 0.8768561869859696, "reward_std": 0.7653445459902286, "rewards/reward_func": 0.8768561869859696, "step": 7970, "toxic_reward": 3.7090541243553163 }, { "clip_ratio": 0.0, "completion_length": 54.075, "epoch": 1.8856332703213612, "format_reward": -0.25, "grad_norm": 3.9271442890167236, "image_reward": 0.2266026809811592, "kl": 1.7329542875289916, "learning_rate": 5e-06, "loss": 0.0735, "reward": 0.006925755739212036, "reward_std": 1.2168598100543022, "rewards/reward_func": 0.006925755739212036, "step": 7980, "toxic_reward": 4.598031067848206 }, { "clip_ratio": 0.0, "completion_length": 46.375, "epoch": 1.8879962192816635, "format_reward": 0.0, "grad_norm": 17.941791534423828, "image_reward": 0.24617818146944045, "kl": 3.56347342133522, "learning_rate": 5e-06, "loss": -0.1121, "reward": 0.15787817239761354, "reward_std": 0.5696724381297826, "rewards/reward_func": 0.15787817239761354, "step": 7990, "toxic_reward": 4.376770114898681 }, { "clip_ratio": 0.0, "completion_length": 40.675, "epoch": 1.8903591682419658, "format_reward": 0.0, "grad_norm": 26.69174575805664, "image_reward": 0.2819636031985283, "kl": 7.143774968385697, "learning_rate": 5e-06, "loss": -0.0619, "reward": 0.6548231065273284, "reward_std": 0.8737724728882312, "rewards/reward_func": 0.6548231065273284, "step": 8000, "toxic_reward": 4.582368350028991 }, { "clip_ratio": 0.0, "completion_length": 51.625, "epoch": 1.8927221172022684, "format_reward": 0.0, "grad_norm": 7.003530025482178, "image_reward": 0.261529541015625, "kl": 9.618525552749634, "learning_rate": 5e-06, "loss": 0.051, "reward": -0.017962449789047243, "reward_std": 0.5481395080685616, "rewards/reward_func": -0.017962449789047243, "step": 8010, "toxic_reward": 4.220958662033081 }, { "clip_ratio": 0.0, "completion_length": 42.3, "epoch": 1.8950850661625709, "format_reward": 0.0, "grad_norm": 7.718620777130127, "image_reward": 0.2414947509765625, "kl": 4.341373115777969, "learning_rate": 5e-06, "loss": 0.0835, "reward": 0.38290356993675234, "reward_std": 0.9348091699182988, "rewards/reward_func": 0.38290356993675234, "step": 8020, "toxic_reward": 4.002734637260437 }, { "clip_ratio": 0.0, "completion_length": 46.5, "epoch": 1.8974480151228734, "format_reward": 0.0, "grad_norm": 10.61853313446045, "image_reward": 0.2405120849609375, "kl": 18.692966318130495, "learning_rate": 5e-06, "loss": 0.019, "reward": 0.6024147510528565, "reward_std": 0.8250786025077105, "rewards/reward_func": 0.6024147510528565, "step": 8030, "toxic_reward": 4.069397926330566 }, { "clip_ratio": 0.0, "completion_length": 53.95, "epoch": 1.899810964083176, "format_reward": -0.5, "grad_norm": 4.87439489364624, "image_reward": 0.26594645231962205, "kl": 4.881083369255066, "learning_rate": 5e-06, "loss": 0.0141, "reward": 0.1986172914505005, "reward_std": 1.8204052031040192, "rewards/reward_func": 0.1986172914505005, "step": 8040, "toxic_reward": 4.146627187728882 }, { "clip_ratio": 0.0, "completion_length": 46.075, "epoch": 1.9021739130434783, "format_reward": 0.0, "grad_norm": 19.0607852935791, "image_reward": 0.2584747314453125, "kl": 13.449040079116822, "learning_rate": 5e-06, "loss": -0.0336, "reward": 0.09852480292320251, "reward_std": 0.37513242168352007, "rewards/reward_func": 0.09852480292320251, "step": 8050, "toxic_reward": 4.5937717914581295 }, { "clip_ratio": 0.0, "completion_length": 45.125, "epoch": 1.9045368620037806, "format_reward": 0.0, "grad_norm": 4.807636260986328, "image_reward": 0.238970947265625, "kl": 9.84277012348175, "learning_rate": 5e-06, "loss": 0.0099, "reward": 0.7841103792190551, "reward_std": 0.931809046678245, "rewards/reward_func": 0.7841103792190551, "step": 8060, "toxic_reward": 4.410308980941773 }, { "clip_ratio": 0.0, "completion_length": 47.575, "epoch": 1.9068998109640831, "format_reward": 0.0, "grad_norm": 30.570436477661133, "image_reward": 0.2677642822265625, "kl": 11.538963747024535, "learning_rate": 5e-06, "loss": 0.0571, "reward": 0.7513397336006165, "reward_std": 0.6926180317997932, "rewards/reward_func": 0.7513397336006165, "step": 8070, "toxic_reward": 4.325729882717132 }, { "clip_ratio": 0.0, "completion_length": 49.925, "epoch": 1.9092627599243857, "format_reward": 0.0, "grad_norm": 3.387159824371338, "image_reward": 0.2416534423828125, "kl": 36.42685050964356, "learning_rate": 5e-06, "loss": -0.0984, "reward": 1.0627863883972168, "reward_std": 0.9809991672635079, "rewards/reward_func": 1.0627863883972168, "step": 8080, "toxic_reward": 4.355536723136902 }, { "clip_ratio": 0.0, "completion_length": 51.875, "epoch": 1.9116257088846882, "format_reward": 0.0, "grad_norm": 8.823395729064941, "image_reward": 0.2377349853515625, "kl": 14.548263192176819, "learning_rate": 5e-06, "loss": -0.0671, "reward": 0.2453417807817459, "reward_std": 0.8620891466736793, "rewards/reward_func": 0.2453417807817459, "step": 8090, "toxic_reward": 4.175139570236206 }, { "clip_ratio": 0.0, "completion_length": 53.7, "epoch": 1.9139886578449905, "format_reward": -0.25, "grad_norm": 3.934446334838867, "image_reward": 0.2485321044921875, "kl": 476.8398398399353, "learning_rate": 5e-06, "loss": 0.1697, "reward": -0.03726454377174378, "reward_std": 1.2227270498871803, "rewards/reward_func": -0.03726454377174378, "step": 8100, "toxic_reward": 4.573867344856263 }, { "clip_ratio": 0.0, "completion_length": 47.8, "epoch": 1.916351606805293, "format_reward": 0.0, "grad_norm": 11.159741401672363, "image_reward": 0.2699676513671875, "kl": 5.77522222995758, "learning_rate": 5e-06, "loss": -0.1138, "reward": 0.4612575590610504, "reward_std": 0.5476422467269003, "rewards/reward_func": 0.4612575590610504, "step": 8110, "toxic_reward": 4.567614626884461 }, { "clip_ratio": 0.0, "completion_length": 48.575, "epoch": 1.9187145557655954, "format_reward": 0.0, "grad_norm": 11.536759376525879, "image_reward": 0.24012298583984376, "kl": 6.318757677078247, "learning_rate": 5e-06, "loss": 0.0103, "reward": 0.9109591245651245, "reward_std": 1.29407604560256, "rewards/reward_func": 0.9109591245651245, "step": 8120, "toxic_reward": 4.05263090133667 }, { "clip_ratio": 0.0, "completion_length": 58.575, "epoch": 1.9210775047258979, "format_reward": 0.0, "grad_norm": 6.833136558532715, "image_reward": 0.2688323974609375, "kl": 6.9808355331420895, "learning_rate": 5e-06, "loss": 0.0913, "reward": 0.9232870817184449, "reward_std": 0.8357461627572775, "rewards/reward_func": 0.9232870817184449, "step": 8130, "toxic_reward": 4.430827951431274 }, { "clip_ratio": 0.0, "completion_length": 50.9, "epoch": 1.9234404536862004, "format_reward": 0.0, "grad_norm": 14.8239164352417, "image_reward": 0.2474365234375, "kl": 137.28185538053512, "learning_rate": 5e-06, "loss": 0.0189, "reward": 0.401202654838562, "reward_std": 0.4000473257154226, "rewards/reward_func": 0.401202654838562, "step": 8140, "toxic_reward": 4.723700523376465 }, { "clip_ratio": 0.0, "completion_length": 45.175, "epoch": 1.925803402646503, "format_reward": 0.0, "grad_norm": 1.823515772819519, "image_reward": 0.22316131591796876, "kl": 14.130688643455505, "learning_rate": 5e-06, "loss": 0.0239, "reward": 1.057025855779648, "reward_std": 0.9014536026865244, "rewards/reward_func": 1.057025855779648, "step": 8150, "toxic_reward": 4.387946319580078 }, { "clip_ratio": 0.0, "completion_length": 54.875, "epoch": 1.9281663516068053, "format_reward": -0.25, "grad_norm": 46.756038665771484, "image_reward": 0.26631062775850295, "kl": 6.435283923149109, "learning_rate": 5e-06, "loss": -0.0328, "reward": 0.22599496245384215, "reward_std": 1.4984263110905887, "rewards/reward_func": 0.22599496245384215, "step": 8160, "toxic_reward": 4.138309001922607 }, { "clip_ratio": 0.0, "completion_length": 53.45, "epoch": 1.9305293005671076, "format_reward": 0.0, "grad_norm": 34.66867446899414, "image_reward": 0.25106658935546877, "kl": 1020.1139773368835, "learning_rate": 5e-06, "loss": 0.0999, "reward": 0.7446302771568298, "reward_std": 0.906285472586751, "rewards/reward_func": 0.7446302771568298, "step": 8170, "toxic_reward": 4.375624704360962 }, { "clip_ratio": 0.0, "completion_length": 49.475, "epoch": 1.9328922495274101, "format_reward": 0.0, "grad_norm": 2.081218957901001, "image_reward": 0.242840576171875, "kl": 3.102721667289734, "learning_rate": 5e-06, "loss": 0.0571, "reward": 0.5706271648406982, "reward_std": 0.9108416954986751, "rewards/reward_func": 0.5706271648406982, "step": 8180, "toxic_reward": 3.2474088430404664 }, { "clip_ratio": 0.0, "completion_length": 40.475, "epoch": 1.9352551984877127, "format_reward": -0.25, "grad_norm": 13.313660621643066, "image_reward": 0.27943929135799406, "kl": 9.811500716209412, "learning_rate": 5e-06, "loss": 0.0091, "reward": -0.0842776358127594, "reward_std": 1.1166115825995804, "rewards/reward_func": -0.0842776358127594, "step": 8190, "toxic_reward": 4.493260765075684 }, { "clip_ratio": 0.0, "completion_length": 49.225, "epoch": 1.9376181474480152, "format_reward": 0.0, "grad_norm": 11.93384838104248, "image_reward": 0.2549346923828125, "kl": 13.695673048496246, "learning_rate": 5e-06, "loss": -0.0083, "reward": 0.5832914412021637, "reward_std": 0.7408401468303054, "rewards/reward_func": 0.5832914412021637, "step": 8200, "toxic_reward": 4.143073153495789 }, { "clip_ratio": 0.0, "completion_length": 44.125, "epoch": 1.9399810964083177, "format_reward": 0.0, "grad_norm": 6.53907585144043, "image_reward": 0.2403350830078125, "kl": 6.522427618503571, "learning_rate": 5e-06, "loss": 0.1325, "reward": 0.1342033863067627, "reward_std": 0.7933921405114234, "rewards/reward_func": 0.1342033863067627, "step": 8210, "toxic_reward": 4.601714444160462 }, { "clip_ratio": 0.0, "completion_length": 44.95, "epoch": 1.94234404536862, "format_reward": 0.0, "grad_norm": 23.774093627929688, "image_reward": 0.25664520263671875, "kl": 5.8061746001243595, "learning_rate": 5e-06, "loss": -0.1029, "reward": 0.6099749207496643, "reward_std": 1.0578389540314674, "rewards/reward_func": 0.6099749207496643, "step": 8220, "toxic_reward": 3.542074370384216 }, { "clip_ratio": 0.0, "completion_length": 44.525, "epoch": 1.9447069943289224, "format_reward": 0.0, "grad_norm": 19.021333694458008, "image_reward": 0.25049285888671874, "kl": 4.400176310539246, "learning_rate": 5e-06, "loss": -0.0805, "reward": 0.271647572517395, "reward_std": 0.8572761943563819, "rewards/reward_func": 0.271647572517395, "step": 8230, "toxic_reward": 4.576322746276856 }, { "clip_ratio": 0.0, "completion_length": 48.4, "epoch": 1.947069943289225, "format_reward": -0.25, "grad_norm": 12.740744590759277, "image_reward": 0.2639495849609375, "kl": 53.72892454862595, "learning_rate": 5e-06, "loss": 0.0517, "reward": 0.4684752345085144, "reward_std": 1.5598361855372787, "rewards/reward_func": 0.4684752345085144, "step": 8240, "toxic_reward": 4.280627131462097 }, { "clip_ratio": 0.0, "completion_length": 42.05, "epoch": 1.9494328922495274, "format_reward": 0.0, "grad_norm": 8.727499961853027, "image_reward": 0.273443603515625, "kl": 9.401781392097472, "learning_rate": 5e-06, "loss": -0.0578, "reward": 0.6043965280056, "reward_std": 0.7762668525800109, "rewards/reward_func": 0.6043965280056, "step": 8250, "toxic_reward": 4.007175719738006 }, { "clip_ratio": 0.0, "completion_length": 39.175, "epoch": 1.95179584120983, "format_reward": -0.25, "grad_norm": 21.95665740966797, "image_reward": 0.2783833831548691, "kl": 6.502747631072998, "learning_rate": 5e-06, "loss": -0.1757, "reward": 0.7646288216114044, "reward_std": 1.2125793328508734, "rewards/reward_func": 0.7646288216114044, "step": 8260, "toxic_reward": 4.438870096206665 }, { "clip_ratio": 0.0, "completion_length": 47.375, "epoch": 1.9541587901701323, "format_reward": 0.0, "grad_norm": 19.78591537475586, "image_reward": 0.2660888671875, "kl": 101.96959731578826, "learning_rate": 5e-06, "loss": 0.0584, "reward": 0.8457072794437408, "reward_std": 0.8602423138916493, "rewards/reward_func": 0.8457072794437408, "step": 8270, "toxic_reward": 4.274328458309173 }, { "clip_ratio": 0.0, "completion_length": 49.325, "epoch": 1.9565217391304348, "format_reward": -0.25, "grad_norm": 7.575157642364502, "image_reward": 0.26166178435087206, "kl": 7.8605184674263, "learning_rate": 5e-06, "loss": -0.0592, "reward": 0.7780414521694183, "reward_std": 1.34521058909595, "rewards/reward_func": 0.7780414521694183, "step": 8280, "toxic_reward": 4.395621502399445 }, { "clip_ratio": 0.0, "completion_length": 45.0, "epoch": 1.9588846880907371, "format_reward": 0.0, "grad_norm": 13.91838550567627, "image_reward": 0.2494293212890625, "kl": 3.4681380152702332, "learning_rate": 5e-06, "loss": -0.0285, "reward": 1.0126874148845673, "reward_std": 0.884580178745091, "rewards/reward_func": 1.0126874148845673, "step": 8290, "toxic_reward": 4.259213161468506 }, { "clip_ratio": 0.0, "completion_length": 39.275, "epoch": 1.9612476370510397, "format_reward": -0.5, "grad_norm": 11.346104621887207, "image_reward": 0.24504598081111909, "kl": 17.73236060142517, "learning_rate": 5e-06, "loss": -0.0501, "reward": -0.41139370799064634, "reward_std": 1.535068777576089, "rewards/reward_func": -0.41139370799064634, "step": 8300, "toxic_reward": 4.184125363826752 }, { "clip_ratio": 0.0, "completion_length": 47.1, "epoch": 1.9636105860113422, "format_reward": 0.0, "grad_norm": 7.8980631828308105, "image_reward": 0.2494049072265625, "kl": 1.3982277452945708, "learning_rate": 5e-06, "loss": -0.0632, "reward": 0.7493218898773193, "reward_std": 0.7001253291964531, "rewards/reward_func": 0.7493218898773193, "step": 8310, "toxic_reward": 4.593434143066406 }, { "clip_ratio": 0.0, "completion_length": 46.025, "epoch": 1.9659735349716447, "format_reward": 0.0, "grad_norm": 1.629384994506836, "image_reward": 0.2574554443359375, "kl": 9.406988048553467, "learning_rate": 5e-06, "loss": 0.0561, "reward": 0.6752925157546997, "reward_std": 1.2529858350753784, "rewards/reward_func": 0.6752925157546997, "step": 8320, "toxic_reward": 3.6643527030944822 }, { "clip_ratio": 0.0, "completion_length": 42.725, "epoch": 1.968336483931947, "format_reward": 0.0, "grad_norm": 6.693783283233643, "image_reward": 0.24935455322265626, "kl": 4.6708708822727205, "learning_rate": 5e-06, "loss": -0.0369, "reward": 1.2317909479141236, "reward_std": 1.4201693460345268, "rewards/reward_func": 1.2317909479141236, "step": 8330, "toxic_reward": 3.705500102043152 }, { "clip_ratio": 0.0, "completion_length": 43.675, "epoch": 1.9706994328922496, "format_reward": 0.0, "grad_norm": 13.678855895996094, "image_reward": 0.26453857421875, "kl": 1.7596666514873505, "learning_rate": 5e-06, "loss": 0.0084, "reward": 0.6438661813735962, "reward_std": 0.5453263748437166, "rewards/reward_func": 0.6438661813735962, "step": 8340, "toxic_reward": 4.577846193313599 }, { "clip_ratio": 0.0, "completion_length": 44.55, "epoch": 1.973062381852552, "format_reward": 0.0, "grad_norm": 5.530174255371094, "image_reward": 0.2434234619140625, "kl": 16.00339319705963, "learning_rate": 5e-06, "loss": 0.0858, "reward": 0.7399854481220245, "reward_std": 0.5954274158924818, "rewards/reward_func": 0.7399854481220245, "step": 8350, "toxic_reward": 4.567293620109558 }, { "clip_ratio": 0.0, "completion_length": 44.8, "epoch": 1.9754253308128544, "format_reward": -0.25, "grad_norm": 23.65260124206543, "image_reward": 0.2397003173828125, "kl": 413.27391294240954, "learning_rate": 5e-06, "loss": -0.0452, "reward": 0.21110110878944396, "reward_std": 1.2717279449105263, "rewards/reward_func": 0.21110110878944396, "step": 8360, "toxic_reward": 4.29474036693573 }, { "clip_ratio": 0.0, "completion_length": 58.15, "epoch": 1.977788279773157, "format_reward": 0.0, "grad_norm": 8.489328384399414, "image_reward": 0.2703460693359375, "kl": 11.292006134986877, "learning_rate": 5e-06, "loss": -0.0396, "reward": 0.522923594713211, "reward_std": 0.6722989223897458, "rewards/reward_func": 0.522923594713211, "step": 8370, "toxic_reward": 4.362267994880677 }, { "clip_ratio": 0.0, "completion_length": 53.7, "epoch": 1.9801512287334595, "format_reward": 0.0, "grad_norm": 14.112800598144531, "image_reward": 0.2404998779296875, "kl": 5.943003642559051, "learning_rate": 5e-06, "loss": 0.0066, "reward": 1.046756339073181, "reward_std": 1.401267148554325, "rewards/reward_func": 1.046756339073181, "step": 8380, "toxic_reward": 4.379712152481079 }, { "clip_ratio": 0.0, "completion_length": 45.075, "epoch": 1.9825141776937618, "format_reward": 0.0, "grad_norm": 15.351452827453613, "image_reward": 0.2679229736328125, "kl": 2.1231451511383055, "learning_rate": 5e-06, "loss": -0.0874, "reward": 0.044296592473983765, "reward_std": 0.7907688375562429, "rewards/reward_func": 0.044296592473983765, "step": 8390, "toxic_reward": 4.44194188117981 }, { "clip_ratio": 0.0, "completion_length": 52.8, "epoch": 1.9848771266540641, "format_reward": 0.0, "grad_norm": 14.493269920349121, "image_reward": 0.23359222412109376, "kl": 15.598973235487938, "learning_rate": 5e-06, "loss": 0.0523, "reward": 0.6035852313041687, "reward_std": 0.7898097388446331, "rewards/reward_func": 0.6035852313041687, "step": 8400, "toxic_reward": 4.0595218420028685 }, { "clip_ratio": 0.0, "completion_length": 49.025, "epoch": 1.9872400756143667, "format_reward": 0.0, "grad_norm": 2.004755735397339, "image_reward": 0.23458099365234375, "kl": 13.407473123073578, "learning_rate": 5e-06, "loss": 0.0054, "reward": 0.5494411200284958, "reward_std": 0.5586541540920734, "rewards/reward_func": 0.5494411200284958, "step": 8410, "toxic_reward": 4.175926774740219 }, { "clip_ratio": 0.0, "completion_length": 50.325, "epoch": 1.9896030245746692, "format_reward": 0.0, "grad_norm": 9.598527908325195, "image_reward": 0.301202392578125, "kl": 4.9204403221607205, "learning_rate": 5e-06, "loss": -0.001, "reward": 0.4649462789297104, "reward_std": 0.7171205889433623, "rewards/reward_func": 0.4649462789297104, "step": 8420, "toxic_reward": 3.910860872268677 }, { "clip_ratio": 0.0, "completion_length": 57.825, "epoch": 1.9919659735349717, "format_reward": 0.0, "grad_norm": 2.607607841491699, "image_reward": 0.2745025634765625, "kl": 9.545298218727112, "learning_rate": 5e-06, "loss": 0.0779, "reward": 0.43806184232234957, "reward_std": 0.8561135273426771, "rewards/reward_func": 0.43806184232234957, "step": 8430, "toxic_reward": 4.119659066200256 }, { "clip_ratio": 0.0, "completion_length": 42.8, "epoch": 1.994328922495274, "format_reward": -0.25, "grad_norm": 1.9090756177902222, "image_reward": 0.264605712890625, "kl": 1.142916288971901, "learning_rate": 5e-06, "loss": -0.0035, "reward": -0.06725225448608399, "reward_std": 1.1679431475698947, "rewards/reward_func": -0.06725225448608399, "step": 8440, "toxic_reward": 4.506508493423462 }, { "clip_ratio": 0.0, "completion_length": 45.625, "epoch": 1.9966918714555766, "format_reward": -0.25, "grad_norm": 1.924688458442688, "image_reward": 0.247418212890625, "kl": 1.739441803097725, "learning_rate": 5e-06, "loss": -0.0847, "reward": 0.2795759916305542, "reward_std": 1.532812624052167, "rewards/reward_func": 0.2795759916305542, "step": 8450, "toxic_reward": 3.7154327273368835 }, { "clip_ratio": 0.0, "completion_length": 50.075, "epoch": 1.999054820415879, "format_reward": 0.0, "grad_norm": 3.183807373046875, "image_reward": 0.259228515625, "kl": 1.071340024471283, "learning_rate": 5e-06, "loss": 0.0299, "reward": 1.3993828475475312, "reward_std": 1.1979968290776015, "rewards/reward_func": 1.3993828475475312, "step": 8460, "toxic_reward": 4.236328482627869 }, { "clip_ratio": 0.0, "completion_length": 47.325, "epoch": 2.0014177693761814, "format_reward": 0.0, "grad_norm": 7.500320911407471, "image_reward": 0.2599090576171875, "kl": 1.2782041728496552, "learning_rate": 5e-06, "loss": 0.046, "reward": 1.2368434906005858, "reward_std": 1.188733378984034, "rewards/reward_func": 1.2368434906005858, "step": 8470, "toxic_reward": 3.8694416284561157 }, { "clip_ratio": 0.0, "completion_length": 42.575, "epoch": 2.003780718336484, "format_reward": 0.0, "grad_norm": 3.4954817295074463, "image_reward": 0.25406494140625, "kl": 2.6761809453368186, "learning_rate": 5e-06, "loss": -0.0561, "reward": 0.3607616722583771, "reward_std": 0.599818766117096, "rewards/reward_func": 0.3607616722583771, "step": 8480, "toxic_reward": 4.048572421073914 }, { "clip_ratio": 0.0, "completion_length": 54.05, "epoch": 2.0061436672967865, "format_reward": 0.0, "grad_norm": 5.18286657333374, "image_reward": 0.22822036743164062, "kl": 2.461097413301468, "learning_rate": 5e-06, "loss": -0.0261, "reward": 0.2195432722568512, "reward_std": 0.7936036609113216, "rewards/reward_func": 0.2195432722568512, "step": 8490, "toxic_reward": 4.110178589820862 }, { "clip_ratio": 0.0, "completion_length": 41.9, "epoch": 2.008506616257089, "format_reward": -0.75, "grad_norm": 2.6953821182250977, "image_reward": 0.238427734375, "kl": 1.0251432090997696, "learning_rate": 5e-06, "loss": -0.0027, "reward": -0.4569409370422363, "reward_std": 1.0821652268990873, "rewards/reward_func": -0.4569409370422363, "step": 8500, "toxic_reward": 4.185848736763001 }, { "clip_ratio": 0.0, "completion_length": 52.4, "epoch": 2.010869565217391, "format_reward": 0.0, "grad_norm": 6.174482822418213, "image_reward": 0.245025634765625, "kl": 570.9768789380789, "learning_rate": 5e-06, "loss": 0.0155, "reward": 0.634968101978302, "reward_std": 0.5698891028761863, "rewards/reward_func": 0.634968101978302, "step": 8510, "toxic_reward": 4.557809638977051 }, { "clip_ratio": 0.0, "completion_length": 37.525, "epoch": 2.0132325141776937, "format_reward": 0.0, "grad_norm": 12.716261863708496, "image_reward": 0.2716217041015625, "kl": 1.0744814962148665, "learning_rate": 5e-06, "loss": 0.0371, "reward": 0.8971363306045532, "reward_std": 1.0540940549224616, "rewards/reward_func": 0.8971363306045532, "step": 8520, "toxic_reward": 4.03425624370575 }, { "clip_ratio": 0.0, "completion_length": 47.725, "epoch": 2.015595463137996, "format_reward": 0.0, "grad_norm": 11.573805809020996, "image_reward": 0.25701904296875, "kl": 1.1612621247768402, "learning_rate": 5e-06, "loss": -0.0256, "reward": 0.4974235534667969, "reward_std": 0.7099893309175969, "rewards/reward_func": 0.4974235534667969, "step": 8530, "toxic_reward": 4.757012367248535 }, { "clip_ratio": 0.0, "completion_length": 50.325, "epoch": 2.0179584120982987, "format_reward": 0.0, "grad_norm": 2.5829172134399414, "image_reward": 0.23183441162109375, "kl": 1.0122918039560318, "learning_rate": 5e-06, "loss": -0.0224, "reward": 0.5489160656929016, "reward_std": 0.4481811560690403, "rewards/reward_func": 0.5489160656929016, "step": 8540, "toxic_reward": 4.330148541927338 }, { "clip_ratio": 0.0, "completion_length": 45.6, "epoch": 2.0203213610586013, "format_reward": -0.25, "grad_norm": 8.287252426147461, "image_reward": 0.2674835205078125, "kl": 1.138858178257942, "learning_rate": 5e-06, "loss": 0.0238, "reward": -0.09747375845909119, "reward_std": 0.8301142632961274, "rewards/reward_func": -0.09747375845909119, "step": 8550, "toxic_reward": 4.7045900344848635 }, { "clip_ratio": 0.0, "completion_length": 49.65, "epoch": 2.022684310018904, "format_reward": 0.0, "grad_norm": 9.928176879882812, "image_reward": 0.2459930419921875, "kl": 1.701068675518036, "learning_rate": 5e-06, "loss": -0.0446, "reward": 0.5473175823688508, "reward_std": 0.7223521884530782, "rewards/reward_func": 0.5473175823688508, "step": 8560, "toxic_reward": 4.571657824516296 }, { "clip_ratio": 0.0, "completion_length": 41.725, "epoch": 2.025047258979206, "format_reward": -0.25, "grad_norm": 5.9600677490234375, "image_reward": 0.26257222443819045, "kl": 2.5904053121805193, "learning_rate": 5e-06, "loss": 0.0163, "reward": -0.20251348614692688, "reward_std": 0.9303808398544788, "rewards/reward_func": -0.20251348614692688, "step": 8570, "toxic_reward": 4.581225419044495 }, { "clip_ratio": 0.0, "completion_length": 45.55, "epoch": 2.0274102079395084, "format_reward": 0.0, "grad_norm": 3.309791088104248, "image_reward": 0.23095855712890626, "kl": 1.5135916233062745, "learning_rate": 5e-06, "loss": -0.0068, "reward": 0.21151033639907837, "reward_std": 0.7603108703624457, "rewards/reward_func": 0.21151033639907837, "step": 8580, "toxic_reward": 4.328943312168121 }, { "clip_ratio": 0.0, "completion_length": 47.5, "epoch": 2.029773156899811, "format_reward": 0.0, "grad_norm": 8.408251762390137, "image_reward": 0.2651763916015625, "kl": 0.6560351371765136, "learning_rate": 5e-06, "loss": -0.0997, "reward": 0.012960964441299438, "reward_std": 0.35295800119638443, "rewards/reward_func": 0.012960964441299438, "step": 8590, "toxic_reward": 4.5852957487106325 }, { "clip_ratio": 0.0, "completion_length": 45.425, "epoch": 2.0321361058601135, "format_reward": 0.0, "grad_norm": 10.234503746032715, "image_reward": 0.242572021484375, "kl": 1.6390519708395004, "learning_rate": 5e-06, "loss": -0.0652, "reward": 0.49767774939537046, "reward_std": 0.8766103692352771, "rewards/reward_func": 0.49767774939537046, "step": 8600, "toxic_reward": 4.273276591300965 }, { "clip_ratio": 0.0, "completion_length": 55.525, "epoch": 2.034499054820416, "format_reward": 0.0, "grad_norm": 31.749767303466797, "image_reward": 0.24173736572265625, "kl": 1.4096274197101593, "learning_rate": 5e-06, "loss": -0.0081, "reward": 0.4087996184825897, "reward_std": 0.9022964790463448, "rewards/reward_func": 0.4087996184825897, "step": 8610, "toxic_reward": 4.324217915534973 }, { "clip_ratio": 0.0, "completion_length": 54.825, "epoch": 2.036862003780718, "format_reward": -0.25, "grad_norm": 12.874961853027344, "image_reward": 0.2652323395013809, "kl": 1.0484755635261536, "learning_rate": 5e-06, "loss": 0.027, "reward": -0.2117618590593338, "reward_std": 1.249949687719345, "rewards/reward_func": -0.2117618590593338, "step": 8620, "toxic_reward": 3.5184057116508485 }, { "clip_ratio": 0.0, "completion_length": 46.375, "epoch": 2.0392249527410207, "format_reward": 0.0, "grad_norm": 1.5234144926071167, "image_reward": 0.2357269287109375, "kl": 1.2848842471837998, "learning_rate": 5e-06, "loss": 0.0752, "reward": 0.716532975435257, "reward_std": 0.89201683960855, "rewards/reward_func": 0.716532975435257, "step": 8630, "toxic_reward": 4.4441750049591064 }, { "clip_ratio": 0.0, "completion_length": 56.125, "epoch": 2.041587901701323, "format_reward": 0.0, "grad_norm": 1.1807531118392944, "image_reward": 0.25293731689453125, "kl": 2.8808428183197976, "learning_rate": 5e-06, "loss": 0.023, "reward": 0.9091297924518585, "reward_std": 0.7464996237307787, "rewards/reward_func": 0.9091297924518585, "step": 8640, "toxic_reward": 4.224450874328613 }, { "clip_ratio": 0.0, "completion_length": 40.15, "epoch": 2.0439508506616257, "format_reward": 0.0, "grad_norm": 10.838650703430176, "image_reward": 0.23937225341796875, "kl": 2.5997998148202894, "learning_rate": 5e-06, "loss": -0.0419, "reward": 0.3074700653553009, "reward_std": 0.8474891871213913, "rewards/reward_func": 0.3074700653553009, "step": 8650, "toxic_reward": 4.419002604484558 }, { "clip_ratio": 0.0, "completion_length": 40.3, "epoch": 2.0463137996219283, "format_reward": 0.0, "grad_norm": 2.063800811767578, "image_reward": 0.2351318359375, "kl": 2.429117688536644, "learning_rate": 5e-06, "loss": 0.0507, "reward": 0.464035177230835, "reward_std": 0.8178490117192269, "rewards/reward_func": 0.464035177230835, "step": 8660, "toxic_reward": 4.122921991348266 }, { "clip_ratio": 0.0, "completion_length": 50.725, "epoch": 2.048676748582231, "format_reward": -0.25, "grad_norm": 5.179421424865723, "image_reward": 0.2340398147702217, "kl": 1.4141836494207383, "learning_rate": 5e-06, "loss": -0.0321, "reward": 0.2651833713054657, "reward_std": 1.3055690463632346, "rewards/reward_func": 0.2651833713054657, "step": 8670, "toxic_reward": 4.099702596664429 }, { "clip_ratio": 0.0, "completion_length": 49.0, "epoch": 2.051039697542533, "format_reward": 0.0, "grad_norm": 5.729818344116211, "image_reward": 0.22962646484375, "kl": 1.0476927325129508, "learning_rate": 5e-06, "loss": 0.0396, "reward": 0.5858624681830407, "reward_std": 0.8647454358637333, "rewards/reward_func": 0.5858624681830407, "step": 8680, "toxic_reward": 3.923676002025604 }, { "clip_ratio": 0.0, "completion_length": 43.575, "epoch": 2.0534026465028354, "format_reward": -0.25, "grad_norm": 33.29255294799805, "image_reward": 0.23504893034696578, "kl": 0.5507122159004212, "learning_rate": 5e-06, "loss": 0.0042, "reward": 0.5358553946018219, "reward_std": 1.371009534597397, "rewards/reward_func": 0.5358553946018219, "step": 8690, "toxic_reward": 4.1810872793197635 }, { "clip_ratio": 0.0, "completion_length": 44.425, "epoch": 2.055765595463138, "format_reward": 0.0, "grad_norm": 9.838844299316406, "image_reward": 0.2368377685546875, "kl": 0.8958913296461105, "learning_rate": 5e-06, "loss": -0.0191, "reward": 0.8707101225852967, "reward_std": 1.2157209530472755, "rewards/reward_func": 0.8707101225852967, "step": 8700, "toxic_reward": 3.7892824172973634 }, { "clip_ratio": 0.0, "completion_length": 50.275, "epoch": 2.0581285444234405, "format_reward": 0.0, "grad_norm": 18.979665756225586, "image_reward": 0.259954833984375, "kl": 1.8286799043416977, "learning_rate": 5e-06, "loss": -0.0646, "reward": 0.3829235196113586, "reward_std": 0.9690108880400657, "rewards/reward_func": 0.3829235196113586, "step": 8710, "toxic_reward": 4.264838469028473 }, { "clip_ratio": 0.0, "completion_length": 50.35, "epoch": 2.060491493383743, "format_reward": 0.0, "grad_norm": 6.838248252868652, "image_reward": 0.22672042846679688, "kl": 0.7338828861713409, "learning_rate": 5e-06, "loss": 0.0185, "reward": 0.827715927362442, "reward_std": 0.9109129812568426, "rewards/reward_func": 0.827715927362442, "step": 8720, "toxic_reward": 4.2410869836807255 }, { "clip_ratio": 0.0, "completion_length": 48.725, "epoch": 2.0628544423440456, "format_reward": 0.0, "grad_norm": 4.239810943603516, "image_reward": 0.25128173828125, "kl": 5.776644492149353, "learning_rate": 5e-06, "loss": -0.0179, "reward": 0.5484015077352524, "reward_std": 1.311685237288475, "rewards/reward_func": 0.5484015077352524, "step": 8730, "toxic_reward": 4.056830906867981 }, { "clip_ratio": 0.0, "completion_length": 43.75, "epoch": 2.0652173913043477, "format_reward": 0.0, "grad_norm": 11.002195358276367, "image_reward": 0.229833984375, "kl": 1.1322214603424072, "learning_rate": 5e-06, "loss": 0.1107, "reward": -0.03462121486663818, "reward_std": 0.43875638470053674, "rewards/reward_func": -0.03462121486663818, "step": 8740, "toxic_reward": 4.620968174934387 }, { "clip_ratio": 0.0, "completion_length": 54.975, "epoch": 2.06758034026465, "format_reward": -0.25, "grad_norm": 11.761068344116211, "image_reward": 0.227862548828125, "kl": 7.682415267825126, "learning_rate": 5e-06, "loss": -0.09, "reward": 0.26341341733932494, "reward_std": 1.4870022028684615, "rewards/reward_func": 0.26341341733932494, "step": 8750, "toxic_reward": 4.433785676956177 }, { "clip_ratio": 0.0, "completion_length": 49.425, "epoch": 2.0699432892249527, "format_reward": 0.0, "grad_norm": 5.119037628173828, "image_reward": 0.2395660400390625, "kl": 1.2481903672218322, "learning_rate": 5e-06, "loss": -0.0746, "reward": 0.3276951313018799, "reward_std": 0.46017137840390204, "rewards/reward_func": 0.3276951313018799, "step": 8760, "toxic_reward": 4.593200016021728 }, { "clip_ratio": 0.0, "completion_length": 44.775, "epoch": 2.0723062381852553, "format_reward": 0.0, "grad_norm": 1.704590916633606, "image_reward": 0.2639801025390625, "kl": 2.778309851884842, "learning_rate": 5e-06, "loss": -0.0417, "reward": 0.30720534920692444, "reward_std": 0.6144355796277523, "rewards/reward_func": 0.30720534920692444, "step": 8770, "toxic_reward": 4.481031203269959 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 2.074669187145558, "format_reward": 0.0, "grad_norm": 11.392171859741211, "image_reward": 0.2711700439453125, "kl": 2.4740293115377425, "learning_rate": 5e-06, "loss": 0.0428, "reward": 0.318191659450531, "reward_std": 0.6928373419679701, "rewards/reward_func": 0.318191659450531, "step": 8780, "toxic_reward": 4.502946138381958 }, { "clip_ratio": 0.0, "completion_length": 45.85, "epoch": 2.07703213610586, "format_reward": 0.0, "grad_norm": 11.103386878967285, "image_reward": 0.2586761474609375, "kl": 0.8142087966203689, "learning_rate": 5e-06, "loss": -0.0231, "reward": 0.5202795565128326, "reward_std": 0.9188865400850773, "rewards/reward_func": 0.5202795565128326, "step": 8790, "toxic_reward": 4.268853735923767 }, { "clip_ratio": 0.0, "completion_length": 53.825, "epoch": 2.0793950850661624, "format_reward": -0.5, "grad_norm": 3.831815719604492, "image_reward": 0.23363494873046875, "kl": 3.2880129516124725, "learning_rate": 5e-06, "loss": 0.0507, "reward": -0.17698687314987183, "reward_std": 1.4326126247644424, "rewards/reward_func": -0.17698687314987183, "step": 8800, "toxic_reward": 4.581766486167908 }, { "clip_ratio": 0.0, "completion_length": 42.95, "epoch": 2.081758034026465, "format_reward": -0.5, "grad_norm": 13.328118324279785, "image_reward": 0.283209228515625, "kl": 2.583532452583313, "learning_rate": 5e-06, "loss": 0.1117, "reward": 0.2989026606082916, "reward_std": 2.0480130195617674, "rewards/reward_func": 0.2989026606082916, "step": 8810, "toxic_reward": 3.5884770512580872 }, { "clip_ratio": 0.0, "completion_length": 47.125, "epoch": 2.0841209829867675, "format_reward": 0.0, "grad_norm": 1.780765414237976, "image_reward": 0.242584228515625, "kl": 1.3252637952566146, "learning_rate": 5e-06, "loss": -0.0244, "reward": 0.29446207284927367, "reward_std": 1.0302367629483342, "rewards/reward_func": 0.29446207284927367, "step": 8820, "toxic_reward": 4.3270234823226925 }, { "clip_ratio": 0.0, "completion_length": 50.55, "epoch": 2.08648393194707, "format_reward": 0.0, "grad_norm": 0.9498596787452698, "image_reward": 0.25778045654296877, "kl": 1.5434826999902724, "learning_rate": 5e-06, "loss": -0.1175, "reward": 0.414847657084465, "reward_std": 0.7169051881879568, "rewards/reward_func": 0.414847657084465, "step": 8830, "toxic_reward": 4.253011137247086 }, { "clip_ratio": 0.0, "completion_length": 42.925, "epoch": 2.0888468809073726, "format_reward": 0.0, "grad_norm": 8.519845008850098, "image_reward": 0.23509521484375, "kl": 0.5451234139502048, "learning_rate": 5e-06, "loss": 0.1295, "reward": 0.510816776752472, "reward_std": 0.6249840931501239, "rewards/reward_func": 0.510816776752472, "step": 8840, "toxic_reward": 4.746308994293213 }, { "clip_ratio": 0.0, "completion_length": 46.225, "epoch": 2.0912098298676747, "format_reward": 0.0, "grad_norm": 3.2223246097564697, "image_reward": 0.23397216796875, "kl": 1.196854567527771, "learning_rate": 5e-06, "loss": 0.0121, "reward": 0.430766886472702, "reward_std": 1.1830935038626194, "rewards/reward_func": 0.430766886472702, "step": 8850, "toxic_reward": 4.010055112838745 }, { "clip_ratio": 0.0, "completion_length": 50.1, "epoch": 2.093572778827977, "format_reward": 0.0, "grad_norm": 4.516735553741455, "image_reward": 0.248101806640625, "kl": 1.3512360364198686, "learning_rate": 5e-06, "loss": -0.0064, "reward": 0.4724120795726776, "reward_std": 1.2229724466800689, "rewards/reward_func": 0.4724120795726776, "step": 8860, "toxic_reward": 4.219368410110474 }, { "clip_ratio": 0.0, "completion_length": 51.45, "epoch": 2.0959357277882797, "format_reward": 0.0, "grad_norm": 10.528098106384277, "image_reward": 0.2487030029296875, "kl": 0.8361154735088349, "learning_rate": 5e-06, "loss": 0.0096, "reward": 0.481815043091774, "reward_std": 0.8121814839541912, "rewards/reward_func": 0.481815043091774, "step": 8870, "toxic_reward": 4.385925316810608 }, { "clip_ratio": 0.0, "completion_length": 42.625, "epoch": 2.0982986767485823, "format_reward": 0.0, "grad_norm": 14.012845993041992, "image_reward": 0.26658477783203127, "kl": 5.801792293787003, "learning_rate": 5e-06, "loss": -0.0543, "reward": 0.8176810801029205, "reward_std": 0.9641741991043091, "rewards/reward_func": 0.8176810801029205, "step": 8880, "toxic_reward": 4.4868937015533445 }, { "clip_ratio": 0.0, "completion_length": 58.625, "epoch": 2.100661625708885, "format_reward": 0.0, "grad_norm": 1.7836425304412842, "image_reward": 0.219366455078125, "kl": 2.3325648605823517, "learning_rate": 5e-06, "loss": -0.0747, "reward": 0.07698584794998169, "reward_std": 0.7939232878386975, "rewards/reward_func": 0.07698584794998169, "step": 8890, "toxic_reward": 3.8968687295913695 }, { "clip_ratio": 0.0, "completion_length": 35.1, "epoch": 2.1030245746691874, "format_reward": 0.0, "grad_norm": 4.322965145111084, "image_reward": 0.2381072998046875, "kl": 1.5042289346456528, "learning_rate": 5e-06, "loss": -0.0396, "reward": 0.6448795169591903, "reward_std": 1.071408730885014, "rewards/reward_func": 0.6448795169591903, "step": 8900, "toxic_reward": 4.1689093708992 }, { "clip_ratio": 0.0, "completion_length": 46.6, "epoch": 2.1053875236294894, "format_reward": -0.25, "grad_norm": 3.794384002685547, "image_reward": 0.2495122268795967, "kl": 1.5198310285806655, "learning_rate": 5e-06, "loss": 0.0187, "reward": -0.018249320983886718, "reward_std": 0.9345291556790472, "rewards/reward_func": -0.018249320983886718, "step": 8910, "toxic_reward": 4.52795147895813 }, { "clip_ratio": 0.0, "completion_length": 53.425, "epoch": 2.107750472589792, "format_reward": 0.0, "grad_norm": 5.023624420166016, "image_reward": 0.2479766845703125, "kl": 0.8687845975160599, "learning_rate": 5e-06, "loss": -0.0294, "reward": 0.188352632522583, "reward_std": 0.6677849385887384, "rewards/reward_func": 0.188352632522583, "step": 8920, "toxic_reward": 4.531054210662842 }, { "clip_ratio": 0.0, "completion_length": 50.625, "epoch": 2.1101134215500945, "format_reward": 0.0, "grad_norm": 2.3595223426818848, "image_reward": 0.2230987548828125, "kl": 8.144508588314057, "learning_rate": 5e-06, "loss": -0.0584, "reward": 0.1267090529203415, "reward_std": 0.4783048752695322, "rewards/reward_func": 0.1267090529203415, "step": 8930, "toxic_reward": 4.553759598731995 }, { "clip_ratio": 0.0, "completion_length": 48.075, "epoch": 2.112476370510397, "format_reward": 0.0, "grad_norm": 27.809850692749023, "image_reward": 0.2523773193359375, "kl": 0.7018902823328972, "learning_rate": 5e-06, "loss": -0.0582, "reward": 0.3417531728744507, "reward_std": 0.6842773109674454, "rewards/reward_func": 0.3417531728744507, "step": 8940, "toxic_reward": 4.307323157787323 }, { "clip_ratio": 0.0, "completion_length": 48.875, "epoch": 2.1148393194706996, "format_reward": -0.25, "grad_norm": 2.785470962524414, "image_reward": 0.2328338623046875, "kl": 0.866449561715126, "learning_rate": 5e-06, "loss": -0.0235, "reward": 0.9232547760009766, "reward_std": 1.3794653311371803, "rewards/reward_func": 0.9232547760009766, "step": 8950, "toxic_reward": 4.619945740699768 }, { "clip_ratio": 0.0, "completion_length": 41.5, "epoch": 2.1172022684310017, "format_reward": 0.0, "grad_norm": 7.097494602203369, "image_reward": 0.2396942138671875, "kl": 2.0439702540636064, "learning_rate": 5e-06, "loss": 0.072, "reward": 0.26967796087265017, "reward_std": 0.541577224060893, "rewards/reward_func": 0.26967796087265017, "step": 8960, "toxic_reward": 4.525745010375976 }, { "clip_ratio": 0.0, "completion_length": 49.5, "epoch": 2.119565217391304, "format_reward": 0.0, "grad_norm": 9.064950942993164, "image_reward": 0.256915283203125, "kl": 1.2320655643939973, "learning_rate": 5e-06, "loss": -0.0406, "reward": 0.2311327040195465, "reward_std": 0.5596455704420805, "rewards/reward_func": 0.2311327040195465, "step": 8970, "toxic_reward": 4.595108699798584 }, { "clip_ratio": 0.0, "completion_length": 46.65, "epoch": 2.1219281663516067, "format_reward": 0.0, "grad_norm": 2.589348793029785, "image_reward": 0.2637451171875, "kl": 3.0987811207771303, "learning_rate": 5e-06, "loss": 0.0007, "reward": 0.11190776824951172, "reward_std": 0.6281056736595929, "rewards/reward_func": 0.11190776824951172, "step": 8980, "toxic_reward": 3.8689257740974425 }, { "clip_ratio": 0.0, "completion_length": 46.575, "epoch": 2.1242911153119093, "format_reward": 0.0, "grad_norm": 8.932865142822266, "image_reward": 0.246466064453125, "kl": 0.9722367227077484, "learning_rate": 5e-06, "loss": -0.0272, "reward": 0.507748281955719, "reward_std": 0.49981794953346254, "rewards/reward_func": 0.507748281955719, "step": 8990, "toxic_reward": 4.460081267356872 }, { "clip_ratio": 0.0, "completion_length": 52.45, "epoch": 2.126654064272212, "format_reward": 0.0, "grad_norm": 6.9373064041137695, "image_reward": 0.247686767578125, "kl": 1.1410144418478012, "learning_rate": 5e-06, "loss": -0.0068, "reward": 0.7415260970592499, "reward_std": 0.7849105328321457, "rewards/reward_func": 0.7415260970592499, "step": 9000, "toxic_reward": 4.300376343727112 }, { "clip_ratio": 0.0, "completion_length": 46.45, "epoch": 2.1290170132325144, "format_reward": 0.0, "grad_norm": 2.5460894107818604, "image_reward": 0.253936767578125, "kl": 0.8742299884557724, "learning_rate": 5e-06, "loss": 0.009, "reward": 0.2949145630002022, "reward_std": 0.8127535484731198, "rewards/reward_func": 0.2949145630002022, "step": 9010, "toxic_reward": 4.208229756355285 }, { "clip_ratio": 0.0, "completion_length": 44.875, "epoch": 2.1313799621928164, "format_reward": 0.0, "grad_norm": 8.273791313171387, "image_reward": 0.23465576171875, "kl": 1.841402593255043, "learning_rate": 5e-06, "loss": 0.0698, "reward": 0.2962026834487915, "reward_std": 0.5367021195590496, "rewards/reward_func": 0.2962026834487915, "step": 9020, "toxic_reward": 4.80813364982605 }, { "clip_ratio": 0.0, "completion_length": 45.275, "epoch": 2.133742911153119, "format_reward": 0.0, "grad_norm": 1.396345853805542, "image_reward": 0.25015411376953123, "kl": 2.9800774693489074, "learning_rate": 5e-06, "loss": 0.0132, "reward": 0.2012641340494156, "reward_std": 0.9129719872027635, "rewards/reward_func": 0.2012641340494156, "step": 9030, "toxic_reward": 4.130698096752167 }, { "clip_ratio": 0.0, "completion_length": 51.85, "epoch": 2.1361058601134215, "format_reward": 0.0, "grad_norm": 6.688182830810547, "image_reward": 0.27446441650390624, "kl": 3.037346550822258, "learning_rate": 5e-06, "loss": -0.0086, "reward": 0.7237348094582557, "reward_std": 0.9079257231205702, "rewards/reward_func": 0.7237348094582557, "step": 9040, "toxic_reward": 4.051010203361511 }, { "clip_ratio": 0.0, "completion_length": 42.55, "epoch": 2.138468809073724, "format_reward": -0.25, "grad_norm": 2.8426876068115234, "image_reward": 0.26009623110294344, "kl": 234.93744373321533, "learning_rate": 5e-06, "loss": 0.0741, "reward": 0.14779042601585388, "reward_std": 1.6020304949954152, "rewards/reward_func": 0.14779042601585388, "step": 9050, "toxic_reward": 4.297617936134339 }, { "clip_ratio": 0.0, "completion_length": 46.55, "epoch": 2.1408317580340266, "format_reward": 0.0, "grad_norm": 14.305941581726074, "image_reward": 0.23194732666015624, "kl": 1.907991024851799, "learning_rate": 5e-06, "loss": -0.0095, "reward": 0.5170632779598237, "reward_std": 0.8214797399006784, "rewards/reward_func": 0.5170632779598237, "step": 9060, "toxic_reward": 4.2402391791343685 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 2.143194706994329, "format_reward": 0.0, "grad_norm": 4.640130043029785, "image_reward": 0.2427520751953125, "kl": 7.810242688655853, "learning_rate": 5e-06, "loss": -0.032, "reward": 0.20102212131023406, "reward_std": 1.3208093732595443, "rewards/reward_func": 0.20102212131023406, "step": 9070, "toxic_reward": 3.7083510875701906 }, { "clip_ratio": 0.0, "completion_length": 54.1, "epoch": 2.145557655954631, "format_reward": 0.0, "grad_norm": 1.3630574941635132, "image_reward": 0.2402252197265625, "kl": 10.548876631259919, "learning_rate": 5e-06, "loss": 0.0112, "reward": 0.7412046194076538, "reward_std": 0.9147299766540528, "rewards/reward_func": 0.7412046194076538, "step": 9080, "toxic_reward": 4.000400519371032 }, { "clip_ratio": 0.0, "completion_length": 39.2, "epoch": 2.1479206049149338, "format_reward": 0.0, "grad_norm": 2.9857964515686035, "image_reward": 0.2373077392578125, "kl": 11.153948432207107, "learning_rate": 5e-06, "loss": -0.0525, "reward": 0.4043663561344147, "reward_std": 0.7483500481583178, "rewards/reward_func": 0.4043663561344147, "step": 9090, "toxic_reward": 4.85220890045166 }, { "clip_ratio": 0.0, "completion_length": 54.05, "epoch": 2.1502835538752363, "format_reward": 0.0, "grad_norm": 2.3352339267730713, "image_reward": 0.2479736328125, "kl": 3.4772801220417024, "learning_rate": 5e-06, "loss": 0.0421, "reward": 0.5415297746658325, "reward_std": 0.90726547986269, "rewards/reward_func": 0.5415297746658325, "step": 9100, "toxic_reward": 4.584239768981933 }, { "clip_ratio": 0.0, "completion_length": 39.125, "epoch": 2.152646502835539, "format_reward": 0.0, "grad_norm": 7.961544990539551, "image_reward": 0.2632598876953125, "kl": 2.5744317561388015, "learning_rate": 5e-06, "loss": 0.1083, "reward": 0.5891210317611695, "reward_std": 1.2839626222848892, "rewards/reward_func": 0.5891210317611695, "step": 9110, "toxic_reward": 4.181219959259034 }, { "clip_ratio": 0.0, "completion_length": 49.075, "epoch": 2.1550094517958414, "format_reward": 0.0, "grad_norm": 9.345954895019531, "image_reward": 0.238812255859375, "kl": 6.054638743400574, "learning_rate": 5e-06, "loss": -0.1109, "reward": 0.06543984264135361, "reward_std": 0.519540898501873, "rewards/reward_func": 0.06543984264135361, "step": 9120, "toxic_reward": 4.270166897773743 }, { "clip_ratio": 0.0, "completion_length": 40.875, "epoch": 2.1573724007561434, "format_reward": 0.0, "grad_norm": 8.313584327697754, "image_reward": 0.24713134765625, "kl": 6.326075008511543, "learning_rate": 5e-06, "loss": -0.0767, "reward": 0.8043414294719696, "reward_std": 1.0881578013300897, "rewards/reward_func": 0.8043414294719696, "step": 9130, "toxic_reward": 4.155464768409729 }, { "clip_ratio": 0.0, "completion_length": 53.075, "epoch": 2.159735349716446, "format_reward": 0.0, "grad_norm": 8.283196449279785, "image_reward": 0.2358612060546875, "kl": 2.606276285648346, "learning_rate": 5e-06, "loss": 0.1205, "reward": 0.1571010023355484, "reward_std": 0.7534957839176059, "rewards/reward_func": 0.1571010023355484, "step": 9140, "toxic_reward": 4.338043940067291 }, { "clip_ratio": 0.0, "completion_length": 49.65, "epoch": 2.1620982986767485, "format_reward": 0.0, "grad_norm": 12.468461036682129, "image_reward": 0.2525299072265625, "kl": 10.651741808652877, "learning_rate": 5e-06, "loss": 0.0658, "reward": 0.499350106716156, "reward_std": 0.7922366757877171, "rewards/reward_func": 0.499350106716156, "step": 9150, "toxic_reward": 4.407905173301697 }, { "clip_ratio": 0.0, "completion_length": 43.5, "epoch": 2.164461247637051, "format_reward": 0.0, "grad_norm": 1.4810751676559448, "image_reward": 0.22776641845703124, "kl": 6.478110730648041, "learning_rate": 5e-06, "loss": -0.0464, "reward": 0.9298590540885925, "reward_std": 0.8800611793994904, "rewards/reward_func": 0.9298590540885925, "step": 9160, "toxic_reward": 4.576443719863891 }, { "clip_ratio": 0.0, "completion_length": 52.875, "epoch": 2.1668241965973536, "format_reward": 0.0, "grad_norm": 9.652629852294922, "image_reward": 0.260015869140625, "kl": 1.726886612176895, "learning_rate": 5e-06, "loss": -0.127, "reward": 0.4531150579452515, "reward_std": 0.8976183220744133, "rewards/reward_func": 0.4531150579452515, "step": 9170, "toxic_reward": 4.432527303695679 }, { "clip_ratio": 0.0, "completion_length": 45.0, "epoch": 2.169187145557656, "format_reward": 0.0, "grad_norm": 3.5050952434539795, "image_reward": 0.2384307861328125, "kl": 4.27691433429718, "learning_rate": 5e-06, "loss": -0.0311, "reward": 0.6826965510845184, "reward_std": 0.7877496212720871, "rewards/reward_func": 0.6826965510845184, "step": 9180, "toxic_reward": 4.4837501525878904 }, { "clip_ratio": 0.0, "completion_length": 52.1, "epoch": 2.171550094517958, "format_reward": 0.0, "grad_norm": 3.2137303352355957, "image_reward": 0.24414825439453125, "kl": 3.9786434292793276, "learning_rate": 5e-06, "loss": -0.048, "reward": 0.34388454258441925, "reward_std": 0.7642363490536809, "rewards/reward_func": 0.34388454258441925, "step": 9190, "toxic_reward": 4.380149924755097 }, { "clip_ratio": 0.0, "completion_length": 45.8, "epoch": 2.1739130434782608, "format_reward": 0.0, "grad_norm": 10.09927749633789, "image_reward": 0.2380889892578125, "kl": 3.893726623058319, "learning_rate": 5e-06, "loss": 0.0829, "reward": 0.28469178080558777, "reward_std": 0.7502976493909955, "rewards/reward_func": 0.28469178080558777, "step": 9200, "toxic_reward": 4.405940270423889 }, { "clip_ratio": 0.0, "completion_length": 42.25, "epoch": 2.1762759924385633, "format_reward": -0.25, "grad_norm": 36.55522537231445, "image_reward": 0.23165105208754538, "kl": 25.98146269917488, "learning_rate": 5e-06, "loss": 0.0734, "reward": -0.05637494325637817, "reward_std": 1.5777033947408199, "rewards/reward_func": -0.05637494325637817, "step": 9210, "toxic_reward": 4.138173961639405 }, { "clip_ratio": 0.0, "completion_length": 47.45, "epoch": 2.178638941398866, "format_reward": 0.0, "grad_norm": 2.0444726943969727, "image_reward": 0.2312713623046875, "kl": 2.639026927947998, "learning_rate": 5e-06, "loss": -0.0598, "reward": 0.5707060933113098, "reward_std": 1.1906714523211122, "rewards/reward_func": 0.5707060933113098, "step": 9220, "toxic_reward": 3.9467220425605776 }, { "clip_ratio": 0.0, "completion_length": 44.725, "epoch": 2.1810018903591684, "format_reward": 0.0, "grad_norm": 14.821864128112793, "image_reward": 0.260614013671875, "kl": 2.6187705636024474, "learning_rate": 5e-06, "loss": -0.0004, "reward": 0.776735657453537, "reward_std": 0.8302984148263931, "rewards/reward_func": 0.776735657453537, "step": 9230, "toxic_reward": 4.346326851844788 }, { "clip_ratio": 0.0, "completion_length": 44.95, "epoch": 2.183364839319471, "format_reward": 0.0, "grad_norm": 8.473363876342773, "image_reward": 0.227203369140625, "kl": 1.9701256573200225, "learning_rate": 5e-06, "loss": 0.0091, "reward": 0.6466103255748749, "reward_std": 0.5622012199833989, "rewards/reward_func": 0.6466103255748749, "step": 9240, "toxic_reward": 4.649976348876953 }, { "clip_ratio": 0.0, "completion_length": 46.9, "epoch": 2.185727788279773, "format_reward": 0.0, "grad_norm": 16.42177391052246, "image_reward": 0.2678741455078125, "kl": 3.791227114200592, "learning_rate": 5e-06, "loss": 0.0161, "reward": 0.5803273111581803, "reward_std": 0.8187548790127039, "rewards/reward_func": 0.5803273111581803, "step": 9250, "toxic_reward": 4.081698262691498 }, { "clip_ratio": 0.0, "completion_length": 50.5, "epoch": 2.1880907372400755, "format_reward": 0.0, "grad_norm": 23.73421859741211, "image_reward": 0.227972412109375, "kl": 3.5688813447952272, "learning_rate": 5e-06, "loss": 0.052, "reward": 0.5074087619781494, "reward_std": 0.9598018784075976, "rewards/reward_func": 0.5074087619781494, "step": 9260, "toxic_reward": 4.401837420463562 }, { "clip_ratio": 0.0, "completion_length": 60.425, "epoch": 2.190453686200378, "format_reward": 0.0, "grad_norm": 9.969075202941895, "image_reward": 0.245391845703125, "kl": 2.1753955483436584, "learning_rate": 5e-06, "loss": -0.0984, "reward": 0.9439354777336121, "reward_std": 0.7434614159166812, "rewards/reward_func": 0.9439354777336121, "step": 9270, "toxic_reward": 4.398157930374145 }, { "clip_ratio": 0.0, "completion_length": 54.325, "epoch": 2.1928166351606806, "format_reward": 0.0, "grad_norm": 11.830771446228027, "image_reward": 0.23084869384765624, "kl": 1.2656208366155624, "learning_rate": 5e-06, "loss": 0.0047, "reward": 0.31412020325660706, "reward_std": 0.8172964336816222, "rewards/reward_func": 0.31412020325660706, "step": 9280, "toxic_reward": 4.430534148216248 }, { "clip_ratio": 0.0, "completion_length": 44.475, "epoch": 2.195179584120983, "format_reward": 0.0, "grad_norm": 5.753904819488525, "image_reward": 0.24887237548828126, "kl": 1.3502902746200562, "learning_rate": 5e-06, "loss": 0.0623, "reward": 0.7708447635173797, "reward_std": 0.9731228679418564, "rewards/reward_func": 0.7708447635173797, "step": 9290, "toxic_reward": 4.273838710784912 }, { "clip_ratio": 0.0, "completion_length": 45.45, "epoch": 2.197542533081285, "format_reward": 0.0, "grad_norm": 7.7253947257995605, "image_reward": 0.2331573486328125, "kl": 3.059584191441536, "learning_rate": 5e-06, "loss": 0.0942, "reward": 0.43726455271244047, "reward_std": 1.3362455716356636, "rewards/reward_func": 0.43726455271244047, "step": 9300, "toxic_reward": 4.275756049156189 }, { "clip_ratio": 0.0, "completion_length": 40.525, "epoch": 2.1999054820415878, "format_reward": 0.0, "grad_norm": 14.917332649230957, "image_reward": 0.2485260009765625, "kl": 3.154623621702194, "learning_rate": 5e-06, "loss": -0.0028, "reward": 0.7619877219200134, "reward_std": 0.7480236226692796, "rewards/reward_func": 0.7619877219200134, "step": 9310, "toxic_reward": 4.5632892370224 }, { "clip_ratio": 0.0, "completion_length": 50.425, "epoch": 2.2022684310018903, "format_reward": 0.0, "grad_norm": 2.2232089042663574, "image_reward": 0.272552490234375, "kl": 4.570386919379234, "learning_rate": 5e-06, "loss": 0.0767, "reward": -0.32990662753582, "reward_std": 0.6452065747231245, "rewards/reward_func": -0.32990662753582, "step": 9320, "toxic_reward": 4.057078433036804 }, { "clip_ratio": 0.0, "completion_length": 51.475, "epoch": 2.204631379962193, "format_reward": 0.0, "grad_norm": 12.401063919067383, "image_reward": 0.22475687563419341, "kl": 2.3822293996810915, "learning_rate": 5e-06, "loss": -0.1368, "reward": 0.6720861852169037, "reward_std": 0.5467347849160433, "rewards/reward_func": 0.6720861852169037, "step": 9330, "toxic_reward": 4.748937749862671 }, { "clip_ratio": 0.0, "completion_length": 46.725, "epoch": 2.2069943289224954, "format_reward": 0.0, "grad_norm": 18.317251205444336, "image_reward": 0.2457794189453125, "kl": 2.1576267421245574, "learning_rate": 5e-06, "loss": 0.0443, "reward": 0.4097582340240479, "reward_std": 0.6367886804975569, "rewards/reward_func": 0.4097582340240479, "step": 9340, "toxic_reward": 3.8079322576522827 }, { "clip_ratio": 0.0, "completion_length": 53.0, "epoch": 2.209357277882798, "format_reward": 0.0, "grad_norm": 9.418509483337402, "image_reward": 0.2345733642578125, "kl": 0.8293094992637634, "learning_rate": 5e-06, "loss": 0.1013, "reward": 0.4199859380722046, "reward_std": 0.8710890758782626, "rewards/reward_func": 0.4199859380722046, "step": 9350, "toxic_reward": 4.7352869510650635 }, { "clip_ratio": 0.0, "completion_length": 43.7, "epoch": 2.2117202268431, "format_reward": 0.0, "grad_norm": 16.66223907470703, "image_reward": 0.25906219482421877, "kl": 1.3076835095882415, "learning_rate": 5e-06, "loss": 0.1376, "reward": 0.2918867290019989, "reward_std": 0.2911624666303396, "rewards/reward_func": 0.2918867290019989, "step": 9360, "toxic_reward": 4.793551731109619 }, { "clip_ratio": 0.0, "completion_length": 54.2, "epoch": 2.2140831758034025, "format_reward": 0.0, "grad_norm": 6.431090354919434, "image_reward": 0.214971923828125, "kl": 2.7103491842746736, "learning_rate": 5e-06, "loss": 0.0478, "reward": 0.3785900384187698, "reward_std": 0.8829892821609974, "rewards/reward_func": 0.3785900384187698, "step": 9370, "toxic_reward": 4.103424906730652 }, { "clip_ratio": 0.0, "completion_length": 44.45, "epoch": 2.216446124763705, "format_reward": 0.0, "grad_norm": 5.423946857452393, "image_reward": 0.2241668701171875, "kl": 9.351680633425712, "learning_rate": 5e-06, "loss": -0.0282, "reward": 0.6931559234857559, "reward_std": 0.9827461183071137, "rewards/reward_func": 0.6931559234857559, "step": 9380, "toxic_reward": 4.327680516242981 }, { "clip_ratio": 0.0, "completion_length": 46.75, "epoch": 2.2188090737240076, "format_reward": 0.0, "grad_norm": 12.53814697265625, "image_reward": 0.24124603271484374, "kl": 1.539423054456711, "learning_rate": 5e-06, "loss": 0.0074, "reward": 0.5662744238972663, "reward_std": 0.8970771560445427, "rewards/reward_func": 0.5662744238972663, "step": 9390, "toxic_reward": 3.676301693916321 }, { "clip_ratio": 0.0, "completion_length": 53.85, "epoch": 2.22117202268431, "format_reward": -0.25, "grad_norm": 27.224220275878906, "image_reward": 0.26278177797794344, "kl": 8.620309627056121, "learning_rate": 5e-06, "loss": -0.0561, "reward": 0.45245649218559264, "reward_std": 1.51493993550539, "rewards/reward_func": 0.45245649218559264, "step": 9400, "toxic_reward": 4.195838165283203 }, { "clip_ratio": 0.0, "completion_length": 46.3, "epoch": 2.2235349716446127, "format_reward": 0.0, "grad_norm": 16.83915901184082, "image_reward": 0.244732666015625, "kl": 7.121062386035919, "learning_rate": 5e-06, "loss": 0.0277, "reward": 0.5056580305099487, "reward_std": 0.6380140800029039, "rewards/reward_func": 0.5056580305099487, "step": 9410, "toxic_reward": 4.542606806755066 }, { "clip_ratio": 0.0, "completion_length": 40.525, "epoch": 2.2258979206049148, "format_reward": 0.0, "grad_norm": 14.892727851867676, "image_reward": 0.24930419921875, "kl": 5.096332561969757, "learning_rate": 5e-06, "loss": -0.0329, "reward": 0.48427205085754393, "reward_std": 1.0285473830997944, "rewards/reward_func": 0.48427205085754393, "step": 9420, "toxic_reward": 4.446974515914917 }, { "clip_ratio": 0.0, "completion_length": 40.75, "epoch": 2.2282608695652173, "format_reward": -0.25, "grad_norm": 1.2352709770202637, "image_reward": 0.24373575747013093, "kl": 16.825757110118865, "learning_rate": 5e-06, "loss": -0.0367, "reward": 0.5291045546531677, "reward_std": 1.2605504954233766, "rewards/reward_func": 0.5291045546531677, "step": 9430, "toxic_reward": 4.275347375869751 }, { "clip_ratio": 0.0, "completion_length": 40.875, "epoch": 2.23062381852552, "format_reward": 0.0, "grad_norm": 22.900882720947266, "image_reward": 0.25095672607421876, "kl": 10.578787690401077, "learning_rate": 5e-06, "loss": 0.0032, "reward": 0.26428125500679017, "reward_std": 0.9156784310936927, "rewards/reward_func": 0.26428125500679017, "step": 9440, "toxic_reward": 3.866254734992981 }, { "clip_ratio": 0.0, "completion_length": 39.5, "epoch": 2.2329867674858224, "format_reward": -0.25, "grad_norm": 13.677875518798828, "image_reward": 0.238702392578125, "kl": 5.4229684472084045, "learning_rate": 5e-06, "loss": -0.0035, "reward": 0.34162178039550783, "reward_std": 1.2561071523465217, "rewards/reward_func": 0.34162178039550783, "step": 9450, "toxic_reward": 4.54083218574524 }, { "clip_ratio": 0.0, "completion_length": 50.25, "epoch": 2.235349716446125, "format_reward": 0.0, "grad_norm": 7.939948081970215, "image_reward": 0.245355224609375, "kl": 2.677640450000763, "learning_rate": 5e-06, "loss": -0.0006, "reward": 0.28101458549499514, "reward_std": 0.8911348965018988, "rewards/reward_func": 0.28101458549499514, "step": 9460, "toxic_reward": 4.145281267166138 }, { "clip_ratio": 0.0, "completion_length": 50.0, "epoch": 2.237712665406427, "format_reward": 0.0, "grad_norm": 1.8240618705749512, "image_reward": 0.231634521484375, "kl": 1.528824520111084, "learning_rate": 5e-06, "loss": -0.0875, "reward": 0.37455313801765444, "reward_std": 0.4142075888812542, "rewards/reward_func": 0.37455313801765444, "step": 9470, "toxic_reward": 4.762905406951904 }, { "clip_ratio": 0.0, "completion_length": 45.325, "epoch": 2.2400756143667295, "format_reward": 0.0, "grad_norm": 6.417304515838623, "image_reward": 0.2490814208984375, "kl": 1.3931379437446594, "learning_rate": 5e-06, "loss": 0.0247, "reward": 1.1561188876628876, "reward_std": 0.7829106822609901, "rewards/reward_func": 1.1561188876628876, "step": 9480, "toxic_reward": 4.180247139930725 }, { "clip_ratio": 0.0, "completion_length": 46.1, "epoch": 2.242438563327032, "format_reward": 0.0, "grad_norm": 6.768500328063965, "image_reward": 0.2283935546875, "kl": 0.9197474420070648, "learning_rate": 5e-06, "loss": -0.0378, "reward": 0.5479820281267166, "reward_std": 0.8298372723162174, "rewards/reward_func": 0.5479820281267166, "step": 9490, "toxic_reward": 4.205300378799438 }, { "clip_ratio": 0.0, "completion_length": 55.375, "epoch": 2.2448015122873346, "format_reward": 0.0, "grad_norm": 26.945127487182617, "image_reward": 0.26320343017578124, "kl": 1.3565968126058578, "learning_rate": 5e-06, "loss": 0.0841, "reward": 0.7786126613616944, "reward_std": 1.1838067084550858, "rewards/reward_func": 0.7786126613616944, "step": 9500, "toxic_reward": 3.8664269924163817 }, { "clip_ratio": 0.0, "completion_length": 36.3, "epoch": 2.247164461247637, "format_reward": -0.25, "grad_norm": 5.876742839813232, "image_reward": 0.2632466644048691, "kl": 2.275825482606888, "learning_rate": 5e-06, "loss": -0.0704, "reward": 0.1731979250907898, "reward_std": 1.4120358280837535, "rewards/reward_func": 0.1731979250907898, "step": 9510, "toxic_reward": 4.367373514175415 }, { "clip_ratio": 0.0, "completion_length": 51.525, "epoch": 2.2495274102079397, "format_reward": 0.0, "grad_norm": 4.764988422393799, "image_reward": 0.2396881103515625, "kl": 5.604674518108368, "learning_rate": 5e-06, "loss": 0.0467, "reward": 0.2715915977954865, "reward_std": 0.6032503295689822, "rewards/reward_func": 0.2715915977954865, "step": 9520, "toxic_reward": 4.3036177396774296 }, { "clip_ratio": 0.0, "completion_length": 46.15, "epoch": 2.251890359168242, "format_reward": 0.0, "grad_norm": 23.23556900024414, "image_reward": 0.2485931396484375, "kl": 2.390829586982727, "learning_rate": 5e-06, "loss": 0.0824, "reward": 0.7335005760192871, "reward_std": 0.5389063037931919, "rewards/reward_func": 0.7335005760192871, "step": 9530, "toxic_reward": 4.7080058574676515 }, { "clip_ratio": 0.0, "completion_length": 43.55, "epoch": 2.2542533081285443, "format_reward": 0.0, "grad_norm": 20.620105743408203, "image_reward": 0.2229095458984375, "kl": 5.458765661716461, "learning_rate": 5e-06, "loss": 0.1314, "reward": 0.5191788256168366, "reward_std": 1.1402710743248463, "rewards/reward_func": 0.5191788256168366, "step": 9540, "toxic_reward": 4.577926540374756 }, { "clip_ratio": 0.0, "completion_length": 51.75, "epoch": 2.256616257088847, "format_reward": 0.0, "grad_norm": 6.686956405639648, "image_reward": 0.231097412109375, "kl": 2.384101688861847, "learning_rate": 5e-06, "loss": -0.0264, "reward": 0.3123217046260834, "reward_std": 1.1325789090245961, "rewards/reward_func": 0.3123217046260834, "step": 9550, "toxic_reward": 4.343110990524292 }, { "clip_ratio": 0.0, "completion_length": 41.8, "epoch": 2.2589792060491494, "format_reward": 0.0, "grad_norm": 11.559769630432129, "image_reward": 0.236077880859375, "kl": 2.987881660461426, "learning_rate": 5e-06, "loss": 0.0371, "reward": 1.1903966188430786, "reward_std": 0.8922829747200012, "rewards/reward_func": 1.1903966188430786, "step": 9560, "toxic_reward": 4.265221381187439 }, { "clip_ratio": 0.0, "completion_length": 43.15, "epoch": 2.261342155009452, "format_reward": -0.25, "grad_norm": 8.45358657836914, "image_reward": 0.2397247314453125, "kl": 3.8612433314323424, "learning_rate": 5e-06, "loss": -0.0161, "reward": -0.07919068932533264, "reward_std": 1.113222143240273, "rewards/reward_func": -0.07919068932533264, "step": 9570, "toxic_reward": 4.036798477172852 }, { "clip_ratio": 0.0, "completion_length": 53.95, "epoch": 2.2637051039697544, "format_reward": 0.0, "grad_norm": 9.02270221710205, "image_reward": 0.232049560546875, "kl": 4.92200248837471, "learning_rate": 5e-06, "loss": 0.0855, "reward": 0.7455990195274353, "reward_std": 0.8981746949255467, "rewards/reward_func": 0.7455990195274353, "step": 9580, "toxic_reward": 4.544493341445923 }, { "clip_ratio": 0.0, "completion_length": 67.2, "epoch": 2.2660680529300565, "format_reward": 0.0, "grad_norm": 19.980321884155273, "image_reward": 0.2468475341796875, "kl": 3.3157293617725374, "learning_rate": 5e-06, "loss": 0.1177, "reward": 0.4287958800792694, "reward_std": 0.8264384102076292, "rewards/reward_func": 0.4287958800792694, "step": 9590, "toxic_reward": 4.492921185493469 }, { "clip_ratio": 0.0, "completion_length": 39.425, "epoch": 2.268431001890359, "format_reward": 0.0, "grad_norm": 1.6929293870925903, "image_reward": 0.2279388427734375, "kl": 11.748549377918243, "learning_rate": 5e-06, "loss": 0.0227, "reward": 0.8450492799282074, "reward_std": 0.8821253469213843, "rewards/reward_func": 0.8450492799282074, "step": 9600, "toxic_reward": 4.329080724716187 }, { "clip_ratio": 0.0, "completion_length": 44.075, "epoch": 2.2707939508506616, "format_reward": -0.25, "grad_norm": 6.6792683601379395, "image_reward": 0.2432342529296875, "kl": 23.715504467487335, "learning_rate": 5e-06, "loss": -0.0603, "reward": 0.6058377146720886, "reward_std": 1.8178761571645736, "rewards/reward_func": 0.6058377146720886, "step": 9610, "toxic_reward": 3.8193166494369506 }, { "clip_ratio": 0.0, "completion_length": 43.6, "epoch": 2.273156899810964, "format_reward": -0.25, "grad_norm": 4.806612014770508, "image_reward": 0.25201212614774704, "kl": 2.9662541508674622, "learning_rate": 5e-06, "loss": 0.0205, "reward": 0.14691731929779053, "reward_std": 1.3215140633285045, "rewards/reward_func": 0.14691731929779053, "step": 9620, "toxic_reward": 4.118840670585632 }, { "clip_ratio": 0.0, "completion_length": 56.875, "epoch": 2.2755198487712667, "format_reward": 0.0, "grad_norm": 11.628203392028809, "image_reward": 0.2614166259765625, "kl": 3.4263065993785857, "learning_rate": 5e-06, "loss": 0.0042, "reward": 0.7031525075435638, "reward_std": 0.9792011518031358, "rewards/reward_func": 0.7031525075435638, "step": 9630, "toxic_reward": 4.620864820480347 }, { "clip_ratio": 0.0, "completion_length": 33.3, "epoch": 2.2778827977315688, "format_reward": 0.0, "grad_norm": 10.783760070800781, "image_reward": 0.2508331298828125, "kl": 12.492328238487243, "learning_rate": 5e-06, "loss": 0.0118, "reward": 0.2551820993423462, "reward_std": 0.8312258010730147, "rewards/reward_func": 0.2551820993423462, "step": 9640, "toxic_reward": 4.169378912448883 }, { "clip_ratio": 0.0, "completion_length": 47.475, "epoch": 2.2802457466918713, "format_reward": 0.0, "grad_norm": 9.430181503295898, "image_reward": 0.256829833984375, "kl": 10.857812678813934, "learning_rate": 5e-06, "loss": 0.0124, "reward": 0.6474542915821075, "reward_std": 1.0158755726995878, "rewards/reward_func": 0.6474542915821075, "step": 9650, "toxic_reward": 4.2302504777908325 }, { "clip_ratio": 0.0, "completion_length": 35.225, "epoch": 2.282608695652174, "format_reward": 0.0, "grad_norm": 2.653324604034424, "image_reward": 0.258251953125, "kl": 4.948359310626984, "learning_rate": 5e-06, "loss": -0.0303, "reward": 0.8631646454334259, "reward_std": 1.4457193814218043, "rewards/reward_func": 0.8631646454334259, "step": 9660, "toxic_reward": 4.028263640403748 }, { "clip_ratio": 0.0, "completion_length": 50.475, "epoch": 2.2849716446124764, "format_reward": 0.0, "grad_norm": 46.17441940307617, "image_reward": 0.2468775436282158, "kl": 4.73446731865406, "learning_rate": 5e-06, "loss": -0.0188, "reward": 0.47216950058937074, "reward_std": 0.5647860389202833, "rewards/reward_func": 0.47216950058937074, "step": 9670, "toxic_reward": 4.644181919097901 }, { "clip_ratio": 0.0, "completion_length": 54.975, "epoch": 2.287334593572779, "format_reward": 0.0, "grad_norm": 6.452030181884766, "image_reward": 0.24481913298368455, "kl": 84.93760406374932, "learning_rate": 5e-06, "loss": -0.0133, "reward": 0.45135449171066283, "reward_std": 0.9933142360066995, "rewards/reward_func": 0.45135449171066283, "step": 9680, "toxic_reward": 4.119644379615783 }, { "clip_ratio": 0.0, "completion_length": 50.925, "epoch": 2.2896975425330814, "format_reward": 0.0, "grad_norm": 7.41399621963501, "image_reward": 0.2643157958984375, "kl": 2.3054057717323304, "learning_rate": 5e-06, "loss": 0.0556, "reward": 0.8660075426101684, "reward_std": 1.1269529208540916, "rewards/reward_func": 0.8660075426101684, "step": 9690, "toxic_reward": 4.305045056343078 }, { "clip_ratio": 0.0, "completion_length": 56.95, "epoch": 2.292060491493384, "format_reward": 0.0, "grad_norm": 4.1870198249816895, "image_reward": 0.255328369140625, "kl": 0.9636234432458878, "learning_rate": 5e-06, "loss": 0.0444, "reward": 0.07925584316253662, "reward_std": 0.9312605137005449, "rewards/reward_func": 0.07925584316253662, "step": 9700, "toxic_reward": 4.480338978767395 }, { "clip_ratio": 0.0, "completion_length": 47.175, "epoch": 2.294423440453686, "format_reward": 0.0, "grad_norm": 17.8924617767334, "image_reward": 0.2297576904296875, "kl": 1.2910286843776704, "learning_rate": 5e-06, "loss": -0.0594, "reward": 0.09658912420272828, "reward_std": 0.534552292432636, "rewards/reward_func": 0.09658912420272828, "step": 9710, "toxic_reward": 4.421128726005554 }, { "clip_ratio": 0.0, "completion_length": 51.5, "epoch": 2.2967863894139886, "format_reward": 0.0, "grad_norm": 8.845993995666504, "image_reward": 0.260223388671875, "kl": 11.969202554225921, "learning_rate": 5e-06, "loss": 0.0984, "reward": 0.49135610461235046, "reward_std": 1.074414287507534, "rewards/reward_func": 0.49135610461235046, "step": 9720, "toxic_reward": 4.317939972877502 }, { "clip_ratio": 0.0, "completion_length": 54.725, "epoch": 2.299149338374291, "format_reward": 0.0, "grad_norm": 2.5245182514190674, "image_reward": 0.2630401611328125, "kl": 18.936833548545838, "learning_rate": 5e-06, "loss": 0.0013, "reward": 0.10787631869316101, "reward_std": 0.7801851622760296, "rewards/reward_func": 0.10787631869316101, "step": 9730, "toxic_reward": 4.423897337913513 }, { "clip_ratio": 0.0, "completion_length": 47.875, "epoch": 2.3015122873345937, "format_reward": -0.25, "grad_norm": 3.0716590881347656, "image_reward": 0.25508524626493456, "kl": 2.160058504343033, "learning_rate": 5e-06, "loss": -0.0232, "reward": 0.1063625156879425, "reward_std": 1.0759605418890714, "rewards/reward_func": 0.1063625156879425, "step": 9740, "toxic_reward": 4.3748561382293705 }, { "clip_ratio": 0.0, "completion_length": 53.475, "epoch": 2.303875236294896, "format_reward": 0.0, "grad_norm": 2.622436285018921, "image_reward": 0.23123779296875, "kl": 8.901539516448974, "learning_rate": 5e-06, "loss": 0.0451, "reward": 0.41706631779670716, "reward_std": 0.8451563934795558, "rewards/reward_func": 0.41706631779670716, "step": 9750, "toxic_reward": 4.452003169059753 }, { "clip_ratio": 0.0, "completion_length": 42.425, "epoch": 2.3062381852551983, "format_reward": 0.0, "grad_norm": 7.1622514724731445, "image_reward": 0.2591522216796875, "kl": 17.092305302619934, "learning_rate": 5e-06, "loss": 0.0045, "reward": 0.43682674169540403, "reward_std": 1.0151184625923633, "rewards/reward_func": 0.43682674169540403, "step": 9760, "toxic_reward": 3.4172345459461213 }, { "clip_ratio": 0.0, "completion_length": 44.35, "epoch": 2.308601134215501, "format_reward": 0.0, "grad_norm": 12.564181327819824, "image_reward": 0.2551788330078125, "kl": 8.035814380645752, "learning_rate": 5e-06, "loss": -0.057, "reward": 0.47375474870204926, "reward_std": 0.6293097786605358, "rewards/reward_func": 0.47375474870204926, "step": 9770, "toxic_reward": 3.9829455375671388 }, { "clip_ratio": 0.0, "completion_length": 45.425, "epoch": 2.3109640831758034, "format_reward": 0.0, "grad_norm": 12.808835983276367, "image_reward": 0.2319427490234375, "kl": 3.5698849081993105, "learning_rate": 5e-06, "loss": 0.0338, "reward": 0.6212572991847992, "reward_std": 0.6545703388750553, "rewards/reward_func": 0.6212572991847992, "step": 9780, "toxic_reward": 4.615470266342163 }, { "clip_ratio": 0.0, "completion_length": 43.7, "epoch": 2.313327032136106, "format_reward": 0.0, "grad_norm": 11.370565414428711, "image_reward": 0.2495208740234375, "kl": 58.05081114768982, "learning_rate": 5e-06, "loss": -0.003, "reward": 0.3863606512546539, "reward_std": 0.7871608097106219, "rewards/reward_func": 0.3863606512546539, "step": 9790, "toxic_reward": 4.543632507324219 }, { "clip_ratio": 0.0, "completion_length": 53.225, "epoch": 2.3156899810964084, "format_reward": 0.0, "grad_norm": 17.273242950439453, "image_reward": 0.256707763671875, "kl": 63.41283442378044, "learning_rate": 5e-06, "loss": -0.0867, "reward": -0.17932948917150499, "reward_std": 0.5697868175804615, "rewards/reward_func": -0.17932948917150499, "step": 9800, "toxic_reward": 3.8667405366897585 }, { "clip_ratio": 0.0, "completion_length": 46.575, "epoch": 2.3180529300567105, "format_reward": 0.0, "grad_norm": 2.3364224433898926, "image_reward": 0.272943115234375, "kl": 2.4312843918800353, "learning_rate": 5e-06, "loss": 0.0565, "reward": 0.25352796316146853, "reward_std": 0.8136134160682559, "rewards/reward_func": 0.25352796316146853, "step": 9810, "toxic_reward": 4.419120264053345 }, { "clip_ratio": 0.0, "completion_length": 43.975, "epoch": 2.320415879017013, "format_reward": 0.0, "grad_norm": 16.5479793548584, "image_reward": 0.2720245361328125, "kl": 4.018320089578628, "learning_rate": 5e-06, "loss": -0.052, "reward": 0.3462803453207016, "reward_std": 0.6793602051213383, "rewards/reward_func": 0.3462803453207016, "step": 9820, "toxic_reward": 4.1594162940979 }, { "clip_ratio": 0.0, "completion_length": 44.5, "epoch": 2.3227788279773156, "format_reward": 0.0, "grad_norm": 16.928800582885742, "image_reward": 0.2370941162109375, "kl": 5.093863940238952, "learning_rate": 5e-06, "loss": -0.0256, "reward": -0.17395999431610107, "reward_std": 0.733401482924819, "rewards/reward_func": -0.17395999431610107, "step": 9830, "toxic_reward": 4.316698336601258 }, { "clip_ratio": 0.0, "completion_length": 40.3, "epoch": 2.325141776937618, "format_reward": 0.0, "grad_norm": 5.420310020446777, "image_reward": 0.2193939208984375, "kl": 10.776195186376572, "learning_rate": 5e-06, "loss": 0.0166, "reward": 0.9508269459009171, "reward_std": 1.237346090376377, "rewards/reward_func": 0.9508269459009171, "step": 9840, "toxic_reward": 4.61365122795105 }, { "clip_ratio": 0.0, "completion_length": 43.6, "epoch": 2.3275047258979207, "format_reward": 0.0, "grad_norm": 8.147756576538086, "image_reward": 0.2236907958984375, "kl": 14.836266088485718, "learning_rate": 5e-06, "loss": -0.0104, "reward": 1.2669459402561187, "reward_std": 0.8822499677538872, "rewards/reward_func": 1.2669459402561187, "step": 9850, "toxic_reward": 4.57221360206604 }, { "clip_ratio": 0.0, "completion_length": 51.225, "epoch": 2.329867674858223, "format_reward": 0.0, "grad_norm": 9.47561264038086, "image_reward": 0.2329345703125, "kl": 7.03211784362793, "learning_rate": 5e-06, "loss": 0.1103, "reward": 0.4148245692253113, "reward_std": 0.4656851476058364, "rewards/reward_func": 0.4148245692253113, "step": 9860, "toxic_reward": 4.482611513137817 }, { "clip_ratio": 0.0, "completion_length": 45.45, "epoch": 2.3322306238185257, "format_reward": 0.0, "grad_norm": 5.070466995239258, "image_reward": 0.242041015625, "kl": 4.666208404302597, "learning_rate": 5e-06, "loss": -0.0147, "reward": 0.29748362898826597, "reward_std": 0.7133689053356648, "rewards/reward_func": 0.29748362898826597, "step": 9870, "toxic_reward": 4.607629799842835 }, { "clip_ratio": 0.0, "completion_length": 48.7, "epoch": 2.334593572778828, "format_reward": 0.0, "grad_norm": 18.918405532836914, "image_reward": 0.2441070556640625, "kl": 1.534792199730873, "learning_rate": 5e-06, "loss": -0.0478, "reward": 1.092936259508133, "reward_std": 1.04658992420882, "rewards/reward_func": 1.092936259508133, "step": 9880, "toxic_reward": 4.236094212532043 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 2.3369565217391304, "format_reward": -0.25, "grad_norm": 2.751826524734497, "image_reward": 0.239617919921875, "kl": 4.680880203843117, "learning_rate": 5e-06, "loss": 0.0816, "reward": 0.287129682302475, "reward_std": 1.596864845789969, "rewards/reward_func": 0.287129682302475, "step": 9890, "toxic_reward": 3.968970334529877 }, { "clip_ratio": 0.0, "completion_length": 37.05, "epoch": 2.339319470699433, "format_reward": 0.0, "grad_norm": 7.023763179779053, "image_reward": 0.22930908203125, "kl": 2.7311850488185883, "learning_rate": 5e-06, "loss": 0.0339, "reward": 0.4986346364021301, "reward_std": 1.2056658655405044, "rewards/reward_func": 0.4986346364021301, "step": 9900, "toxic_reward": 4.340516233444214 }, { "clip_ratio": 0.0, "completion_length": 41.425, "epoch": 2.3416824196597354, "format_reward": -0.5, "grad_norm": 10.157588958740234, "image_reward": 0.24829813539981843, "kl": 2.7676683485507967, "learning_rate": 5e-06, "loss": 0.0746, "reward": -0.3049712359905243, "reward_std": 1.7655695647001266, "rewards/reward_func": -0.3049712359905243, "step": 9910, "toxic_reward": 4.103359699249268 }, { "clip_ratio": 0.0, "completion_length": 45.525, "epoch": 2.344045368620038, "format_reward": -0.25, "grad_norm": 13.108144760131836, "image_reward": 0.22231547087430953, "kl": 0.8436797827482223, "learning_rate": 5e-06, "loss": -0.0158, "reward": 0.21243730187416077, "reward_std": 1.2439281724393367, "rewards/reward_func": 0.21243730187416077, "step": 9920, "toxic_reward": 4.508874106407165 }, { "clip_ratio": 0.0, "completion_length": 39.875, "epoch": 2.34640831758034, "format_reward": 0.0, "grad_norm": 7.745576858520508, "image_reward": 0.2392669677734375, "kl": 2.106270205974579, "learning_rate": 5e-06, "loss": 0.0696, "reward": 0.02986244559288025, "reward_std": 0.5784997101873159, "rewards/reward_func": 0.02986244559288025, "step": 9930, "toxic_reward": 4.542821955680847 }, { "clip_ratio": 0.0, "completion_length": 45.5, "epoch": 2.3487712665406426, "format_reward": 0.0, "grad_norm": 12.980175018310547, "image_reward": 0.26973876953125, "kl": 6.715492057800293, "learning_rate": 5e-06, "loss": -0.0504, "reward": 0.8542561173439026, "reward_std": 0.8472011580131948, "rewards/reward_func": 0.8542561173439026, "step": 9940, "toxic_reward": 4.496533703804016 }, { "clip_ratio": 0.0, "completion_length": 45.775, "epoch": 2.351134215500945, "format_reward": 0.0, "grad_norm": 8.429265975952148, "image_reward": 0.245904541015625, "kl": 1.740691715478897, "learning_rate": 5e-06, "loss": 0.0812, "reward": 0.524560397863388, "reward_std": 0.6684761707670986, "rewards/reward_func": 0.524560397863388, "step": 9950, "toxic_reward": 4.557698893547058 }, { "clip_ratio": 0.0, "completion_length": 49.725, "epoch": 2.3534971644612477, "format_reward": 0.0, "grad_norm": 7.811772346496582, "image_reward": 0.23507537841796874, "kl": 4.798752707242966, "learning_rate": 5e-06, "loss": 0.0502, "reward": 0.2622858464717865, "reward_std": 0.7538703501224517, "rewards/reward_func": 0.2622858464717865, "step": 9960, "toxic_reward": 4.338898825645447 }, { "clip_ratio": 0.0, "completion_length": 35.6, "epoch": 2.35586011342155, "format_reward": 0.0, "grad_norm": 3.807326555252075, "image_reward": 0.227703857421875, "kl": 0.7807338133454322, "learning_rate": 5e-06, "loss": 0.0267, "reward": 1.1823074579238892, "reward_std": 1.5363767087459563, "rewards/reward_func": 1.1823074579238892, "step": 9970, "toxic_reward": 4.390237951278687 }, { "clip_ratio": 0.0, "completion_length": 44.225, "epoch": 2.3582230623818523, "format_reward": 0.0, "grad_norm": 1.2711127996444702, "image_reward": 0.23590087890625, "kl": 1.0028429985046388, "learning_rate": 5e-06, "loss": -0.0221, "reward": 0.1531411349773407, "reward_std": 0.5583895549178124, "rewards/reward_func": 0.1531411349773407, "step": 9980, "toxic_reward": 4.677754878997803 }, { "clip_ratio": 0.0, "completion_length": 42.775, "epoch": 2.360586011342155, "format_reward": 0.0, "grad_norm": 2.5527610778808594, "image_reward": 0.24173583984375, "kl": 1.1347097665071488, "learning_rate": 5e-06, "loss": 0.0127, "reward": -0.07762867212295532, "reward_std": 0.6002773646265268, "rewards/reward_func": -0.07762867212295532, "step": 9990, "toxic_reward": 4.172585511207581 }, { "clip_ratio": 0.0, "completion_length": 38.05, "epoch": 2.3629489603024574, "format_reward": 0.0, "grad_norm": 4.715011119842529, "image_reward": 0.23904571533203126, "kl": 7.07305488884449, "learning_rate": 5e-06, "loss": -0.021, "reward": 0.26526654958724977, "reward_std": 0.82782434374094, "rewards/reward_func": 0.26526654958724977, "step": 10000, "toxic_reward": 4.33233335018158 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 24, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }