my-model / trainer_state.json
KEVIN04087's picture
Upload folder using huggingface_hub
63451e1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.3629489603024574,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 44.075,
"epoch": 0.0023629489603024575,
"format_reward": -1.75,
"grad_norm": 0.179437518119812,
"image_reward": 0.292385521862242,
"kl": 0.0005639283277560026,
"learning_rate": 5e-06,
"loss": -0.0818,
"reward": -1.718647839128971,
"reward_std": 2.0869705460965635,
"rewards/reward_func": -1.718647839128971,
"step": 10,
"toxic_reward": 3.753792663415273
},
{
"clip_ratio": 0.0,
"completion_length": 42.35,
"epoch": 0.004725897920604915,
"format_reward": -1.75,
"grad_norm": 0.40540918707847595,
"image_reward": 0.28610331267118455,
"kl": 0.0006540146190673113,
"learning_rate": 5e-06,
"loss": 0.0547,
"reward": -0.9438592553138733,
"reward_std": 3.9592867106199265,
"rewards/reward_func": -0.9438592553138733,
"step": 20,
"toxic_reward": 3.622282150387764
},
{
"clip_ratio": 0.0,
"completion_length": 41.35,
"epoch": 0.007088846880907372,
"format_reward": -2.5,
"grad_norm": 0.3070058524608612,
"image_reward": 0.29500325620174406,
"kl": 0.0006122831255197525,
"learning_rate": 5e-06,
"loss": 0.0196,
"reward": -2.2396623373031614,
"reward_std": 4.928562045097351,
"rewards/reward_func": -2.2396623373031614,
"step": 30,
"toxic_reward": 3.3049886375665665
},
{
"clip_ratio": 0.0,
"completion_length": 53.475,
"epoch": 0.00945179584120983,
"format_reward": -2.25,
"grad_norm": 0.30812421441078186,
"image_reward": 0.2784423798322678,
"kl": 0.0007215021323645487,
"learning_rate": 5e-06,
"loss": -0.0374,
"reward": -1.9919262409210206,
"reward_std": 3.2468371063470842,
"rewards/reward_func": -1.9919262409210206,
"step": 40,
"toxic_reward": 4.284190082550049
},
{
"clip_ratio": 0.0,
"completion_length": 42.0,
"epoch": 0.011814744801512287,
"format_reward": -2.25,
"grad_norm": 0.7593560814857483,
"image_reward": 0.3014272041618824,
"kl": 0.0006584389615454711,
"learning_rate": 5e-06,
"loss": 0.2018,
"reward": -2.2726588547229767,
"reward_std": 3.8129764549434184,
"rewards/reward_func": -2.2726588547229767,
"step": 50,
"toxic_reward": 3.694073647260666
},
{
"clip_ratio": 0.0,
"completion_length": 51.475,
"epoch": 0.014177693761814745,
"format_reward": -3.75,
"grad_norm": 0.46164554357528687,
"image_reward": 0.24024454802274703,
"kl": 0.000736865375074558,
"learning_rate": 5e-06,
"loss": 0.1005,
"reward": -3.6363322257995607,
"reward_std": 5.34180793762207,
"rewards/reward_func": -3.6363322257995607,
"step": 60,
"toxic_reward": 4.516654038429261
},
{
"clip_ratio": 0.0,
"completion_length": 40.275,
"epoch": 0.0165406427221172,
"format_reward": -1.75,
"grad_norm": 0.34076398611068726,
"image_reward": 0.2581237778067589,
"kl": 0.0006409274850739166,
"learning_rate": 5e-06,
"loss": -0.048,
"reward": -1.306644481420517,
"reward_std": 3.5849914638325573,
"rewards/reward_func": -1.306644481420517,
"step": 70,
"toxic_reward": 4.222395324707032
},
{
"clip_ratio": 0.0,
"completion_length": 41.05,
"epoch": 0.01890359168241966,
"format_reward": -1.75,
"grad_norm": 0.2942235469818115,
"image_reward": 0.2728251129388809,
"kl": 0.00077872859837953,
"learning_rate": 5e-06,
"loss": -0.018,
"reward": -1.0979918956756591,
"reward_std": 3.421245375275612,
"rewards/reward_func": -1.0979918956756591,
"step": 80,
"toxic_reward": 4.375229549407959
},
{
"clip_ratio": 0.0,
"completion_length": 46.375,
"epoch": 0.021266540642722116,
"format_reward": -1.75,
"grad_norm": 0.4680553376674652,
"image_reward": 0.27020376589563155,
"kl": 0.0006814575113821775,
"learning_rate": 5e-06,
"loss": 0.1758,
"reward": -1.8029783844947815,
"reward_std": 2.909199387952685,
"rewards/reward_func": -1.8029783844947815,
"step": 90,
"toxic_reward": 3.5180059373378754
},
{
"clip_ratio": 0.0,
"completion_length": 32.975,
"epoch": 0.023629489603024575,
"format_reward": -1.75,
"grad_norm": 0.5208232998847961,
"image_reward": 0.28130289614200593,
"kl": 0.0006654941505985334,
"learning_rate": 5e-06,
"loss": 0.0732,
"reward": -1.5811177730560302,
"reward_std": 3.0347108453512193,
"rewards/reward_func": -1.5811177730560302,
"step": 100,
"toxic_reward": 3.8031033158302305
},
{
"clip_ratio": 0.0,
"completion_length": 59.575,
"epoch": 0.02599243856332703,
"format_reward": -3.5,
"grad_norm": 0.5875898003578186,
"image_reward": 0.2767374664545059,
"kl": 0.0009529282746370882,
"learning_rate": 5e-06,
"loss": 0.013,
"reward": -3.43455148935318,
"reward_std": 5.033185955882073,
"rewards/reward_func": -3.43455148935318,
"step": 110,
"toxic_reward": 3.8197044640779496
},
{
"clip_ratio": 0.0,
"completion_length": 34.825,
"epoch": 0.02835538752362949,
"format_reward": -2.5,
"grad_norm": 0.9147374629974365,
"image_reward": 0.298614501953125,
"kl": 0.0007633624511072413,
"learning_rate": 5e-06,
"loss": 0.1077,
"reward": -2.407980114221573,
"reward_std": 4.146487069129944,
"rewards/reward_func": -2.407980114221573,
"step": 120,
"toxic_reward": 3.8069980409410267
},
{
"clip_ratio": 0.0,
"completion_length": 40.575,
"epoch": 0.030718336483931945,
"format_reward": -1.5,
"grad_norm": 0.6123144626617432,
"image_reward": 0.26710906128088635,
"kl": 0.000945484999101609,
"learning_rate": 5e-06,
"loss": -0.033,
"reward": -1.4210234582424164,
"reward_std": 2.5487833991646767,
"rewards/reward_func": -1.4210234582424164,
"step": 130,
"toxic_reward": 3.9784648021062217
},
{
"clip_ratio": 0.0,
"completion_length": 36.325,
"epoch": 0.0330812854442344,
"format_reward": -1.5,
"grad_norm": 0.35265249013900757,
"image_reward": 0.2955657958984375,
"kl": 0.001620796724455431,
"learning_rate": 5e-06,
"loss": 0.0518,
"reward": -1.1245046585798264,
"reward_std": 3.641619694232941,
"rewards/reward_func": -1.1245046585798264,
"step": 140,
"toxic_reward": 3.7418821096420287
},
{
"clip_ratio": 0.0,
"completion_length": 37.325,
"epoch": 0.03544423440453686,
"format_reward": -1.75,
"grad_norm": 0.6911599040031433,
"image_reward": 0.301416015625,
"kl": 0.0009025269537232816,
"learning_rate": 5e-06,
"loss": 0.3208,
"reward": -1.707236361503601,
"reward_std": 3.211209188401699,
"rewards/reward_func": -1.707236361503601,
"step": 150,
"toxic_reward": 3.413761219382286
},
{
"clip_ratio": 0.0,
"completion_length": 37.6,
"epoch": 0.03780718336483932,
"format_reward": -2.5,
"grad_norm": 0.6072728037834167,
"image_reward": 0.28253965079784393,
"kl": 0.0016979283303953708,
"learning_rate": 5e-06,
"loss": -0.0538,
"reward": -1.9519330382347106,
"reward_std": 3.5465006709098814,
"rewards/reward_func": -1.9519330382347106,
"step": 160,
"toxic_reward": 4.008814732233684
},
{
"clip_ratio": 0.0,
"completion_length": 41.7,
"epoch": 0.04017013232514178,
"format_reward": -2.5,
"grad_norm": 0.9174755811691284,
"image_reward": 0.2571976251072354,
"kl": 0.002261338901007548,
"learning_rate": 5e-06,
"loss": 0.1112,
"reward": -2.123195892572403,
"reward_std": 4.526358595490455,
"rewards/reward_func": -2.123195892572403,
"step": 170,
"toxic_reward": 3.4624782469537525
},
{
"clip_ratio": 0.0,
"completion_length": 42.25,
"epoch": 0.04253308128544423,
"format_reward": -3.5,
"grad_norm": 0.6067785024642944,
"image_reward": 0.26939900666475297,
"kl": 0.0012992891133762896,
"learning_rate": 5e-06,
"loss": 0.2051,
"reward": -3.432029390335083,
"reward_std": 5.464101791381836,
"rewards/reward_func": -3.432029390335083,
"step": 180,
"toxic_reward": 3.7212570786476133
},
{
"clip_ratio": 0.0,
"completion_length": 47.175,
"epoch": 0.04489603024574669,
"format_reward": -1.5,
"grad_norm": 0.32170766592025757,
"image_reward": 0.2965630425347222,
"kl": 0.00527564455405809,
"learning_rate": 5e-06,
"loss": -0.1016,
"reward": -1.44393031001091,
"reward_std": 3.0585690192878245,
"rewards/reward_func": -1.44393031001091,
"step": 190,
"toxic_reward": 4.017925447887844
},
{
"clip_ratio": 0.0,
"completion_length": 38.775,
"epoch": 0.04725897920604915,
"format_reward": -2.0,
"grad_norm": 0.5049771070480347,
"image_reward": 0.28580220490694047,
"kl": 0.003515976545168087,
"learning_rate": 5e-06,
"loss": 0.0836,
"reward": -1.8655982911586761,
"reward_std": 3.0476409645751117,
"rewards/reward_func": -1.8655982911586761,
"step": 200,
"toxic_reward": 3.8450406193733215
},
{
"clip_ratio": 0.0,
"completion_length": 37.05,
"epoch": 0.04962192816635161,
"format_reward": -2.0,
"grad_norm": 0.38584810495376587,
"image_reward": 0.2693684895833333,
"kl": 0.0034694685833528637,
"learning_rate": 5e-06,
"loss": 0.2267,
"reward": -2.040862238407135,
"reward_std": 3.0185029461979864,
"rewards/reward_func": -2.040862238407135,
"step": 210,
"toxic_reward": 4.567277669906616
},
{
"clip_ratio": 0.0,
"completion_length": 43.325,
"epoch": 0.05198487712665406,
"format_reward": -2.25,
"grad_norm": 0.7845410108566284,
"image_reward": 0.27211100459098814,
"kl": 0.0027843258751090614,
"learning_rate": 5e-06,
"loss": -0.0217,
"reward": -1.9688808619976044,
"reward_std": 4.326950389891863,
"rewards/reward_func": -1.9688808619976044,
"step": 220,
"toxic_reward": 3.998746132850647
},
{
"clip_ratio": 0.0,
"completion_length": 38.475,
"epoch": 0.05434782608695652,
"format_reward": -1.5,
"grad_norm": 0.28465747833251953,
"image_reward": 0.28180135041475296,
"kl": 0.0029356992337852715,
"learning_rate": 5e-06,
"loss": 0.1949,
"reward": -1.8664621770381928,
"reward_std": 3.3784094207920132,
"rewards/reward_func": -1.8664621770381928,
"step": 230,
"toxic_reward": 3.4729531943798064
},
{
"clip_ratio": 0.0,
"completion_length": 61.9,
"epoch": 0.05671077504725898,
"format_reward": -3.25,
"grad_norm": 0.42949026823043823,
"image_reward": 0.27780679166316985,
"kl": 0.00391890910686925,
"learning_rate": 5e-06,
"loss": 0.0808,
"reward": -2.961669445037842,
"reward_std": 4.58679872751236,
"rewards/reward_func": -2.961669445037842,
"step": 240,
"toxic_reward": 3.08537415266037
},
{
"clip_ratio": 0.0,
"completion_length": 43.8,
"epoch": 0.05907372400756144,
"format_reward": -2.25,
"grad_norm": 0.9381951093673706,
"image_reward": 0.276055908203125,
"kl": 0.017148628836730496,
"learning_rate": 5e-06,
"loss": -0.2303,
"reward": -2.1199170768260958,
"reward_std": 3.3072034239768984,
"rewards/reward_func": -2.1199170768260958,
"step": 250,
"toxic_reward": 3.4872416734695433
},
{
"clip_ratio": 0.0,
"completion_length": 45.7,
"epoch": 0.06143667296786389,
"format_reward": -0.75,
"grad_norm": 1.0572214126586914,
"image_reward": 0.26918131560087205,
"kl": 0.0040537358960136775,
"learning_rate": 5e-06,
"loss": -0.1228,
"reward": -0.41667274236679075,
"reward_std": 1.9968392252922058,
"rewards/reward_func": -0.41667274236679075,
"step": 260,
"toxic_reward": 4.103001546859741
},
{
"clip_ratio": 0.0,
"completion_length": 31.325,
"epoch": 0.06379962192816635,
"format_reward": -1.75,
"grad_norm": 0.33211401104927063,
"image_reward": 0.26322936862707136,
"kl": 0.010905979719245807,
"learning_rate": 5e-06,
"loss": 0.1079,
"reward": -1.2869422495365144,
"reward_std": 3.4389497309923174,
"rewards/reward_func": -1.2869422495365144,
"step": 270,
"toxic_reward": 4.132273650169372
},
{
"clip_ratio": 0.0,
"completion_length": 56.725,
"epoch": 0.0661625708884688,
"format_reward": -1.0,
"grad_norm": 1.1315058469772339,
"image_reward": 0.28337690565321183,
"kl": 0.003939477750100196,
"learning_rate": 5e-06,
"loss": 0.3379,
"reward": -1.081434178352356,
"reward_std": 1.8980566158890724,
"rewards/reward_func": -1.081434178352356,
"step": 280,
"toxic_reward": 4.38276841905382
},
{
"clip_ratio": 0.0,
"completion_length": 31.3,
"epoch": 0.06852551984877127,
"format_reward": -1.75,
"grad_norm": 0.35049131512641907,
"image_reward": 0.27892303466796875,
"kl": 0.04790264330804348,
"learning_rate": 5e-06,
"loss": -0.1138,
"reward": -1.20261852145195,
"reward_std": 3.7090243451297282,
"rewards/reward_func": -1.20261852145195,
"step": 290,
"toxic_reward": 3.994084894657135
},
{
"clip_ratio": 0.0,
"completion_length": 51.925,
"epoch": 0.07088846880907372,
"format_reward": -2.0,
"grad_norm": 0.5147161483764648,
"image_reward": 0.290887451171875,
"kl": 0.008263002592138946,
"learning_rate": 5e-06,
"loss": 0.0859,
"reward": -1.9743857204914093,
"reward_std": 3.558365413546562,
"rewards/reward_func": -1.9743857204914093,
"step": 300,
"toxic_reward": 3.2862678617239
},
{
"clip_ratio": 0.0,
"completion_length": 35.4,
"epoch": 0.07325141776937619,
"format_reward": -0.25,
"grad_norm": 0.4911198616027832,
"image_reward": 0.27863413393497466,
"kl": 0.0026858947356231512,
"learning_rate": 5e-06,
"loss": -0.1385,
"reward": -0.005173623561859131,
"reward_std": 1.2043775863945485,
"rewards/reward_func": -0.005173623561859131,
"step": 310,
"toxic_reward": 4.392678713798523
},
{
"clip_ratio": 0.0,
"completion_length": 49.925,
"epoch": 0.07561436672967864,
"format_reward": -2.5,
"grad_norm": 0.7166872024536133,
"image_reward": 0.2722563561466005,
"kl": 0.0117031121510081,
"learning_rate": 5e-06,
"loss": 0.1367,
"reward": -2.320690667629242,
"reward_std": 3.535179616510868,
"rewards/reward_func": -2.320690667629242,
"step": 320,
"toxic_reward": 3.1776589486334057
},
{
"clip_ratio": 0.0,
"completion_length": 36.3,
"epoch": 0.07797731568998109,
"format_reward": -0.75,
"grad_norm": 0.2700420618057251,
"image_reward": 0.29552409052848816,
"kl": 0.017672599526122212,
"learning_rate": 5e-06,
"loss": -0.1148,
"reward": -0.291591414809227,
"reward_std": 2.016152049601078,
"rewards/reward_func": -0.291591414809227,
"step": 330,
"toxic_reward": 3.240800154209137
},
{
"clip_ratio": 0.0,
"completion_length": 40.525,
"epoch": 0.08034026465028356,
"format_reward": -1.75,
"grad_norm": 0.4268760085105896,
"image_reward": 0.29228312373161314,
"kl": 0.0068239012965932485,
"learning_rate": 5e-06,
"loss": 0.1467,
"reward": -1.9235587894916535,
"reward_std": 2.7463727177120747,
"rewards/reward_func": -1.9235587894916535,
"step": 340,
"toxic_reward": 3.535431480407715
},
{
"clip_ratio": 0.0,
"completion_length": 64.05,
"epoch": 0.08270321361058601,
"format_reward": -1.5,
"grad_norm": 0.6083372235298157,
"image_reward": 0.2844095855951309,
"kl": 0.014765451126731933,
"learning_rate": 5e-06,
"loss": -0.0278,
"reward": -1.293799924850464,
"reward_std": 3.273481422662735,
"rewards/reward_func": -1.293799924850464,
"step": 350,
"toxic_reward": 3.966331052780151
},
{
"clip_ratio": 0.0,
"completion_length": 50.65,
"epoch": 0.08506616257088846,
"format_reward": -1.25,
"grad_norm": 0.72890704870224,
"image_reward": 0.2809214279055595,
"kl": 0.013393631461076439,
"learning_rate": 5e-06,
"loss": 0.0463,
"reward": -1.2338840126991273,
"reward_std": 2.9229114189743997,
"rewards/reward_func": -1.2338840126991273,
"step": 360,
"toxic_reward": 3.495128685235977
},
{
"clip_ratio": 0.0,
"completion_length": 40.5,
"epoch": 0.08742911153119093,
"format_reward": -1.0,
"grad_norm": 0.19754794239997864,
"image_reward": 0.26820373386144636,
"kl": 0.009714199486188591,
"learning_rate": 5e-06,
"loss": -0.0682,
"reward": -0.5900760173797608,
"reward_std": 2.301849504513666,
"rewards/reward_func": -0.5900760173797608,
"step": 370,
"toxic_reward": 4.155627632141114
},
{
"clip_ratio": 0.0,
"completion_length": 43.325,
"epoch": 0.08979206049149338,
"format_reward": -1.0,
"grad_norm": 0.7548431158065796,
"image_reward": 0.2744639068841934,
"kl": 0.02220306231174618,
"learning_rate": 5e-06,
"loss": 0.2135,
"reward": -1.0681762412190436,
"reward_std": 2.1873259781859815,
"rewards/reward_func": -1.0681762412190436,
"step": 380,
"toxic_reward": 3.818178777396679
},
{
"clip_ratio": 0.0,
"completion_length": 38.75,
"epoch": 0.09215500945179585,
"format_reward": -1.5,
"grad_norm": 1.6385910511016846,
"image_reward": 0.28105672299861906,
"kl": 0.010323460912331939,
"learning_rate": 5e-06,
"loss": 0.1059,
"reward": -1.3754307508468628,
"reward_std": 3.023836246691644,
"rewards/reward_func": -1.3754307508468628,
"step": 390,
"toxic_reward": 4.011180245876313
},
{
"clip_ratio": 0.0,
"completion_length": 45.55,
"epoch": 0.0945179584120983,
"format_reward": -2.0,
"grad_norm": 0.8115288615226746,
"image_reward": 0.28032633662223816,
"kl": 0.05955924341687933,
"learning_rate": 5e-06,
"loss": 0.0819,
"reward": -1.5853845477104187,
"reward_std": 4.042920933663845,
"rewards/reward_func": -1.5853845477104187,
"step": 400,
"toxic_reward": 3.5872471928596497
},
{
"clip_ratio": 0.0,
"completion_length": 36.675,
"epoch": 0.09688090737240075,
"format_reward": -0.75,
"grad_norm": 0.5388673543930054,
"image_reward": 0.2691446923547321,
"kl": 0.007679732237011194,
"learning_rate": 5e-06,
"loss": -0.0322,
"reward": -0.41340800523757937,
"reward_std": 2.081881234049797,
"rewards/reward_func": -0.41340800523757937,
"step": 410,
"toxic_reward": 3.857707765367296
},
{
"clip_ratio": 0.0,
"completion_length": 52.125,
"epoch": 0.09924385633270322,
"format_reward": -1.75,
"grad_norm": 0.2760399281978607,
"image_reward": 0.2856099456548691,
"kl": 0.06441240075509995,
"learning_rate": 5e-06,
"loss": 0.1586,
"reward": -1.576817613840103,
"reward_std": 3.0979749940335752,
"rewards/reward_func": -1.576817613840103,
"step": 420,
"toxic_reward": 3.6233551859855653
},
{
"clip_ratio": 0.0,
"completion_length": 42.9,
"epoch": 0.10160680529300567,
"format_reward": -2.25,
"grad_norm": 0.8334791660308838,
"image_reward": 0.314910888671875,
"kl": 0.13873190036974847,
"learning_rate": 5e-06,
"loss": 0.0671,
"reward": -1.9813659265637398,
"reward_std": 3.193006566166878,
"rewards/reward_func": -1.9813659265637398,
"step": 430,
"toxic_reward": 3.297185143828392
},
{
"clip_ratio": 0.0,
"completion_length": 41.6,
"epoch": 0.10396975425330812,
"format_reward": -0.25,
"grad_norm": 0.8734163045883179,
"image_reward": 0.2913574203848839,
"kl": 0.01563742496073246,
"learning_rate": 5e-06,
"loss": -0.0094,
"reward": 0.012267284095287323,
"reward_std": 1.1554903835058212,
"rewards/reward_func": 0.012267284095287323,
"step": 440,
"toxic_reward": 2.416472536325455
},
{
"clip_ratio": 0.0,
"completion_length": 36.825,
"epoch": 0.10633270321361059,
"format_reward": -1.5,
"grad_norm": 1.1138451099395752,
"image_reward": 0.28591206669807434,
"kl": 0.022654308984056116,
"learning_rate": 5e-06,
"loss": 0.0957,
"reward": -1.2695215404033662,
"reward_std": 2.8351699322462083,
"rewards/reward_func": -1.2695215404033662,
"step": 450,
"toxic_reward": 3.3260442495346068
},
{
"clip_ratio": 0.0,
"completion_length": 33.475,
"epoch": 0.10869565217391304,
"format_reward": -0.25,
"grad_norm": 0.6227550506591797,
"image_reward": 0.2807729095220566,
"kl": 0.007865939987823367,
"learning_rate": 5e-06,
"loss": 0.0179,
"reward": 1.1717996835708617,
"reward_std": 1.5977750271558762,
"rewards/reward_func": 1.1717996835708617,
"step": 460,
"toxic_reward": 3.309050977230072
},
{
"clip_ratio": 0.0,
"completion_length": 34.275,
"epoch": 0.1110586011342155,
"format_reward": -1.0,
"grad_norm": 0.3605867624282837,
"image_reward": 0.3162755310535431,
"kl": 0.02495001317001879,
"learning_rate": 5e-06,
"loss": 0.0852,
"reward": -1.0550554990768433,
"reward_std": 2.7155043721199035,
"rewards/reward_func": -1.0550554990768433,
"step": 470,
"toxic_reward": 3.4480915129184724
},
{
"clip_ratio": 0.0,
"completion_length": 43.0,
"epoch": 0.11342155009451796,
"format_reward": -0.5,
"grad_norm": 0.3204725980758667,
"image_reward": 0.32206115424633025,
"kl": 0.011832635500468314,
"learning_rate": 5e-06,
"loss": 0.032,
"reward": -0.29420808106660845,
"reward_std": 1.479024769924581,
"rewards/reward_func": -0.29420808106660845,
"step": 480,
"toxic_reward": 3.5640476822853087
},
{
"clip_ratio": 0.0,
"completion_length": 49.2,
"epoch": 0.11578449905482041,
"format_reward": -1.75,
"grad_norm": 0.6204938888549805,
"image_reward": 0.26227518618106843,
"kl": 0.03589183106087148,
"learning_rate": 5e-06,
"loss": 0.1186,
"reward": -1.7148385405540467,
"reward_std": 3.0656426630914213,
"rewards/reward_func": -1.7148385405540467,
"step": 490,
"toxic_reward": 4.371080112457276
},
{
"clip_ratio": 0.0,
"completion_length": 52.925,
"epoch": 0.11814744801512288,
"format_reward": -1.75,
"grad_norm": 0.7287388443946838,
"image_reward": 0.2700037628412247,
"kl": 0.045285335322842,
"learning_rate": 5e-06,
"loss": 0.0265,
"reward": -1.4807079195976258,
"reward_std": 3.789501038193703,
"rewards/reward_func": -1.4807079195976258,
"step": 500,
"toxic_reward": 3.9034363865852355
},
{
"clip_ratio": 0.0,
"completion_length": 35.75,
"epoch": 0.12051039697542533,
"format_reward": -1.0,
"grad_norm": 0.8750075697898865,
"image_reward": 0.2932400173611111,
"kl": 0.15442988513968886,
"learning_rate": 5e-06,
"loss": 0.0218,
"reward": -0.7209997951984406,
"reward_std": 1.843582271039486,
"rewards/reward_func": -0.7209997951984406,
"step": 510,
"toxic_reward": 3.8813175095452204
},
{
"clip_ratio": 0.0,
"completion_length": 44.125,
"epoch": 0.12287334593572778,
"format_reward": -0.5,
"grad_norm": 0.4498269259929657,
"image_reward": 0.29094645082950593,
"kl": 0.01979847764596343,
"learning_rate": 5e-06,
"loss": -0.0324,
"reward": -0.40604341179132464,
"reward_std": 1.4113173604011535,
"rewards/reward_func": -0.40604341179132464,
"step": 520,
"toxic_reward": 4.314427596330643
},
{
"clip_ratio": 0.0,
"completion_length": 34.025,
"epoch": 0.12523629489603025,
"format_reward": -1.0,
"grad_norm": 1.7480149269104004,
"image_reward": 0.2472829192876816,
"kl": 0.0556537595577538,
"learning_rate": 5e-06,
"loss": -0.1291,
"reward": -1.0481307327747345,
"reward_std": 2.317431343346834,
"rewards/reward_func": -1.0481307327747345,
"step": 530,
"toxic_reward": 4.515345811843872
},
{
"clip_ratio": 0.0,
"completion_length": 36.2,
"epoch": 0.1275992438563327,
"format_reward": -1.25,
"grad_norm": 0.39433184266090393,
"image_reward": 0.2873850494623184,
"kl": 0.039984302362427115,
"learning_rate": 5e-06,
"loss": 0.0301,
"reward": -0.2359391689300537,
"reward_std": 2.863342150300741,
"rewards/reward_func": -0.2359391689300537,
"step": 540,
"toxic_reward": 3.940987694263458
},
{
"clip_ratio": 0.0,
"completion_length": 42.15,
"epoch": 0.12996219281663515,
"format_reward": -1.25,
"grad_norm": 2.7985472679138184,
"image_reward": 0.30071309208869934,
"kl": 0.0283741801045835,
"learning_rate": 5e-06,
"loss": -0.0278,
"reward": -0.4430400252342224,
"reward_std": 2.761640505492687,
"rewards/reward_func": -0.4430400252342224,
"step": 550,
"toxic_reward": 3.235564041137695
},
{
"clip_ratio": 0.0,
"completion_length": 39.525,
"epoch": 0.1323251417769376,
"format_reward": -1.0,
"grad_norm": 1.208016037940979,
"image_reward": 0.29123942106962203,
"kl": 0.03811377864331007,
"learning_rate": 5e-06,
"loss": 0.2144,
"reward": -1.2057244956493378,
"reward_std": 2.0336616799235343,
"rewards/reward_func": -1.2057244956493378,
"step": 560,
"toxic_reward": 3.977510142326355
},
{
"clip_ratio": 0.0,
"completion_length": 27.55,
"epoch": 0.13468809073724008,
"format_reward": -1.5,
"grad_norm": 0.8842714428901672,
"image_reward": 0.2724670395255089,
"kl": 0.07012159014120697,
"learning_rate": 5e-06,
"loss": 0.1684,
"reward": -1.302715817093849,
"reward_std": 3.6691504657268523,
"rewards/reward_func": -1.302715817093849,
"step": 570,
"toxic_reward": 3.246229815483093
},
{
"clip_ratio": 0.0,
"completion_length": 51.6,
"epoch": 0.13705103969754254,
"format_reward": -0.75,
"grad_norm": 0.7157159447669983,
"image_reward": 0.299871826171875,
"kl": 0.028781934920698405,
"learning_rate": 5e-06,
"loss": -0.0308,
"reward": 0.11744136810302734,
"reward_std": 2.1306695722043516,
"rewards/reward_func": 0.11744136810302734,
"step": 580,
"toxic_reward": 3.35184041261673
},
{
"clip_ratio": 0.0,
"completion_length": 41.475,
"epoch": 0.139413988657845,
"format_reward": -0.25,
"grad_norm": 0.4593754708766937,
"image_reward": 0.2574858499897851,
"kl": 0.05173348039388657,
"learning_rate": 5e-06,
"loss": -0.1241,
"reward": 0.452265202999115,
"reward_std": 1.2885668274015187,
"rewards/reward_func": 0.452265202999115,
"step": 590,
"toxic_reward": 3.4634872145122952
},
{
"clip_ratio": 0.0,
"completion_length": 35.325,
"epoch": 0.14177693761814744,
"format_reward": -1.0,
"grad_norm": 0.5869470834732056,
"image_reward": 0.26802266389131546,
"kl": 0.2022853755392134,
"learning_rate": 5e-06,
"loss": -0.1205,
"reward": -0.9757636785507202,
"reward_std": 2.408064843714237,
"rewards/reward_func": -0.9757636785507202,
"step": 600,
"toxic_reward": 4.45868456363678
},
{
"clip_ratio": 0.0,
"completion_length": 39.45,
"epoch": 0.1441398865784499,
"format_reward": -2.75,
"grad_norm": 1.1131778955459595,
"image_reward": 0.26167353987693787,
"kl": 0.16366879558190703,
"learning_rate": 5e-06,
"loss": -0.0012,
"reward": -2.7253461956977842,
"reward_std": 4.713953969441354,
"rewards/reward_func": -2.7253461956977842,
"step": 610,
"toxic_reward": 3.5585821866989136
},
{
"clip_ratio": 0.0,
"completion_length": 40.65,
"epoch": 0.14650283553875237,
"format_reward": -1.0,
"grad_norm": 1.6662554740905762,
"image_reward": 0.2821828216314316,
"kl": 0.1423144882544875,
"learning_rate": 5e-06,
"loss": -0.0443,
"reward": -0.9905034899711609,
"reward_std": 2.6423311533406375,
"rewards/reward_func": -0.9905034899711609,
"step": 620,
"toxic_reward": 4.095821046829224
},
{
"clip_ratio": 0.0,
"completion_length": 55.85,
"epoch": 0.14886578449905483,
"format_reward": -1.0,
"grad_norm": 18.956981658935547,
"image_reward": 0.28932088166475295,
"kl": 0.41657317453064024,
"learning_rate": 5e-06,
"loss": 0.0324,
"reward": -0.8243820607662201,
"reward_std": 2.0909267283976076,
"rewards/reward_func": -0.8243820607662201,
"step": 630,
"toxic_reward": 3.2601676136255264
},
{
"clip_ratio": 0.0,
"completion_length": 29.875,
"epoch": 0.15122873345935728,
"format_reward": -0.75,
"grad_norm": 1.4686508178710938,
"image_reward": 0.29945882111787797,
"kl": 0.28281182143837214,
"learning_rate": 5e-06,
"loss": 0.0769,
"reward": -0.4713120386004448,
"reward_std": 1.791446179151535,
"rewards/reward_func": -0.4713120386004448,
"step": 640,
"toxic_reward": 3.3351209998130797
},
{
"clip_ratio": 0.0,
"completion_length": 37.725,
"epoch": 0.15359168241965973,
"format_reward": -0.5,
"grad_norm": 2.9935286045074463,
"image_reward": 0.2910970068640179,
"kl": 1.0141649260884151,
"learning_rate": 5e-06,
"loss": -0.2174,
"reward": -0.5139556050300598,
"reward_std": 1.0858815148472787,
"rewards/reward_func": -0.5139556050300598,
"step": 650,
"toxic_reward": 4.207416137059529
},
{
"clip_ratio": 0.0,
"completion_length": 39.2,
"epoch": 0.15595463137996218,
"format_reward": -0.75,
"grad_norm": 3.4974160194396973,
"image_reward": 0.29859237670898436,
"kl": 0.03742524515837431,
"learning_rate": 5e-06,
"loss": -0.0491,
"reward": -0.8688022553920746,
"reward_std": 1.9190378237515688,
"rewards/reward_func": -0.8688022553920746,
"step": 660,
"toxic_reward": 3.639171451330185
},
{
"clip_ratio": 0.0,
"completion_length": 44.625,
"epoch": 0.15831758034026466,
"format_reward": -0.75,
"grad_norm": 0.6731751561164856,
"image_reward": 0.2705291733145714,
"kl": 0.1289379763416946,
"learning_rate": 5e-06,
"loss": 0.0339,
"reward": -0.5425865709781647,
"reward_std": 2.217602302134037,
"rewards/reward_func": -0.5425865709781647,
"step": 670,
"toxic_reward": 3.7739344239234924
},
{
"clip_ratio": 0.0,
"completion_length": 49.9,
"epoch": 0.16068052930056712,
"format_reward": -1.0,
"grad_norm": 0.6705069541931152,
"image_reward": 0.2828119918704033,
"kl": 0.09238320724107325,
"learning_rate": 5e-06,
"loss": -0.0765,
"reward": -0.3722410202026367,
"reward_std": 1.9134121721610426,
"rewards/reward_func": -0.3722410202026367,
"step": 680,
"toxic_reward": 4.390137553215027
},
{
"clip_ratio": 0.0,
"completion_length": 51.225,
"epoch": 0.16304347826086957,
"format_reward": -1.0,
"grad_norm": 2.7068045139312744,
"image_reward": 0.27732340693473817,
"kl": 0.06089744158089161,
"learning_rate": 5e-06,
"loss": -0.0203,
"reward": -0.6177265048027039,
"reward_std": 2.210049830470234,
"rewards/reward_func": -0.6177265048027039,
"step": 690,
"toxic_reward": 3.5699973523616793
},
{
"clip_ratio": 0.0,
"completion_length": 46.125,
"epoch": 0.16540642722117202,
"format_reward": -1.25,
"grad_norm": 3.031416654586792,
"image_reward": 0.2965891510248184,
"kl": 0.8002684944309294,
"learning_rate": 5e-06,
"loss": 0.0719,
"reward": -0.29744131565093995,
"reward_std": 2.741807485371828,
"rewards/reward_func": -0.29744131565093995,
"step": 700,
"toxic_reward": 3.4483383893966675
},
{
"clip_ratio": 0.0,
"completion_length": 35.975,
"epoch": 0.16776937618147447,
"format_reward": -0.25,
"grad_norm": 3.4755773544311523,
"image_reward": 0.2723083525896072,
"kl": 0.24097473481670023,
"learning_rate": 5e-06,
"loss": 0.0494,
"reward": -0.21520038843154907,
"reward_std": 0.7798372395336628,
"rewards/reward_func": -0.21520038843154907,
"step": 710,
"toxic_reward": 4.5303761720657345
},
{
"clip_ratio": 0.0,
"completion_length": 43.725,
"epoch": 0.17013232514177692,
"format_reward": -0.25,
"grad_norm": 1.2503156661987305,
"image_reward": 0.27466329038143156,
"kl": 0.2257185777183622,
"learning_rate": 5e-06,
"loss": -0.0733,
"reward": 0.11292819976806641,
"reward_std": 1.212121632695198,
"rewards/reward_func": 0.11292819976806641,
"step": 720,
"toxic_reward": 4.0655577898025514
},
{
"clip_ratio": 0.0,
"completion_length": 39.225,
"epoch": 0.1724952741020794,
"format_reward": -1.5,
"grad_norm": 7.7392988204956055,
"image_reward": 0.2492055267095566,
"kl": 0.37416572365909817,
"learning_rate": 5e-06,
"loss": 0.0225,
"reward": -1.0509216010570526,
"reward_std": 3.409189415350556,
"rewards/reward_func": -1.0509216010570526,
"step": 730,
"toxic_reward": 4.022808003425598
},
{
"clip_ratio": 0.0,
"completion_length": 28.525,
"epoch": 0.17485822306238186,
"format_reward": -0.5,
"grad_norm": 4.889242172241211,
"image_reward": 0.30042317807674407,
"kl": 0.22789150793105364,
"learning_rate": 5e-06,
"loss": -0.0569,
"reward": -0.2479497730731964,
"reward_std": 1.3530383894219995,
"rewards/reward_func": -0.2479497730731964,
"step": 740,
"toxic_reward": 3.774165117740631
},
{
"clip_ratio": 0.0,
"completion_length": 50.55,
"epoch": 0.1772211720226843,
"format_reward": -1.5,
"grad_norm": 16.729528427124023,
"image_reward": 0.273948161303997,
"kl": 0.43975371681153774,
"learning_rate": 5e-06,
"loss": 0.1103,
"reward": -1.793390053510666,
"reward_std": 3.0602585028856994,
"rewards/reward_func": -1.793390053510666,
"step": 750,
"toxic_reward": 3.111769822239876
},
{
"clip_ratio": 0.0,
"completion_length": 42.675,
"epoch": 0.17958412098298676,
"format_reward": -0.25,
"grad_norm": 10.731781005859375,
"image_reward": 0.26650288701057434,
"kl": 0.6582286342978477,
"learning_rate": 5e-06,
"loss": 0.1081,
"reward": 0.10775105953216553,
"reward_std": 1.3219802690669895,
"rewards/reward_func": 0.10775105953216553,
"step": 760,
"toxic_reward": 4.1322005033493046
},
{
"clip_ratio": 0.0,
"completion_length": 44.0,
"epoch": 0.1819470699432892,
"format_reward": -0.75,
"grad_norm": 4.2282633781433105,
"image_reward": 0.28914388120174406,
"kl": 0.7939867446199059,
"learning_rate": 5e-06,
"loss": 0.0704,
"reward": -0.24524924755096436,
"reward_std": 2.0771213214844466,
"rewards/reward_func": -0.24524924755096436,
"step": 770,
"toxic_reward": 3.9121114134788515
},
{
"clip_ratio": 0.0,
"completion_length": 55.975,
"epoch": 0.1843100189035917,
"format_reward": -0.5,
"grad_norm": 8.486693382263184,
"image_reward": 0.246868896484375,
"kl": 1.14481502994895,
"learning_rate": 5e-06,
"loss": -0.0032,
"reward": 0.28170942068099974,
"reward_std": 2.0574716079980133,
"rewards/reward_func": 0.28170942068099974,
"step": 780,
"toxic_reward": 3.4702104151248934
},
{
"clip_ratio": 0.0,
"completion_length": 36.15,
"epoch": 0.18667296786389415,
"format_reward": -0.75,
"grad_norm": 27.51862907409668,
"image_reward": 0.26758320927619933,
"kl": 1.0921552445739509,
"learning_rate": 5e-06,
"loss": -0.3259,
"reward": -0.5566600695252418,
"reward_std": 1.7622592605650425,
"rewards/reward_func": -0.5566600695252418,
"step": 790,
"toxic_reward": 3.4233752876520156
},
{
"clip_ratio": 0.0,
"completion_length": 35.45,
"epoch": 0.1890359168241966,
"format_reward": -0.5,
"grad_norm": 4.040957927703857,
"image_reward": 0.3153462767601013,
"kl": 1.9678303502500056,
"learning_rate": 5e-06,
"loss": -0.1665,
"reward": -0.010482311248779297,
"reward_std": 1.1518827967345715,
"rewards/reward_func": -0.010482311248779297,
"step": 800,
"toxic_reward": 3.6056110084056856
},
{
"clip_ratio": 0.0,
"completion_length": 41.5,
"epoch": 0.19139886578449905,
"format_reward": -0.25,
"grad_norm": 12.718086242675781,
"image_reward": 0.27923176884651185,
"kl": 0.9990547701716423,
"learning_rate": 5e-06,
"loss": -0.0447,
"reward": 0.1995850086212158,
"reward_std": 1.246943424642086,
"rewards/reward_func": 0.1995850086212158,
"step": 810,
"toxic_reward": 3.635990482568741
},
{
"clip_ratio": 0.0,
"completion_length": 37.525,
"epoch": 0.1937618147448015,
"format_reward": -1.25,
"grad_norm": 5.244020938873291,
"image_reward": 0.27026468962430955,
"kl": 2.5741087660193442,
"learning_rate": 5e-06,
"loss": 0.0569,
"reward": -1.3374125480651855,
"reward_std": 2.818611039035022,
"rewards/reward_func": -1.3374125480651855,
"step": 820,
"toxic_reward": 4.255197846889496
},
{
"clip_ratio": 0.0,
"completion_length": 43.25,
"epoch": 0.19612476370510398,
"format_reward": -0.25,
"grad_norm": 1.3633440732955933,
"image_reward": 0.29616292417049406,
"kl": 0.48451304286718366,
"learning_rate": 5e-06,
"loss": 0.1053,
"reward": -0.34738388657569885,
"reward_std": 0.9195286151021719,
"rewards/reward_func": -0.34738388657569885,
"step": 830,
"toxic_reward": 4.384462606906891
},
{
"clip_ratio": 0.0,
"completion_length": 35.75,
"epoch": 0.19848771266540643,
"format_reward": -0.75,
"grad_norm": 6.93122673034668,
"image_reward": 0.2948842361569405,
"kl": 0.3984289012849331,
"learning_rate": 5e-06,
"loss": 0.007,
"reward": -0.4061413824558258,
"reward_std": 2.115474058687687,
"rewards/reward_func": -0.4061413824558258,
"step": 840,
"toxic_reward": 2.784619116783142
},
{
"clip_ratio": 0.0,
"completion_length": 58.85,
"epoch": 0.2008506616257089,
"format_reward": -1.0,
"grad_norm": 11.167367935180664,
"image_reward": 0.2535125732421875,
"kl": 0.7260896906256675,
"learning_rate": 5e-06,
"loss": -0.0252,
"reward": -0.6900001287460327,
"reward_std": 2.5411489391699433,
"rewards/reward_func": -0.6900001287460327,
"step": 850,
"toxic_reward": 3.902221655845642
},
{
"clip_ratio": 0.0,
"completion_length": 37.35,
"epoch": 0.20321361058601134,
"format_reward": -0.25,
"grad_norm": 12.129627227783203,
"image_reward": 0.25641682744026184,
"kl": 0.5523816287517548,
"learning_rate": 5e-06,
"loss": -0.0869,
"reward": 0.027270352840423583,
"reward_std": 1.1594479020684958,
"rewards/reward_func": 0.027270352840423583,
"step": 860,
"toxic_reward": 4.19142780303955
},
{
"clip_ratio": 0.0,
"completion_length": 50.75,
"epoch": 0.2055765595463138,
"format_reward": -1.0,
"grad_norm": 25.523523330688477,
"image_reward": 0.28674203488561845,
"kl": 1.1298049300909043,
"learning_rate": 5e-06,
"loss": 0.0639,
"reward": -1.0763263344764709,
"reward_std": 1.7480091962963342,
"rewards/reward_func": -1.0763263344764709,
"step": 870,
"toxic_reward": 4.468152364095052
},
{
"clip_ratio": 0.0,
"completion_length": 35.575,
"epoch": 0.20793950850661624,
"format_reward": -1.0,
"grad_norm": 3.8387675285339355,
"image_reward": 0.26868184506893156,
"kl": 0.9680751413106918,
"learning_rate": 5e-06,
"loss": -0.0833,
"reward": -0.8666846975684166,
"reward_std": 2.079224378615618,
"rewards/reward_func": -0.8666846975684166,
"step": 880,
"toxic_reward": 3.481996048986912
},
{
"clip_ratio": 0.0,
"completion_length": 33.825,
"epoch": 0.21030245746691872,
"format_reward": -0.5,
"grad_norm": 15.843626022338867,
"image_reward": 0.2802464798092842,
"kl": 0.49419727362692356,
"learning_rate": 5e-06,
"loss": 0.0241,
"reward": 0.11158292293548584,
"reward_std": 1.7106264479458333,
"rewards/reward_func": 0.11158292293548584,
"step": 890,
"toxic_reward": 3.7324341177940368
},
{
"clip_ratio": 0.0,
"completion_length": 47.3,
"epoch": 0.21266540642722118,
"format_reward": -0.25,
"grad_norm": 2.770407199859619,
"image_reward": 0.27023824155330656,
"kl": 0.2871086034923792,
"learning_rate": 5e-06,
"loss": 0.1861,
"reward": -0.27072116136550906,
"reward_std": 1.447587224841118,
"rewards/reward_func": -0.27072116136550906,
"step": 900,
"toxic_reward": 3.426037532091141
},
{
"clip_ratio": 0.0,
"completion_length": 33.0,
"epoch": 0.21502835538752363,
"format_reward": -0.5,
"grad_norm": 6.4211225509643555,
"image_reward": 0.2804026290774345,
"kl": 1.5080223519355058,
"learning_rate": 5e-06,
"loss": 0.0382,
"reward": -0.10845602005720138,
"reward_std": 1.7854840472340583,
"rewards/reward_func": -0.10845602005720138,
"step": 910,
"toxic_reward": 3.3229601860046385
},
{
"clip_ratio": 0.0,
"completion_length": 41.4,
"epoch": 0.21739130434782608,
"format_reward": -1.0,
"grad_norm": 1.846864938735962,
"image_reward": 0.29064489238791996,
"kl": 0.8340548906475306,
"learning_rate": 5e-06,
"loss": 0.0872,
"reward": -1.217875736951828,
"reward_std": 1.4547557694837452,
"rewards/reward_func": -1.217875736951828,
"step": 920,
"toxic_reward": 4.098645766576131
},
{
"clip_ratio": 0.0,
"completion_length": 39.3,
"epoch": 0.21975425330812853,
"format_reward": -0.5,
"grad_norm": 14.329817771911621,
"image_reward": 0.28984171748161314,
"kl": 0.3335365690290928,
"learning_rate": 5e-06,
"loss": 0.0341,
"reward": -0.14692462086677552,
"reward_std": 1.6654048651456832,
"rewards/reward_func": -0.14692462086677552,
"step": 930,
"toxic_reward": 3.8828285098075868
},
{
"clip_ratio": 0.0,
"completion_length": 39.15,
"epoch": 0.222117202268431,
"format_reward": -1.0,
"grad_norm": 13.11744499206543,
"image_reward": 0.2768778458237648,
"kl": 0.7420168094336986,
"learning_rate": 5e-06,
"loss": -0.1362,
"reward": -0.5828769445419312,
"reward_std": 2.509597599506378,
"rewards/reward_func": -0.5828769445419312,
"step": 940,
"toxic_reward": 3.994591364264488
},
{
"clip_ratio": 0.0,
"completion_length": 35.9,
"epoch": 0.22448015122873347,
"format_reward": -1.25,
"grad_norm": 1.4235849380493164,
"image_reward": 0.2599512729793787,
"kl": 0.23791442420333625,
"learning_rate": 5e-06,
"loss": 0.2081,
"reward": -0.7265825271606445,
"reward_std": 2.4457253187894823,
"rewards/reward_func": -0.7265825271606445,
"step": 950,
"toxic_reward": 4.328470140695572
},
{
"clip_ratio": 0.0,
"completion_length": 40.05,
"epoch": 0.22684310018903592,
"format_reward": -1.0,
"grad_norm": 10.51688003540039,
"image_reward": 0.29052734225988386,
"kl": 1.1104660354554654,
"learning_rate": 5e-06,
"loss": 0.1948,
"reward": -0.3963636875152588,
"reward_std": 2.6071507059037686,
"rewards/reward_func": -0.3963636875152588,
"step": 960,
"toxic_reward": 3.5060137271881104
},
{
"clip_ratio": 0.0,
"completion_length": 42.075,
"epoch": 0.22920604914933837,
"format_reward": 0.0,
"grad_norm": 0.5477933287620544,
"image_reward": 0.2825276702642441,
"kl": 0.24828157052397729,
"learning_rate": 5e-06,
"loss": 0.2364,
"reward": -0.024850471317768096,
"reward_std": 0.7480767840519548,
"rewards/reward_func": -0.024850471317768096,
"step": 970,
"toxic_reward": 3.07701745480299
},
{
"clip_ratio": 0.0,
"completion_length": 39.925,
"epoch": 0.23156899810964082,
"format_reward": -1.25,
"grad_norm": 6.296302318572998,
"image_reward": 0.26927467518382603,
"kl": 3.1552879590541125,
"learning_rate": 5e-06,
"loss": 0.0241,
"reward": -0.648174649477005,
"reward_std": 2.8984405621886253,
"rewards/reward_func": -0.648174649477005,
"step": 980,
"toxic_reward": 3.881988432672289
},
{
"clip_ratio": 0.0,
"completion_length": 50.475,
"epoch": 0.2339319470699433,
"format_reward": -1.0,
"grad_norm": 2.797386646270752,
"image_reward": 0.2668904632329941,
"kl": 1.7048991359770298,
"learning_rate": 5e-06,
"loss": -0.0828,
"reward": -1.1502302587032318,
"reward_std": 2.383236999064684,
"rewards/reward_func": -1.1502302587032318,
"step": 990,
"toxic_reward": 4.231578087806701
},
{
"clip_ratio": 0.0,
"completion_length": 39.425,
"epoch": 0.23629489603024575,
"format_reward": -0.75,
"grad_norm": 13.208063125610352,
"image_reward": 0.2917307555675507,
"kl": 0.7445122614502907,
"learning_rate": 5e-06,
"loss": -0.1073,
"reward": -0.7605196535587311,
"reward_std": 2.2064386613667013,
"rewards/reward_func": -0.7605196535587311,
"step": 1000,
"toxic_reward": 3.5633171044290064
},
{
"clip_ratio": 0.0,
"completion_length": 55.875,
"epoch": 0.2386578449905482,
"format_reward": -1.0,
"grad_norm": 10.358668327331543,
"image_reward": 0.26257934868335725,
"kl": 0.35015557184815405,
"learning_rate": 5e-06,
"loss": -0.0206,
"reward": -0.38898804783821106,
"reward_std": 2.7123206526041033,
"rewards/reward_func": -0.38898804783821106,
"step": 1010,
"toxic_reward": 3.609158730506897
},
{
"clip_ratio": 0.0,
"completion_length": 52.95,
"epoch": 0.24102079395085066,
"format_reward": -1.0,
"grad_norm": 9.602174758911133,
"image_reward": 0.289794921875,
"kl": 0.2867487147450447,
"learning_rate": 5e-06,
"loss": 0.0269,
"reward": -0.4154239475727081,
"reward_std": 2.4513496346771717,
"rewards/reward_func": -0.4154239475727081,
"step": 1020,
"toxic_reward": 4.2405922412872314
},
{
"clip_ratio": 0.0,
"completion_length": 52.5,
"epoch": 0.2433837429111531,
"format_reward": -1.0,
"grad_norm": 6.7750630378723145,
"image_reward": 0.2876515701413155,
"kl": 0.8189243379980325,
"learning_rate": 5e-06,
"loss": 0.0184,
"reward": -0.9024024844169617,
"reward_std": 2.123489296063781,
"rewards/reward_func": -0.9024024844169617,
"step": 1030,
"toxic_reward": 3.870901381969452
},
{
"clip_ratio": 0.0,
"completion_length": 55.1,
"epoch": 0.24574669187145556,
"format_reward": -0.5,
"grad_norm": 1.4051434993743896,
"image_reward": 0.2766723616255654,
"kl": 0.7713468134403229,
"learning_rate": 5e-06,
"loss": -0.1005,
"reward": 0.42890325784683225,
"reward_std": 1.7344073422253132,
"rewards/reward_func": 0.42890325784683225,
"step": 1040,
"toxic_reward": 3.850937591658698
},
{
"clip_ratio": 0.0,
"completion_length": 43.6,
"epoch": 0.24810964083175804,
"format_reward": -0.5,
"grad_norm": 10.04930591583252,
"image_reward": 0.2845031708478928,
"kl": 0.21945146545767785,
"learning_rate": 5e-06,
"loss": -0.0794,
"reward": -0.29822829365730286,
"reward_std": 2.0626097127795218,
"rewards/reward_func": -0.29822829365730286,
"step": 1050,
"toxic_reward": 3.3056647762656213
},
{
"clip_ratio": 0.0,
"completion_length": 36.675,
"epoch": 0.2504725897920605,
"format_reward": -0.25,
"grad_norm": 1.4483786821365356,
"image_reward": 0.2949198380112648,
"kl": 0.5147463826462626,
"learning_rate": 5e-06,
"loss": 0.0472,
"reward": -0.4302744150161743,
"reward_std": 0.9093868482857943,
"rewards/reward_func": -0.4302744150161743,
"step": 1060,
"toxic_reward": 4.118269920349121
},
{
"clip_ratio": 0.0,
"completion_length": 45.125,
"epoch": 0.252835538752363,
"format_reward": -0.75,
"grad_norm": 5.471806526184082,
"image_reward": 0.3024444580078125,
"kl": 0.924912228435278,
"learning_rate": 5e-06,
"loss": 0.0653,
"reward": -0.9226927876472473,
"reward_std": 1.8348794005811215,
"rewards/reward_func": -0.9226927876472473,
"step": 1070,
"toxic_reward": 3.55495400428772
},
{
"clip_ratio": 0.0,
"completion_length": 49.225,
"epoch": 0.2551984877126654,
"format_reward": -0.5,
"grad_norm": 6.291661739349365,
"image_reward": 0.30248311161994934,
"kl": 0.14056268222630025,
"learning_rate": 5e-06,
"loss": 0.1172,
"reward": -0.07832016348838806,
"reward_std": 1.6703550808131695,
"rewards/reward_func": -0.07832016348838806,
"step": 1080,
"toxic_reward": 3.876679849624634
},
{
"clip_ratio": 0.0,
"completion_length": 40.575,
"epoch": 0.2575614366729679,
"format_reward": 0.0,
"grad_norm": 5.747459411621094,
"image_reward": 0.268257649242878,
"kl": 0.20501487758010625,
"learning_rate": 5e-06,
"loss": 0.1961,
"reward": 0.8156829088926315,
"reward_std": 0.6415594108402729,
"rewards/reward_func": 0.8156829088926315,
"step": 1090,
"toxic_reward": 4.041116189956665
},
{
"clip_ratio": 0.0,
"completion_length": 43.5,
"epoch": 0.2599243856332703,
"format_reward": -0.25,
"grad_norm": 0.5391029715538025,
"image_reward": 0.27643330842256547,
"kl": 0.27743567544966935,
"learning_rate": 5e-06,
"loss": -0.0042,
"reward": 0.06835275292396545,
"reward_std": 1.1296793665736913,
"rewards/reward_func": 0.06835275292396545,
"step": 1100,
"toxic_reward": 3.7508057713508607
},
{
"clip_ratio": 0.0,
"completion_length": 44.475,
"epoch": 0.2622873345935728,
"format_reward": -0.75,
"grad_norm": 5.044631004333496,
"image_reward": 0.2711191803216934,
"kl": 0.08945430461317301,
"learning_rate": 5e-06,
"loss": 0.1402,
"reward": -0.8795787930488587,
"reward_std": 1.802781331539154,
"rewards/reward_func": -0.8795787930488587,
"step": 1110,
"toxic_reward": 3.978659760951996
},
{
"clip_ratio": 0.0,
"completion_length": 50.0,
"epoch": 0.2646502835538752,
"format_reward": -1.25,
"grad_norm": 10.223982810974121,
"image_reward": 0.2896250396966934,
"kl": 0.5244473532773555,
"learning_rate": 5e-06,
"loss": 0.1933,
"reward": -0.48248053193092344,
"reward_std": 2.971283960342407,
"rewards/reward_func": -0.48248053193092344,
"step": 1120,
"toxic_reward": 3.2150497317314146
},
{
"clip_ratio": 0.0,
"completion_length": 37.8,
"epoch": 0.2670132325141777,
"format_reward": -0.5,
"grad_norm": 3.6621553897857666,
"image_reward": 0.2852656051516533,
"kl": 0.5911644924432039,
"learning_rate": 5e-06,
"loss": 0.0576,
"reward": -0.3013936847448349,
"reward_std": 1.430125593394041,
"rewards/reward_func": -0.3013936847448349,
"step": 1130,
"toxic_reward": 4.058745819330215
},
{
"clip_ratio": 0.0,
"completion_length": 32.95,
"epoch": 0.26937618147448017,
"format_reward": -0.5,
"grad_norm": 24.121688842773438,
"image_reward": 0.2795908600091934,
"kl": 0.4301185546442866,
"learning_rate": 5e-06,
"loss": -0.0126,
"reward": -0.10317457914352417,
"reward_std": 1.667516409419477,
"rewards/reward_func": -0.10317457914352417,
"step": 1140,
"toxic_reward": 4.072073769569397
},
{
"clip_ratio": 0.0,
"completion_length": 42.025,
"epoch": 0.2717391304347826,
"format_reward": 0.0,
"grad_norm": 0.7166000604629517,
"image_reward": 0.2804423004388809,
"kl": 0.675014778599143,
"learning_rate": 5e-06,
"loss": -0.0049,
"reward": 0.4254330635070801,
"reward_std": 0.9621219031512738,
"rewards/reward_func": 0.4254330635070801,
"step": 1150,
"toxic_reward": 3.471704053878784
},
{
"clip_ratio": 0.0,
"completion_length": 33.175,
"epoch": 0.2741020793950851,
"format_reward": -1.0,
"grad_norm": 1.803680658340454,
"image_reward": 0.31466064155101775,
"kl": 0.344609697163105,
"learning_rate": 5e-06,
"loss": -0.1152,
"reward": -0.5670508742332458,
"reward_std": 2.301799529790878,
"rewards/reward_func": -0.5670508742332458,
"step": 1160,
"toxic_reward": 3.554426383972168
},
{
"clip_ratio": 0.0,
"completion_length": 31.475,
"epoch": 0.2764650283553875,
"format_reward": 0.0,
"grad_norm": 7.919179439544678,
"image_reward": 0.26389770656824113,
"kl": 0.8297407850623131,
"learning_rate": 5e-06,
"loss": -0.28,
"reward": 0.23291709423065185,
"reward_std": 0.47383863255381586,
"rewards/reward_func": 0.23291709423065185,
"step": 1170,
"toxic_reward": 4.360145711898804
},
{
"clip_ratio": 0.0,
"completion_length": 36.0,
"epoch": 0.27882797731569,
"format_reward": -0.5,
"grad_norm": 294.2972106933594,
"image_reward": 0.2640360534191132,
"kl": 0.9242212943732738,
"learning_rate": 5e-06,
"loss": 0.017,
"reward": -0.04461590349674225,
"reward_std": 1.7138214907608926,
"rewards/reward_func": -0.04461590349674225,
"step": 1180,
"toxic_reward": 3.5669440746307375
},
{
"clip_ratio": 0.0,
"completion_length": 60.525,
"epoch": 0.28119092627599246,
"format_reward": -0.25,
"grad_norm": 0.6788994669914246,
"image_reward": 0.2832122802734375,
"kl": 6.060492021404206,
"learning_rate": 5e-06,
"loss": 0.1039,
"reward": 0.30282129645347594,
"reward_std": 1.3184241026639938,
"rewards/reward_func": 0.30282129645347594,
"step": 1190,
"toxic_reward": 3.858977997303009
},
{
"clip_ratio": 0.0,
"completion_length": 53.5,
"epoch": 0.2835538752362949,
"format_reward": -1.0,
"grad_norm": 2.821944236755371,
"image_reward": 0.292755126953125,
"kl": 0.2833241932094097,
"learning_rate": 5e-06,
"loss": -0.0765,
"reward": -0.8336254239082337,
"reward_std": 2.1170720741152764,
"rewards/reward_func": -0.8336254239082337,
"step": 1200,
"toxic_reward": 4.131281018257141
},
{
"clip_ratio": 0.0,
"completion_length": 45.975,
"epoch": 0.28591682419659736,
"format_reward": -0.75,
"grad_norm": 5.20048189163208,
"image_reward": 0.3018681839108467,
"kl": 0.26484427275136113,
"learning_rate": 5e-06,
"loss": 0.0037,
"reward": -0.23466770052909852,
"reward_std": 2.3572978913784026,
"rewards/reward_func": -0.23466770052909852,
"step": 1210,
"toxic_reward": 3.701621878147125
},
{
"clip_ratio": 0.0,
"completion_length": 53.95,
"epoch": 0.2882797731568998,
"format_reward": -0.75,
"grad_norm": 2.5671803951263428,
"image_reward": 0.2591837555170059,
"kl": 0.27887978348881004,
"learning_rate": 5e-06,
"loss": 0.1531,
"reward": -0.5629445493221283,
"reward_std": 2.2025086715817452,
"rewards/reward_func": -0.5629445493221283,
"step": 1220,
"toxic_reward": 3.878066289424896
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 0.29064272211720227,
"format_reward": -0.25,
"grad_norm": 1.3592997789382935,
"image_reward": 0.2804290771484375,
"kl": 0.7250507925637066,
"learning_rate": 5e-06,
"loss": -0.1268,
"reward": 0.029623252153396607,
"reward_std": 1.3399539720267057,
"rewards/reward_func": 0.029623252153396607,
"step": 1230,
"toxic_reward": 3.5630233764648436
},
{
"clip_ratio": 0.0,
"completion_length": 32.675,
"epoch": 0.29300567107750475,
"format_reward": -1.25,
"grad_norm": 6.867509365081787,
"image_reward": 0.2880493178963661,
"kl": 0.46422886326909063,
"learning_rate": 5e-06,
"loss": 0.0122,
"reward": -1.0097105741500854,
"reward_std": 2.696252405457199,
"rewards/reward_func": -1.0097105741500854,
"step": 1240,
"toxic_reward": 4.076703870296479
},
{
"clip_ratio": 0.0,
"completion_length": 47.4,
"epoch": 0.2953686200378072,
"format_reward": -0.25,
"grad_norm": 4.707825183868408,
"image_reward": 0.256890869140625,
"kl": 0.1788209406659007,
"learning_rate": 5e-06,
"loss": -0.0197,
"reward": 0.38095744252204894,
"reward_std": 1.2988073959946633,
"rewards/reward_func": 0.38095744252204894,
"step": 1250,
"toxic_reward": 3.8400187373161314
},
{
"clip_ratio": 0.0,
"completion_length": 35.625,
"epoch": 0.29773156899810965,
"format_reward": -0.75,
"grad_norm": 1.229298710823059,
"image_reward": 0.313336181640625,
"kl": 0.33243545759469273,
"learning_rate": 5e-06,
"loss": 0.1516,
"reward": -0.5754710257053375,
"reward_std": 1.8287720288150013,
"rewards/reward_func": -0.5754710257053375,
"step": 1260,
"toxic_reward": 4.415339708328247
},
{
"clip_ratio": 0.0,
"completion_length": 60.25,
"epoch": 0.3000945179584121,
"format_reward": -0.5,
"grad_norm": 0.5034794807434082,
"image_reward": 0.27869771271944044,
"kl": 0.38923515090718863,
"learning_rate": 5e-06,
"loss": -0.0327,
"reward": -0.4456570327281952,
"reward_std": 1.5328068390488625,
"rewards/reward_func": -0.4456570327281952,
"step": 1270,
"toxic_reward": 3.8723622620105744
},
{
"clip_ratio": 0.0,
"completion_length": 55.425,
"epoch": 0.30245746691871456,
"format_reward": -1.0,
"grad_norm": 1.2214823961257935,
"image_reward": 0.2668467193841934,
"kl": 1.2360946209169925,
"learning_rate": 5e-06,
"loss": 0.1227,
"reward": -0.9184286594390869,
"reward_std": 2.3616207716986537,
"rewards/reward_func": -0.9184286594390869,
"step": 1280,
"toxic_reward": 4.0201707005500795
},
{
"clip_ratio": 0.0,
"completion_length": 42.525,
"epoch": 0.30482041587901704,
"format_reward": -0.5,
"grad_norm": 0.6785597205162048,
"image_reward": 0.27662353664636613,
"kl": 0.6153190754354,
"learning_rate": 5e-06,
"loss": -0.0909,
"reward": -0.025622844696044922,
"reward_std": 1.7058033104985952,
"rewards/reward_func": -0.025622844696044922,
"step": 1290,
"toxic_reward": 3.283605984598398
},
{
"clip_ratio": 0.0,
"completion_length": 48.475,
"epoch": 0.30718336483931946,
"format_reward": -0.5,
"grad_norm": 1.5470991134643555,
"image_reward": 0.28620096743106843,
"kl": 1.3450787207111716,
"learning_rate": 5e-06,
"loss": -0.0773,
"reward": 0.39500882625579836,
"reward_std": 1.9240341871976852,
"rewards/reward_func": 0.39500882625579836,
"step": 1300,
"toxic_reward": 3.8390918374061584
},
{
"clip_ratio": 0.0,
"completion_length": 74.6,
"epoch": 0.30954631379962194,
"format_reward": 0.0,
"grad_norm": 4.827681541442871,
"image_reward": 0.2871856689453125,
"kl": 0.2589964304119349,
"learning_rate": 5e-06,
"loss": -0.0753,
"reward": -0.08085522651672364,
"reward_std": 0.7007970325648785,
"rewards/reward_func": -0.08085522651672364,
"step": 1310,
"toxic_reward": 4.15708065032959
},
{
"clip_ratio": 0.0,
"completion_length": 35.175,
"epoch": 0.31190926275992437,
"format_reward": 0.0,
"grad_norm": 2.559379816055298,
"image_reward": 0.28839518427848815,
"kl": 1.160063625872135,
"learning_rate": 5e-06,
"loss": -0.1169,
"reward": 0.457793202996254,
"reward_std": 0.8301180111244321,
"rewards/reward_func": 0.457793202996254,
"step": 1320,
"toxic_reward": 3.847675251960754
},
{
"clip_ratio": 0.0,
"completion_length": 46.05,
"epoch": 0.31427221172022685,
"format_reward": -0.5,
"grad_norm": 1.0227330923080444,
"image_reward": 0.25479024201631545,
"kl": 5.228898542746902,
"learning_rate": 5e-06,
"loss": 0.1453,
"reward": -0.19808580130338668,
"reward_std": 1.237728140875697,
"rewards/reward_func": -0.19808580130338668,
"step": 1330,
"toxic_reward": 3.487361752986908
},
{
"clip_ratio": 0.0,
"completion_length": 41.375,
"epoch": 0.3166351606805293,
"format_reward": 0.0,
"grad_norm": 2.158604383468628,
"image_reward": 0.27274220883846284,
"kl": 5.145803064666689,
"learning_rate": 5e-06,
"loss": 0.0016,
"reward": 0.5905790150165557,
"reward_std": 1.0763475911691784,
"rewards/reward_func": 0.5905790150165557,
"step": 1340,
"toxic_reward": 3.561137008666992
},
{
"clip_ratio": 0.0,
"completion_length": 33.9,
"epoch": 0.31899810964083175,
"format_reward": -0.25,
"grad_norm": 1.078782081604004,
"image_reward": 0.27456156313419344,
"kl": 4.645642199181021,
"learning_rate": 5e-06,
"loss": 0.0272,
"reward": 0.0937275767326355,
"reward_std": 1.5942428700625897,
"rewards/reward_func": 0.0937275767326355,
"step": 1350,
"toxic_reward": 3.385586667060852
},
{
"clip_ratio": 0.0,
"completion_length": 39.05,
"epoch": 0.32136105860113423,
"format_reward": -0.25,
"grad_norm": 2.4886958599090576,
"image_reward": 0.27929331362247467,
"kl": 0.6772738939616829,
"learning_rate": 5e-06,
"loss": -0.1689,
"reward": 0.10146453976631165,
"reward_std": 1.4149208962917328,
"rewards/reward_func": 0.10146453976631165,
"step": 1360,
"toxic_reward": 4.062562417984009
},
{
"clip_ratio": 0.0,
"completion_length": 56.025,
"epoch": 0.32372400756143666,
"format_reward": 0.0,
"grad_norm": 0.45091304183006287,
"image_reward": 0.26109618991613387,
"kl": 1.1132759511470796,
"learning_rate": 5e-06,
"loss": 0.0773,
"reward": 0.4344749391078949,
"reward_std": 0.6906750492751599,
"rewards/reward_func": 0.4344749391078949,
"step": 1370,
"toxic_reward": 3.89659765958786
},
{
"clip_ratio": 0.0,
"completion_length": 54.1,
"epoch": 0.32608695652173914,
"format_reward": -1.0,
"grad_norm": 2.2919623851776123,
"image_reward": 0.2507191985845566,
"kl": 2.863751105964184,
"learning_rate": 5e-06,
"loss": 0.0426,
"reward": -0.3381307005882263,
"reward_std": 1.9777413787320257,
"rewards/reward_func": -0.3381307005882263,
"step": 1380,
"toxic_reward": 4.168315529823303
},
{
"clip_ratio": 0.0,
"completion_length": 59.7,
"epoch": 0.3284499054820416,
"format_reward": 0.0,
"grad_norm": 17.546894073486328,
"image_reward": 0.2879852294921875,
"kl": 1.016882681287825,
"learning_rate": 5e-06,
"loss": 0.065,
"reward": -0.029438415169715883,
"reward_std": 0.3044209867715836,
"rewards/reward_func": -0.029438415169715883,
"step": 1390,
"toxic_reward": 3.8181951224803923
},
{
"clip_ratio": 0.0,
"completion_length": 44.425,
"epoch": 0.33081285444234404,
"format_reward": -0.75,
"grad_norm": 3.9508233070373535,
"image_reward": 0.3041224151849747,
"kl": 1.2148886673152446,
"learning_rate": 5e-06,
"loss": 0.0983,
"reward": -0.08471554517745972,
"reward_std": 2.0540446445345877,
"rewards/reward_func": -0.08471554517745972,
"step": 1400,
"toxic_reward": 4.20858781337738
},
{
"clip_ratio": 0.0,
"completion_length": 43.5,
"epoch": 0.3331758034026465,
"format_reward": -0.75,
"grad_norm": 23.3671817779541,
"image_reward": 0.2869578033685684,
"kl": 9.38541857972741,
"learning_rate": 5e-06,
"loss": -0.0633,
"reward": -0.21220148205757142,
"reward_std": 2.147160884644836,
"rewards/reward_func": -0.21220148205757142,
"step": 1410,
"toxic_reward": 3.646671336889267
},
{
"clip_ratio": 0.0,
"completion_length": 43.5,
"epoch": 0.33553875236294894,
"format_reward": -0.5,
"grad_norm": 5.768739223480225,
"image_reward": 0.29927419126033783,
"kl": 3.124450533092022,
"learning_rate": 5e-06,
"loss": 0.0205,
"reward": -0.20792179703712463,
"reward_std": 1.7920773405581714,
"rewards/reward_func": -0.20792179703712463,
"step": 1420,
"toxic_reward": 3.938745903968811
},
{
"clip_ratio": 0.0,
"completion_length": 55.3,
"epoch": 0.3379017013232514,
"format_reward": -0.75,
"grad_norm": 7.805192947387695,
"image_reward": 0.2781646728515625,
"kl": 9.086061615869403,
"learning_rate": 5e-06,
"loss": -0.1052,
"reward": 0.26188963651657104,
"reward_std": 1.916423682682216,
"rewards/reward_func": 0.26188963651657104,
"step": 1430,
"toxic_reward": 3.8569429397583006
},
{
"clip_ratio": 0.0,
"completion_length": 48.075,
"epoch": 0.34026465028355385,
"format_reward": 0.0,
"grad_norm": 6.398307800292969,
"image_reward": 0.28298187255859375,
"kl": 3.477000297047198,
"learning_rate": 5e-06,
"loss": -0.1652,
"reward": 0.2245475471019745,
"reward_std": 0.7394228018820286,
"rewards/reward_func": 0.2245475471019745,
"step": 1440,
"toxic_reward": 3.977894365787506
},
{
"clip_ratio": 0.0,
"completion_length": 49.375,
"epoch": 0.34262759924385633,
"format_reward": -0.25,
"grad_norm": 15.553762435913086,
"image_reward": 0.26566060483455656,
"kl": 17.512660111114382,
"learning_rate": 5e-06,
"loss": -0.0446,
"reward": 0.14810482859611512,
"reward_std": 1.504632395505905,
"rewards/reward_func": 0.14810482859611512,
"step": 1450,
"toxic_reward": 3.5254761219024657
},
{
"clip_ratio": 0.0,
"completion_length": 49.475,
"epoch": 0.3449905482041588,
"format_reward": -1.0,
"grad_norm": 2.524869918823242,
"image_reward": 0.2846649169921875,
"kl": 1.9967870802618564,
"learning_rate": 5e-06,
"loss": -0.002,
"reward": -1.1410660862922668,
"reward_std": 2.0114028319716453,
"rewards/reward_func": -1.1410660862922668,
"step": 1460,
"toxic_reward": 4.1038308382034305
},
{
"clip_ratio": 0.0,
"completion_length": 55.125,
"epoch": 0.34735349716446123,
"format_reward": -1.0,
"grad_norm": 5.871716499328613,
"image_reward": 0.2953603118658066,
"kl": 1.2091532168909906,
"learning_rate": 5e-06,
"loss": 0.1278,
"reward": -0.8882034704089165,
"reward_std": 2.2325065452605486,
"rewards/reward_func": -0.8882034704089165,
"step": 1470,
"toxic_reward": 2.88705118894577
},
{
"clip_ratio": 0.0,
"completion_length": 51.5,
"epoch": 0.3497164461247637,
"format_reward": 0.0,
"grad_norm": 5.483914852142334,
"image_reward": 0.2924133285880089,
"kl": 20.523441922478376,
"learning_rate": 5e-06,
"loss": 0.0722,
"reward": 0.036271828413009646,
"reward_std": 1.0079955972731114,
"rewards/reward_func": 0.036271828413009646,
"step": 1480,
"toxic_reward": 3.1371969431638718
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 0.35207939508506614,
"format_reward": 0.0,
"grad_norm": 1.4849286079406738,
"image_reward": 0.2851186111569405,
"kl": 2.15047435965389,
"learning_rate": 5e-06,
"loss": 0.0946,
"reward": 0.441963791847229,
"reward_std": 0.4248314931988716,
"rewards/reward_func": 0.441963791847229,
"step": 1490,
"toxic_reward": 3.752102476358414
},
{
"clip_ratio": 0.0,
"completion_length": 40.0,
"epoch": 0.3544423440453686,
"format_reward": -0.25,
"grad_norm": 2.0869834423065186,
"image_reward": 0.2911224365234375,
"kl": 2.3197390008717775,
"learning_rate": 5e-06,
"loss": -0.0123,
"reward": 0.13550712168216705,
"reward_std": 1.141077246889472,
"rewards/reward_func": 0.13550712168216705,
"step": 1500,
"toxic_reward": 3.36595538854599
},
{
"clip_ratio": 0.0,
"completion_length": 40.6,
"epoch": 0.3568052930056711,
"format_reward": -0.25,
"grad_norm": 2.7763924598693848,
"image_reward": 0.28095703125,
"kl": 0.8447903416119515,
"learning_rate": 5e-06,
"loss": 0.1001,
"reward": 0.07771911025047303,
"reward_std": 1.3111265070736409,
"rewards/reward_func": 0.07771911025047303,
"step": 1510,
"toxic_reward": 4.060403060913086
},
{
"clip_ratio": 0.0,
"completion_length": 55.575,
"epoch": 0.3591682419659735,
"format_reward": -0.25,
"grad_norm": 11.143818855285645,
"image_reward": 0.2903269439935684,
"kl": 1.106547536328435,
"learning_rate": 5e-06,
"loss": -0.0008,
"reward": 0.7432255536317826,
"reward_std": 1.0503722863271832,
"rewards/reward_func": 0.7432255536317826,
"step": 1520,
"toxic_reward": 3.4027091443538664
},
{
"clip_ratio": 0.0,
"completion_length": 43.825,
"epoch": 0.361531190926276,
"format_reward": -0.25,
"grad_norm": 6.157534599304199,
"image_reward": 0.2823811858892441,
"kl": 0.7211934769526124,
"learning_rate": 5e-06,
"loss": -0.0288,
"reward": 0.11932253241539001,
"reward_std": 1.307121137715876,
"rewards/reward_func": 0.11932253241539001,
"step": 1530,
"toxic_reward": 3.8783162236213684
},
{
"clip_ratio": 0.0,
"completion_length": 37.075,
"epoch": 0.3638941398865784,
"format_reward": -1.0,
"grad_norm": 2.383302688598633,
"image_reward": 0.28258056491613387,
"kl": 3.830422883108258,
"learning_rate": 5e-06,
"loss": -0.0149,
"reward": -0.39380887150764465,
"reward_std": 2.854560297727585,
"rewards/reward_func": -0.39380887150764465,
"step": 1540,
"toxic_reward": 3.2431194216012953
},
{
"clip_ratio": 0.0,
"completion_length": 44.55,
"epoch": 0.3662570888468809,
"format_reward": -0.25,
"grad_norm": 2.1643450260162354,
"image_reward": 0.2872863754630089,
"kl": 0.3903345447033644,
"learning_rate": 5e-06,
"loss": 0.2399,
"reward": 0.23153584003448485,
"reward_std": 1.3368525609374047,
"rewards/reward_func": 0.23153584003448485,
"step": 1550,
"toxic_reward": 3.452616012096405
},
{
"clip_ratio": 0.0,
"completion_length": 36.625,
"epoch": 0.3686200378071834,
"format_reward": -0.25,
"grad_norm": 0.922444224357605,
"image_reward": 0.29551798701286314,
"kl": 0.9415501815266907,
"learning_rate": 5e-06,
"loss": 0.0285,
"reward": 0.2152680218219757,
"reward_std": 1.0939797786995769,
"rewards/reward_func": 0.2152680218219757,
"step": 1560,
"toxic_reward": 4.278083491325378
},
{
"clip_ratio": 0.0,
"completion_length": 58.65,
"epoch": 0.3709829867674858,
"format_reward": -1.0,
"grad_norm": 1.9485223293304443,
"image_reward": 0.289756266772747,
"kl": 0.52877401644364,
"learning_rate": 5e-06,
"loss": -0.0562,
"reward": -0.7691292554140091,
"reward_std": 2.194984516873956,
"rewards/reward_func": -0.7691292554140091,
"step": 1570,
"toxic_reward": 3.7907654672861097
},
{
"clip_ratio": 0.0,
"completion_length": 34.35,
"epoch": 0.3733459357277883,
"format_reward": -0.75,
"grad_norm": 4.795892238616943,
"image_reward": 0.29136555939912795,
"kl": 1.7273975620046258,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": -0.2548545479774475,
"reward_std": 2.3145264372229577,
"rewards/reward_func": -0.2548545479774475,
"step": 1580,
"toxic_reward": 3.206251806020737
},
{
"clip_ratio": 0.0,
"completion_length": 51.15,
"epoch": 0.3757088846880907,
"format_reward": -0.25,
"grad_norm": 1.6828984022140503,
"image_reward": 0.29258016049861907,
"kl": 0.27110366327688096,
"learning_rate": 5e-06,
"loss": 0.0295,
"reward": 0.2889214813709259,
"reward_std": 1.4156969770789147,
"rewards/reward_func": 0.2889214813709259,
"step": 1590,
"toxic_reward": 3.8408302307128905
},
{
"clip_ratio": 0.0,
"completion_length": 48.075,
"epoch": 0.3780718336483932,
"format_reward": -0.5,
"grad_norm": 2.8415489196777344,
"image_reward": 0.2738067626953125,
"kl": 1.5718746781349182,
"learning_rate": 5e-06,
"loss": 0.0332,
"reward": -0.4795783460140228,
"reward_std": 1.1532321106642485,
"rewards/reward_func": -0.4795783460140228,
"step": 1600,
"toxic_reward": 3.8701359391212464
},
{
"clip_ratio": 0.0,
"completion_length": 55.3,
"epoch": 0.3804347826086957,
"format_reward": -0.5,
"grad_norm": 0.4898248612880707,
"image_reward": 0.28651835173368456,
"kl": 0.5627498641610146,
"learning_rate": 5e-06,
"loss": 0.1267,
"reward": -0.35464051365852356,
"reward_std": 1.5732567172497511,
"rewards/reward_func": -0.35464051365852356,
"step": 1610,
"toxic_reward": 4.116016793251037
},
{
"clip_ratio": 0.0,
"completion_length": 44.0,
"epoch": 0.3827977315689981,
"format_reward": -0.5,
"grad_norm": 1.5352033376693726,
"image_reward": 0.29410196989774706,
"kl": 1.2344657305628062,
"learning_rate": 5e-06,
"loss": 0.1575,
"reward": -0.4094507694244385,
"reward_std": 1.245941134635359,
"rewards/reward_func": -0.4094507694244385,
"step": 1620,
"toxic_reward": 4.569849014282227
},
{
"clip_ratio": 0.0,
"completion_length": 75.625,
"epoch": 0.3851606805293006,
"format_reward": -1.0,
"grad_norm": 0.5829593539237976,
"image_reward": 0.280389404296875,
"kl": 1.36093844124116,
"learning_rate": 5e-06,
"loss": 0.1245,
"reward": -0.31835838556289675,
"reward_std": 2.00613936111331,
"rewards/reward_func": -0.31835838556289675,
"step": 1630,
"toxic_reward": 4.125273871421814
},
{
"clip_ratio": 0.0,
"completion_length": 50.425,
"epoch": 0.387523629489603,
"format_reward": -0.75,
"grad_norm": 0.8723268508911133,
"image_reward": 0.27287851870059965,
"kl": 0.15645003337413071,
"learning_rate": 5e-06,
"loss": -0.039,
"reward": -0.8851189732551574,
"reward_std": 2.1296220384538174,
"rewards/reward_func": -0.8851189732551574,
"step": 1640,
"toxic_reward": 3.323053848743439
},
{
"clip_ratio": 0.0,
"completion_length": 35.35,
"epoch": 0.3898865784499055,
"format_reward": -0.75,
"grad_norm": 0.14725980162620544,
"image_reward": 0.28720601350069047,
"kl": 1.1328919077292086,
"learning_rate": 5e-06,
"loss": 0.0133,
"reward": -0.12160237431526184,
"reward_std": 1.725741315446794,
"rewards/reward_func": -0.12160237431526184,
"step": 1650,
"toxic_reward": 3.924569344520569
},
{
"clip_ratio": 0.0,
"completion_length": 34.05,
"epoch": 0.39224952741020797,
"format_reward": -0.5,
"grad_norm": 2.200639009475708,
"image_reward": 0.2846842437982559,
"kl": 0.11551734725944698,
"learning_rate": 5e-06,
"loss": -0.0781,
"reward": 0.11074192523956299,
"reward_std": 1.8953823536634444,
"rewards/reward_func": 0.11074192523956299,
"step": 1660,
"toxic_reward": 3.5436886310577393
},
{
"clip_ratio": 0.0,
"completion_length": 30.925,
"epoch": 0.3946124763705104,
"format_reward": 0.0,
"grad_norm": 3.0496935844421387,
"image_reward": 0.2790842682123184,
"kl": 2.538264278974384,
"learning_rate": 5e-06,
"loss": -0.1096,
"reward": 0.14284086227416992,
"reward_std": 0.8084073163568973,
"rewards/reward_func": 0.14284086227416992,
"step": 1670,
"toxic_reward": 4.144779133796692
},
{
"clip_ratio": 0.0,
"completion_length": 42.675,
"epoch": 0.39697542533081287,
"format_reward": -0.5,
"grad_norm": 0.9690385460853577,
"image_reward": 0.2903676345944405,
"kl": 3.7070351759903133,
"learning_rate": 5e-06,
"loss": 0.1427,
"reward": 0.008394747972488403,
"reward_std": 1.8407307181507349,
"rewards/reward_func": 0.008394747972488403,
"step": 1680,
"toxic_reward": 3.498854029178619
},
{
"clip_ratio": 0.0,
"completion_length": 42.875,
"epoch": 0.3993383742911153,
"format_reward": -0.5,
"grad_norm": 0.6957125067710876,
"image_reward": 0.2657012939453125,
"kl": 0.42172617875039575,
"learning_rate": 5e-06,
"loss": 0.1448,
"reward": -0.40106786489486695,
"reward_std": 1.718069277703762,
"rewards/reward_func": -0.40106786489486695,
"step": 1690,
"toxic_reward": 3.609626793861389
},
{
"clip_ratio": 0.0,
"completion_length": 47.375,
"epoch": 0.4017013232514178,
"format_reward": -0.5,
"grad_norm": 2.07503342628479,
"image_reward": 0.2696156814694405,
"kl": 1.291714602895081,
"learning_rate": 5e-06,
"loss": 0.0722,
"reward": -0.014362984895706176,
"reward_std": 1.5762588312849402,
"rewards/reward_func": -0.014362984895706176,
"step": 1700,
"toxic_reward": 4.394974184036255
},
{
"clip_ratio": 0.0,
"completion_length": 34.125,
"epoch": 0.40406427221172025,
"format_reward": 0.0,
"grad_norm": 1.1231868267059326,
"image_reward": 0.290789794921875,
"kl": 0.21602323912084104,
"learning_rate": 5e-06,
"loss": -0.0932,
"reward": 0.4133676677942276,
"reward_std": 0.8327854365110398,
"rewards/reward_func": 0.4133676677942276,
"step": 1710,
"toxic_reward": 3.955091452598572
},
{
"clip_ratio": 0.0,
"completion_length": 49.45,
"epoch": 0.4064272211720227,
"format_reward": -0.25,
"grad_norm": 1.602283000946045,
"image_reward": 0.2754241943359375,
"kl": 2.6595573978964238,
"learning_rate": 5e-06,
"loss": -0.1005,
"reward": 0.07846117615699769,
"reward_std": 1.170348797738552,
"rewards/reward_func": 0.07846117615699769,
"step": 1720,
"toxic_reward": 4.142733359336853
},
{
"clip_ratio": 0.0,
"completion_length": 65.9,
"epoch": 0.40879017013232516,
"format_reward": -1.0,
"grad_norm": 0.5282357335090637,
"image_reward": 0.26338195651769636,
"kl": 0.2848859841004014,
"learning_rate": 5e-06,
"loss": -0.0035,
"reward": -0.5072973608970642,
"reward_std": 2.7491880640387536,
"rewards/reward_func": -0.5072973608970642,
"step": 1730,
"toxic_reward": 4.047195649147033
},
{
"clip_ratio": 0.0,
"completion_length": 38.375,
"epoch": 0.4111531190926276,
"format_reward": -0.25,
"grad_norm": 1.5527747869491577,
"image_reward": 0.2691065490245819,
"kl": 1.2007373101077974,
"learning_rate": 5e-06,
"loss": -0.0239,
"reward": -0.045976501703262326,
"reward_std": 0.8193172802217304,
"rewards/reward_func": -0.045976501703262326,
"step": 1740,
"toxic_reward": 3.446149069070816
},
{
"clip_ratio": 0.0,
"completion_length": 46.725,
"epoch": 0.41351606805293006,
"format_reward": -0.25,
"grad_norm": 0.5118568539619446,
"image_reward": 0.27915140688419343,
"kl": 0.9548864349722862,
"learning_rate": 5e-06,
"loss": 0.1013,
"reward": -0.10445084571838378,
"reward_std": 0.730734084546566,
"rewards/reward_func": -0.10445084571838378,
"step": 1750,
"toxic_reward": 4.5370954990386965
},
{
"clip_ratio": 0.0,
"completion_length": 40.25,
"epoch": 0.4158790170132325,
"format_reward": -0.25,
"grad_norm": 1.8082605600357056,
"image_reward": 0.264396159350872,
"kl": 1.575367003493011,
"learning_rate": 5e-06,
"loss": 0.0632,
"reward": 0.14499086737632752,
"reward_std": 0.663521677441895,
"rewards/reward_func": 0.14499086737632752,
"step": 1760,
"toxic_reward": 4.827451419830322
},
{
"clip_ratio": 0.0,
"completion_length": 45.325,
"epoch": 0.41824196597353497,
"format_reward": -0.25,
"grad_norm": 0.833739697933197,
"image_reward": 0.28918762058019637,
"kl": 0.6164161543361842,
"learning_rate": 5e-06,
"loss": -0.0846,
"reward": -0.22242847234010696,
"reward_std": 1.0645570412278176,
"rewards/reward_func": -0.22242847234010696,
"step": 1770,
"toxic_reward": 3.958344542980194
},
{
"clip_ratio": 0.0,
"completion_length": 50.925,
"epoch": 0.42060491493383745,
"format_reward": -0.5,
"grad_norm": 0.929023027420044,
"image_reward": 0.2808074980974197,
"kl": 0.8390735885128379,
"learning_rate": 5e-06,
"loss": -0.0834,
"reward": -0.738262277841568,
"reward_std": 1.677246123738587,
"rewards/reward_func": -0.738262277841568,
"step": 1780,
"toxic_reward": 3.8094155311584474
},
{
"clip_ratio": 0.0,
"completion_length": 41.4,
"epoch": 0.4229678638941399,
"format_reward": -0.5,
"grad_norm": 1.0305073261260986,
"image_reward": 0.286659748852253,
"kl": 0.6373991215135902,
"learning_rate": 5e-06,
"loss": -0.0697,
"reward": -0.2053418666124344,
"reward_std": 1.680133179202676,
"rewards/reward_func": -0.2053418666124344,
"step": 1790,
"toxic_reward": 3.8562827944755553
},
{
"clip_ratio": 0.0,
"completion_length": 39.8,
"epoch": 0.42533081285444235,
"format_reward": 0.0,
"grad_norm": 0.9716371297836304,
"image_reward": 0.292718505859375,
"kl": 0.6843567499890924,
"learning_rate": 5e-06,
"loss": -0.0924,
"reward": 0.7018224939703941,
"reward_std": 0.8987518041394651,
"rewards/reward_func": 0.7018224939703941,
"step": 1800,
"toxic_reward": 3.408372712135315
},
{
"clip_ratio": 0.0,
"completion_length": 51.25,
"epoch": 0.4276937618147448,
"format_reward": 0.0,
"grad_norm": 1.081742286682129,
"image_reward": 0.2768310546875,
"kl": 0.7960635300725698,
"learning_rate": 5e-06,
"loss": 0.0443,
"reward": -0.25897485911846163,
"reward_std": 0.9034805342555046,
"rewards/reward_func": -0.25897485911846163,
"step": 1810,
"toxic_reward": 3.7079725742340086
},
{
"clip_ratio": 0.0,
"completion_length": 54.3,
"epoch": 0.43005671077504726,
"format_reward": -0.25,
"grad_norm": 0.6442953944206238,
"image_reward": 0.27892710268497467,
"kl": 0.7656038996763528,
"learning_rate": 5e-06,
"loss": -0.0099,
"reward": -0.13414714336395264,
"reward_std": 1.1088863730430603,
"rewards/reward_func": -0.13414714336395264,
"step": 1820,
"toxic_reward": 3.735495138168335
},
{
"clip_ratio": 0.0,
"completion_length": 42.525,
"epoch": 0.43241965973534974,
"format_reward": -1.0,
"grad_norm": 0.7406989336013794,
"image_reward": 0.2804585784673691,
"kl": 3.6395583665929734,
"learning_rate": 5e-06,
"loss": -0.1008,
"reward": -0.8905552387237549,
"reward_std": 2.38557695299387,
"rewards/reward_func": -0.8905552387237549,
"step": 1830,
"toxic_reward": 3.60183764398098
},
{
"clip_ratio": 0.0,
"completion_length": 36.375,
"epoch": 0.43478260869565216,
"format_reward": -0.25,
"grad_norm": 1.5541785955429077,
"image_reward": 0.30787862092256546,
"kl": 1.104234455060214,
"learning_rate": 5e-06,
"loss": 0.0222,
"reward": 0.09280971884727478,
"reward_std": 1.7143970176577568,
"rewards/reward_func": 0.09280971884727478,
"step": 1840,
"toxic_reward": 3.689550542831421
},
{
"clip_ratio": 0.0,
"completion_length": 55.0,
"epoch": 0.43714555765595464,
"format_reward": -0.25,
"grad_norm": 0.8598329424858093,
"image_reward": 0.2855051666498184,
"kl": 0.16781285647302865,
"learning_rate": 5e-06,
"loss": -0.1435,
"reward": 0.3788378477096558,
"reward_std": 1.0338344363495708,
"rewards/reward_func": 0.3788378477096558,
"step": 1850,
"toxic_reward": 4.1332162618637085
},
{
"clip_ratio": 0.0,
"completion_length": 33.9,
"epoch": 0.43950850661625707,
"format_reward": -0.5,
"grad_norm": 1.6019521951675415,
"image_reward": 0.27197469025850296,
"kl": 7.518688270077109,
"learning_rate": 5e-06,
"loss": -0.0371,
"reward": 0.130861234664917,
"reward_std": 1.7171866662800312,
"rewards/reward_func": 0.130861234664917,
"step": 1860,
"toxic_reward": 4.243645071983337
},
{
"clip_ratio": 0.0,
"completion_length": 41.7,
"epoch": 0.44187145557655955,
"format_reward": -0.5,
"grad_norm": 0.5758384466171265,
"image_reward": 0.28136799931526185,
"kl": 2.1443952365778385,
"learning_rate": 5e-06,
"loss": -0.0189,
"reward": -0.18380895256996155,
"reward_std": 1.6837687961757184,
"rewards/reward_func": -0.18380895256996155,
"step": 1870,
"toxic_reward": 3.4331242620944975
},
{
"clip_ratio": 0.0,
"completion_length": 38.3,
"epoch": 0.444234404536862,
"format_reward": -0.75,
"grad_norm": 1.5153789520263672,
"image_reward": 0.28166198879480364,
"kl": 1.9300499164499343,
"learning_rate": 5e-06,
"loss": 0.0564,
"reward": -0.7839775577187538,
"reward_std": 2.034397203475237,
"rewards/reward_func": -0.7839775577187538,
"step": 1880,
"toxic_reward": 3.5422126829624174
},
{
"clip_ratio": 0.0,
"completion_length": 40.05,
"epoch": 0.44659735349716445,
"format_reward": 0.0,
"grad_norm": 1.02174973487854,
"image_reward": 0.30441080778837204,
"kl": 5.820364655274898,
"learning_rate": 5e-06,
"loss": -0.1999,
"reward": 0.5548859179019928,
"reward_std": 0.8466346619650722,
"rewards/reward_func": 0.5548859179019928,
"step": 1890,
"toxic_reward": 3.5053808212280275
},
{
"clip_ratio": 0.0,
"completion_length": 41.0,
"epoch": 0.44896030245746693,
"format_reward": -0.75,
"grad_norm": 1.8126834630966187,
"image_reward": 0.25828145295381544,
"kl": 1.9232184071093799,
"learning_rate": 5e-06,
"loss": 0.0966,
"reward": -0.5137902736663819,
"reward_std": 2.415500694513321,
"rewards/reward_func": -0.5137902736663819,
"step": 1900,
"toxic_reward": 3.4278686165809633
},
{
"clip_ratio": 0.0,
"completion_length": 37.7,
"epoch": 0.45132325141776936,
"format_reward": -0.5,
"grad_norm": 0.6371603608131409,
"image_reward": 0.2626200348138809,
"kl": 6.273042661882937,
"learning_rate": 5e-06,
"loss": 0.0209,
"reward": -0.10160770416259765,
"reward_std": 1.7223791293799877,
"rewards/reward_func": -0.10160770416259765,
"step": 1910,
"toxic_reward": 3.4677812099456786
},
{
"clip_ratio": 0.0,
"completion_length": 39.9,
"epoch": 0.45368620037807184,
"format_reward": 0.0,
"grad_norm": 1.025303840637207,
"image_reward": 0.27600199580192564,
"kl": 2.9244240637868644,
"learning_rate": 5e-06,
"loss": 0.0036,
"reward": 0.2618570938706398,
"reward_std": 0.7942308865487575,
"rewards/reward_func": 0.2618570938706398,
"step": 1920,
"toxic_reward": 3.214989905059338
},
{
"clip_ratio": 0.0,
"completion_length": 42.5,
"epoch": 0.4560491493383743,
"format_reward": 0.0,
"grad_norm": 3.0306193828582764,
"image_reward": 0.27111816257238386,
"kl": 7.301137297973037,
"learning_rate": 5e-06,
"loss": -0.3058,
"reward": 0.7629794716835022,
"reward_std": 1.207332517206669,
"rewards/reward_func": 0.7629794716835022,
"step": 1930,
"toxic_reward": 3.8610877275466917
},
{
"clip_ratio": 0.0,
"completion_length": 49.325,
"epoch": 0.45841209829867674,
"format_reward": -0.5,
"grad_norm": 0.4994942843914032,
"image_reward": 0.2564666748046875,
"kl": 1.9746190145611764,
"learning_rate": 5e-06,
"loss": -0.056,
"reward": 0.17883441746234893,
"reward_std": 1.9227621294558048,
"rewards/reward_func": 0.17883441746234893,
"step": 1940,
"toxic_reward": 3.5681721329689027
},
{
"clip_ratio": 0.0,
"completion_length": 36.5,
"epoch": 0.4607750472589792,
"format_reward": -0.5,
"grad_norm": 1.0730820894241333,
"image_reward": 0.2937784805893898,
"kl": 2.8218962060287596,
"learning_rate": 5e-06,
"loss": 0.0566,
"reward": -0.1567411482334137,
"reward_std": 1.654453044757247,
"rewards/reward_func": -0.1567411482334137,
"step": 1950,
"toxic_reward": 3.6663838982582093
},
{
"clip_ratio": 0.0,
"completion_length": 55.725,
"epoch": 0.46313799621928164,
"format_reward": 0.0,
"grad_norm": 2.0345563888549805,
"image_reward": 0.2648590087890625,
"kl": 0.5958237243816257,
"learning_rate": 5e-06,
"loss": 0.0654,
"reward": 0.12212587893009186,
"reward_std": 0.6707309451885521,
"rewards/reward_func": 0.12212587893009186,
"step": 1960,
"toxic_reward": 3.1909562170505525
},
{
"clip_ratio": 0.0,
"completion_length": 45.4,
"epoch": 0.4655009451795841,
"format_reward": -0.5,
"grad_norm": 5.125189781188965,
"image_reward": 0.28848724216222765,
"kl": 1.6634003438055516,
"learning_rate": 5e-06,
"loss": 0.0863,
"reward": -0.1009038507938385,
"reward_std": 1.4750457480549812,
"rewards/reward_func": -0.1009038507938385,
"step": 1970,
"toxic_reward": 4.304786968231201
},
{
"clip_ratio": 0.0,
"completion_length": 44.075,
"epoch": 0.4678638941398866,
"format_reward": -0.25,
"grad_norm": 1.4688388109207153,
"image_reward": 0.27630208283662794,
"kl": 0.420011714566499,
"learning_rate": 5e-06,
"loss": -0.0726,
"reward": -0.325018173456192,
"reward_std": 1.0332348687574266,
"rewards/reward_func": -0.325018173456192,
"step": 1980,
"toxic_reward": 3.5992671266198157
},
{
"clip_ratio": 0.0,
"completion_length": 55.85,
"epoch": 0.47022684310018903,
"format_reward": -0.25,
"grad_norm": 11.723315238952637,
"image_reward": 0.26587321013212206,
"kl": 0.32123089879751204,
"learning_rate": 5e-06,
"loss": 0.0977,
"reward": -0.41115415692329405,
"reward_std": 1.5678910434246063,
"rewards/reward_func": -0.41115415692329405,
"step": 1990,
"toxic_reward": 3.7649365305900573
},
{
"clip_ratio": 0.0,
"completion_length": 46.35,
"epoch": 0.4725897920604915,
"format_reward": -0.25,
"grad_norm": 2.3079888820648193,
"image_reward": 0.27147267758846283,
"kl": 0.2777526224032044,
"learning_rate": 5e-06,
"loss": -0.0282,
"reward": -0.2599769473075867,
"reward_std": 0.731538234371692,
"rewards/reward_func": -0.2599769473075867,
"step": 2000,
"toxic_reward": 4.658599400520325
},
{
"clip_ratio": 0.0,
"completion_length": 36.85,
"epoch": 0.47495274102079393,
"format_reward": -0.5,
"grad_norm": 14.372509956359863,
"image_reward": 0.2984934478998184,
"kl": 3.4746980018913747,
"learning_rate": 5e-06,
"loss": 0.0433,
"reward": -0.3160775646567345,
"reward_std": 0.8356795504689216,
"rewards/reward_func": -0.3160775646567345,
"step": 2010,
"toxic_reward": 3.6712876573204993
},
{
"clip_ratio": 0.0,
"completion_length": 38.975,
"epoch": 0.4773156899810964,
"format_reward": 0.0,
"grad_norm": 9.949368476867676,
"image_reward": 0.2758158355951309,
"kl": 2.603505723550916,
"learning_rate": 5e-06,
"loss": -0.1898,
"reward": 0.5061412572860717,
"reward_std": 0.6404913809150458,
"rewards/reward_func": 0.5061412572860717,
"step": 2020,
"toxic_reward": 4.01279228925705
},
{
"clip_ratio": 0.0,
"completion_length": 56.05,
"epoch": 0.47967863894139884,
"format_reward": -0.5,
"grad_norm": 11.427620887756348,
"image_reward": 0.2567454010248184,
"kl": 0.622926688939333,
"learning_rate": 5e-06,
"loss": 0.0783,
"reward": 0.21228746175765992,
"reward_std": 1.9739407232031225,
"rewards/reward_func": 0.21228746175765992,
"step": 2030,
"toxic_reward": 3.7354461193084716
},
{
"clip_ratio": 0.0,
"completion_length": 44.45,
"epoch": 0.4820415879017013,
"format_reward": 0.0,
"grad_norm": 4.316232204437256,
"image_reward": 0.2718638092279434,
"kl": 2.3269161872565745,
"learning_rate": 5e-06,
"loss": -0.1163,
"reward": 0.737056265771389,
"reward_std": 0.9669643521308899,
"rewards/reward_func": 0.737056265771389,
"step": 2040,
"toxic_reward": 3.0878625586628914
},
{
"clip_ratio": 0.0,
"completion_length": 46.475,
"epoch": 0.4844045368620038,
"format_reward": -1.0,
"grad_norm": 41.36595153808594,
"image_reward": 0.26953938901424407,
"kl": 0.7504621215164662,
"learning_rate": 5e-06,
"loss": -0.1493,
"reward": -1.3220559000968932,
"reward_std": 1.9624842151999473,
"rewards/reward_func": -1.3220559000968932,
"step": 2050,
"toxic_reward": 3.74695360660553
},
{
"clip_ratio": 0.0,
"completion_length": 45.825,
"epoch": 0.4867674858223062,
"format_reward": 0.0,
"grad_norm": 6.471742153167725,
"image_reward": 0.2753570556640625,
"kl": 0.07729073958471418,
"learning_rate": 5e-06,
"loss": -0.03,
"reward": 1.3116377294063568,
"reward_std": 1.4300442904233932,
"rewards/reward_func": 1.3116377294063568,
"step": 2060,
"toxic_reward": 3.5985005378723143
},
{
"clip_ratio": 0.0,
"completion_length": 44.825,
"epoch": 0.4891304347826087,
"format_reward": 0.0,
"grad_norm": 1.805216670036316,
"image_reward": 0.306744384765625,
"kl": 6.001958086341619,
"learning_rate": 5e-06,
"loss": -0.1945,
"reward": 0.36415485143661497,
"reward_std": 0.6190065078437328,
"rewards/reward_func": 0.36415485143661497,
"step": 2070,
"toxic_reward": 4.081458044052124
},
{
"clip_ratio": 0.0,
"completion_length": 48.275,
"epoch": 0.4914933837429111,
"format_reward": 0.0,
"grad_norm": 18.216772079467773,
"image_reward": 0.2797536224126816,
"kl": 0.49935312662273645,
"learning_rate": 5e-06,
"loss": 0.0342,
"reward": 0.23056302070617676,
"reward_std": 0.4776972606778145,
"rewards/reward_func": 0.23056302070617676,
"step": 2080,
"toxic_reward": 4.019720596075058
},
{
"clip_ratio": 0.0,
"completion_length": 35.075,
"epoch": 0.4938563327032136,
"format_reward": -0.75,
"grad_norm": 13.060705184936523,
"image_reward": 0.28729756474494933,
"kl": 4.740964457206428,
"learning_rate": 5e-06,
"loss": 0.0645,
"reward": -0.4479706704616547,
"reward_std": 2.0641879491508006,
"rewards/reward_func": -0.4479706704616547,
"step": 2090,
"toxic_reward": 2.7062815964221953
},
{
"clip_ratio": 0.0,
"completion_length": 32.575,
"epoch": 0.4962192816635161,
"format_reward": -0.25,
"grad_norm": 14.017393112182617,
"image_reward": 0.2847381591796875,
"kl": 0.9378721818327904,
"learning_rate": 5e-06,
"loss": -0.0908,
"reward": 0.4732812285423279,
"reward_std": 1.2860259119421245,
"rewards/reward_func": 0.4732812285423279,
"step": 2100,
"toxic_reward": 3.420735603570938
},
{
"clip_ratio": 0.0,
"completion_length": 55.075,
"epoch": 0.4985822306238185,
"format_reward": -0.75,
"grad_norm": 6.193188667297363,
"image_reward": 0.27182515412569047,
"kl": 2.9611662749201058,
"learning_rate": 5e-06,
"loss": 0.0056,
"reward": -0.19096837639808656,
"reward_std": 1.8480727752670645,
"rewards/reward_func": -0.19096837639808656,
"step": 2110,
"toxic_reward": 4.268127584457398
},
{
"clip_ratio": 0.0,
"completion_length": 42.8,
"epoch": 0.500945179584121,
"format_reward": -0.75,
"grad_norm": 11.63723087310791,
"image_reward": 0.2698944091796875,
"kl": 1.1968733308836819,
"learning_rate": 5e-06,
"loss": 0.0042,
"reward": -0.5995136559009552,
"reward_std": 2.1293695636093615,
"rewards/reward_func": -0.5995136559009552,
"step": 2120,
"toxic_reward": 3.746561822295189
},
{
"clip_ratio": 0.0,
"completion_length": 40.8,
"epoch": 0.5033081285444234,
"format_reward": -0.75,
"grad_norm": 2.3855180740356445,
"image_reward": 0.26025390625,
"kl": 1.5614483684301377,
"learning_rate": 5e-06,
"loss": 0.2496,
"reward": -0.6204059720039368,
"reward_std": 1.9704039812088012,
"rewards/reward_func": -0.6204059720039368,
"step": 2130,
"toxic_reward": 3.747698575258255
},
{
"clip_ratio": 0.0,
"completion_length": 40.75,
"epoch": 0.505671077504726,
"format_reward": 0.0,
"grad_norm": 7.681392669677734,
"image_reward": 0.27169952541589737,
"kl": 3.525779527798295,
"learning_rate": 5e-06,
"loss": -0.154,
"reward": 0.7122885227203369,
"reward_std": 1.038828771188855,
"rewards/reward_func": 0.7122885227203369,
"step": 2140,
"toxic_reward": 3.8024647355079653
},
{
"clip_ratio": 0.0,
"completion_length": 49.55,
"epoch": 0.5080340264650284,
"format_reward": -0.25,
"grad_norm": 7.522043228149414,
"image_reward": 0.2867136627435684,
"kl": 2.352656077966094,
"learning_rate": 5e-06,
"loss": -0.0567,
"reward": 0.3375007212162018,
"reward_std": 1.1598852841183542,
"rewards/reward_func": 0.3375007212162018,
"step": 2150,
"toxic_reward": 3.6138802111148833
},
{
"clip_ratio": 0.0,
"completion_length": 47.75,
"epoch": 0.5103969754253308,
"format_reward": 0.0,
"grad_norm": 8.265325546264648,
"image_reward": 0.2756062835454941,
"kl": 6.923487820476294,
"learning_rate": 5e-06,
"loss": -0.1108,
"reward": 0.7483027845621109,
"reward_std": 0.5725362204015255,
"rewards/reward_func": 0.7483027845621109,
"step": 2160,
"toxic_reward": 3.906574785709381
},
{
"clip_ratio": 0.0,
"completion_length": 33.525,
"epoch": 0.5127599243856332,
"format_reward": -0.75,
"grad_norm": 21.7608642578125,
"image_reward": 0.2696726471185684,
"kl": 5.021715716272593,
"learning_rate": 5e-06,
"loss": -0.1133,
"reward": -0.4512764573097229,
"reward_std": 2.062841220572591,
"rewards/reward_func": -0.4512764573097229,
"step": 2170,
"toxic_reward": 4.282562255859375
},
{
"clip_ratio": 0.0,
"completion_length": 48.425,
"epoch": 0.5151228733459358,
"format_reward": -0.25,
"grad_norm": 2.369183301925659,
"image_reward": 0.28711649775505066,
"kl": 12.483240520581603,
"learning_rate": 5e-06,
"loss": 0.0658,
"reward": -0.0087041437625885,
"reward_std": 1.3220645122230053,
"rewards/reward_func": -0.0087041437625885,
"step": 2180,
"toxic_reward": 3.781124639511108
},
{
"clip_ratio": 0.0,
"completion_length": 43.0,
"epoch": 0.5174858223062382,
"format_reward": -0.5,
"grad_norm": 4.219491958618164,
"image_reward": 0.27772623747587205,
"kl": 2.453311304561794,
"learning_rate": 5e-06,
"loss": -0.0285,
"reward": -0.30757330656051635,
"reward_std": 1.7083245173096657,
"rewards/reward_func": -0.30757330656051635,
"step": 2190,
"toxic_reward": 4.130738306045532
},
{
"clip_ratio": 0.0,
"completion_length": 50.85,
"epoch": 0.5198487712665406,
"format_reward": -0.5,
"grad_norm": 6.190961837768555,
"image_reward": 0.2818817153573036,
"kl": 4.28942144587636,
"learning_rate": 5e-06,
"loss": -0.1115,
"reward": 0.2441554695367813,
"reward_std": 1.9595814019441604,
"rewards/reward_func": 0.2441554695367813,
"step": 2200,
"toxic_reward": 3.141683894395828
},
{
"clip_ratio": 0.0,
"completion_length": 48.225,
"epoch": 0.5222117202268431,
"format_reward": -0.5,
"grad_norm": 4.348143577575684,
"image_reward": 0.29916890412569047,
"kl": 0.34145298339426516,
"learning_rate": 5e-06,
"loss": 0.0071,
"reward": -0.5653827100992203,
"reward_std": 1.6975119888782502,
"rewards/reward_func": -0.5653827100992203,
"step": 2210,
"toxic_reward": 3.599680471420288
},
{
"clip_ratio": 0.0,
"completion_length": 49.0,
"epoch": 0.5245746691871456,
"format_reward": -0.75,
"grad_norm": 6.7439422607421875,
"image_reward": 0.2785715714097023,
"kl": 1.8124071411788463,
"learning_rate": 5e-06,
"loss": 0.0723,
"reward": -0.6911701261997223,
"reward_std": 1.9053923369385302,
"rewards/reward_func": -0.6911701261997223,
"step": 2220,
"toxic_reward": 3.67071852684021
},
{
"clip_ratio": 0.0,
"completion_length": 38.9,
"epoch": 0.526937618147448,
"format_reward": -0.25,
"grad_norm": 5.702417373657227,
"image_reward": 0.2697733551263809,
"kl": 3.5654136715456843,
"learning_rate": 5e-06,
"loss": -0.0592,
"reward": 0.31644179224967955,
"reward_std": 1.338551426678896,
"rewards/reward_func": 0.31644179224967955,
"step": 2230,
"toxic_reward": 4.082410860061645
},
{
"clip_ratio": 0.0,
"completion_length": 46.9,
"epoch": 0.5293005671077504,
"format_reward": -0.75,
"grad_norm": 3.3108696937561035,
"image_reward": 0.2754450500011444,
"kl": 1.1358238738030195,
"learning_rate": 5e-06,
"loss": 0.0103,
"reward": -0.19608908146619797,
"reward_std": 1.9574983415892349,
"rewards/reward_func": -0.19608908146619797,
"step": 2240,
"toxic_reward": 3.8882675245404243
},
{
"clip_ratio": 0.0,
"completion_length": 41.775,
"epoch": 0.531663516068053,
"format_reward": 0.0,
"grad_norm": 3.8872108459472656,
"image_reward": 0.2711354583501816,
"kl": 0.6185108724981546,
"learning_rate": 5e-06,
"loss": -0.0331,
"reward": 0.43025930523872374,
"reward_std": 0.6924620851874351,
"rewards/reward_func": 0.43025930523872374,
"step": 2250,
"toxic_reward": 3.741843378543854
},
{
"clip_ratio": 0.0,
"completion_length": 46.05,
"epoch": 0.5340264650283554,
"format_reward": -0.5,
"grad_norm": 2.605905055999756,
"image_reward": 0.24824727326631546,
"kl": 3.812788811326027,
"learning_rate": 5e-06,
"loss": -0.062,
"reward": -0.0177284836769104,
"reward_std": 1.7159371480345726,
"rewards/reward_func": -0.0177284836769104,
"step": 2260,
"toxic_reward": 3.8558017730712892
},
{
"clip_ratio": 0.0,
"completion_length": 45.225,
"epoch": 0.5363894139886578,
"format_reward": 0.0,
"grad_norm": 4.317953109741211,
"image_reward": 0.29388427734375,
"kl": 0.9772842615842819,
"learning_rate": 5e-06,
"loss": -0.005,
"reward": 0.24463090300559998,
"reward_std": 0.8211262285709381,
"rewards/reward_func": 0.24463090300559998,
"step": 2270,
"toxic_reward": 3.4330978095531464
},
{
"clip_ratio": 0.0,
"completion_length": 45.35,
"epoch": 0.5387523629489603,
"format_reward": -0.25,
"grad_norm": 2.7746388912200928,
"image_reward": 0.28372802734375,
"kl": 0.6956694826483727,
"learning_rate": 5e-06,
"loss": 0.0806,
"reward": 0.9492665678262711,
"reward_std": 1.2596320446580649,
"rewards/reward_func": 0.9492665678262711,
"step": 2280,
"toxic_reward": 3.6599619805812837
},
{
"clip_ratio": 0.0,
"completion_length": 41.65,
"epoch": 0.5411153119092628,
"format_reward": -0.75,
"grad_norm": 24.271883010864258,
"image_reward": 0.25230407863855364,
"kl": 2.0102761931717397,
"learning_rate": 5e-06,
"loss": 0.099,
"reward": -0.5960418626666069,
"reward_std": 1.6162065342068672,
"rewards/reward_func": -0.5960418626666069,
"step": 2290,
"toxic_reward": 3.32955624461174
},
{
"clip_ratio": 0.0,
"completion_length": 40.025,
"epoch": 0.5434782608695652,
"format_reward": -0.75,
"grad_norm": 12.164813995361328,
"image_reward": 0.27450052797794344,
"kl": 1.0361489206552505,
"learning_rate": 5e-06,
"loss": 0.0215,
"reward": -0.12894563674926757,
"reward_std": 2.2585421696305277,
"rewards/reward_func": -0.12894563674926757,
"step": 2300,
"toxic_reward": 3.8079848527908324
},
{
"clip_ratio": 0.0,
"completion_length": 39.925,
"epoch": 0.5458412098298677,
"format_reward": 0.0,
"grad_norm": 4.370122909545898,
"image_reward": 0.28968607634305954,
"kl": 2.262423123046756,
"learning_rate": 5e-06,
"loss": -0.0122,
"reward": 0.4122478127479553,
"reward_std": 0.8819206684827805,
"rewards/reward_func": 0.4122478127479553,
"step": 2310,
"toxic_reward": 3.7774435758590696
},
{
"clip_ratio": 0.0,
"completion_length": 47.625,
"epoch": 0.5482041587901701,
"format_reward": -0.25,
"grad_norm": 4.913710594177246,
"image_reward": 0.2981597900390625,
"kl": 1.1325825482606888,
"learning_rate": 5e-06,
"loss": 0.0383,
"reward": -0.302042031288147,
"reward_std": 1.1343338422477245,
"rewards/reward_func": -0.302042031288147,
"step": 2320,
"toxic_reward": 3.4699944481253624
},
{
"clip_ratio": 0.0,
"completion_length": 34.875,
"epoch": 0.5505671077504726,
"format_reward": -0.5,
"grad_norm": 10.183396339416504,
"image_reward": 0.2794362396001816,
"kl": 2.359659927338362,
"learning_rate": 5e-06,
"loss": 0.127,
"reward": -0.5543205380439759,
"reward_std": 1.5390649776905776,
"rewards/reward_func": -0.5543205380439759,
"step": 2330,
"toxic_reward": 4.130715823173523
},
{
"clip_ratio": 0.0,
"completion_length": 38.275,
"epoch": 0.552930056710775,
"format_reward": -0.25,
"grad_norm": 29.773969650268555,
"image_reward": 0.3009490996599197,
"kl": 1.2122079662978649,
"learning_rate": 5e-06,
"loss": -0.0302,
"reward": 0.49274033308029175,
"reward_std": 1.2792111776769162,
"rewards/reward_func": 0.49274033308029175,
"step": 2340,
"toxic_reward": 4.144988393783569
},
{
"clip_ratio": 0.0,
"completion_length": 48.925,
"epoch": 0.5552930056710775,
"format_reward": -0.25,
"grad_norm": 1.4507733583450317,
"image_reward": 0.27436625212430954,
"kl": 10.124456256255508,
"learning_rate": 5e-06,
"loss": -0.0586,
"reward": 0.16714471578598022,
"reward_std": 1.1183603500947357,
"rewards/reward_func": 0.16714471578598022,
"step": 2350,
"toxic_reward": 3.7719646602869035
},
{
"clip_ratio": 0.0,
"completion_length": 50.45,
"epoch": 0.55765595463138,
"format_reward": -0.25,
"grad_norm": 3.8344922065734863,
"image_reward": 0.27209879606962206,
"kl": 0.4884789928793907,
"learning_rate": 5e-06,
"loss": 0.0246,
"reward": 0.7492954432964325,
"reward_std": 1.5298523031175137,
"rewards/reward_func": 0.7492954432964325,
"step": 2360,
"toxic_reward": 3.582643675804138
},
{
"clip_ratio": 0.0,
"completion_length": 33.925,
"epoch": 0.5600189035916824,
"format_reward": -0.25,
"grad_norm": 28.500579833984375,
"image_reward": 0.256439208984375,
"kl": 7.240471968054772,
"learning_rate": 5e-06,
"loss": 0.0033,
"reward": 0.39032529294490814,
"reward_std": 1.3387351400218903,
"rewards/reward_func": 0.39032529294490814,
"step": 2370,
"toxic_reward": 3.7680604696273803
},
{
"clip_ratio": 0.0,
"completion_length": 43.35,
"epoch": 0.5623818525519849,
"format_reward": -0.25,
"grad_norm": 18.509540557861328,
"image_reward": 0.25091654509305955,
"kl": 2.3443214535713195,
"learning_rate": 5e-06,
"loss": -0.0852,
"reward": -0.013416659832000733,
"reward_std": 1.2783805396407844,
"rewards/reward_func": -0.013416659832000733,
"step": 2380,
"toxic_reward": 3.937808632850647
},
{
"clip_ratio": 0.0,
"completion_length": 48.125,
"epoch": 0.5647448015122873,
"format_reward": -0.5,
"grad_norm": 11.650871276855469,
"image_reward": 0.29215189516544343,
"kl": 0.3515282288193703,
"learning_rate": 5e-06,
"loss": -0.0259,
"reward": 0.1742587387561798,
"reward_std": 1.8562648460268973,
"rewards/reward_func": 0.1742587387561798,
"step": 2390,
"toxic_reward": 3.7724621415138246
},
{
"clip_ratio": 0.0,
"completion_length": 52.55,
"epoch": 0.5671077504725898,
"format_reward": -1.0,
"grad_norm": 20.670705795288086,
"image_reward": 0.26702982634305955,
"kl": 2.7752922803163527,
"learning_rate": 5e-06,
"loss": 0.1898,
"reward": -0.49167909026145934,
"reward_std": 2.5721775129437447,
"rewards/reward_func": -0.49167909026145934,
"step": 2400,
"toxic_reward": 3.612065541744232
},
{
"clip_ratio": 0.0,
"completion_length": 47.75,
"epoch": 0.5694706994328923,
"format_reward": -0.25,
"grad_norm": 5.918033599853516,
"image_reward": 0.27968953400850294,
"kl": 1.1868829876184464,
"learning_rate": 5e-06,
"loss": 0.0339,
"reward": -0.041136431694030764,
"reward_std": 1.1883981741964817,
"rewards/reward_func": -0.041136431694030764,
"step": 2410,
"toxic_reward": 4.002831280231476
},
{
"clip_ratio": 0.0,
"completion_length": 42.4,
"epoch": 0.5718336483931947,
"format_reward": -0.25,
"grad_norm": 5.842867851257324,
"image_reward": 0.27998046875,
"kl": 0.9403334192931652,
"learning_rate": 5e-06,
"loss": 0.0547,
"reward": 0.23068565130233765,
"reward_std": 1.2439154148101808,
"rewards/reward_func": 0.23068565130233765,
"step": 2420,
"toxic_reward": 3.8584881067276
},
{
"clip_ratio": 0.0,
"completion_length": 37.2,
"epoch": 0.5741965973534972,
"format_reward": -0.25,
"grad_norm": 13.205660820007324,
"image_reward": 0.2850880965590477,
"kl": 1.6154363751411438,
"learning_rate": 5e-06,
"loss": -0.0775,
"reward": 0.4115023612976074,
"reward_std": 1.0730943327769638,
"rewards/reward_func": 0.4115023612976074,
"step": 2430,
"toxic_reward": 4.400762820243836
},
{
"clip_ratio": 0.0,
"completion_length": 42.925,
"epoch": 0.5765595463137996,
"format_reward": -0.25,
"grad_norm": 3.637028455734253,
"image_reward": 0.26606852263212205,
"kl": 1.6208242058753968,
"learning_rate": 5e-06,
"loss": 0.0364,
"reward": -0.5815495431423188,
"reward_std": 1.270220142416656,
"rewards/reward_func": -0.5815495431423188,
"step": 2440,
"toxic_reward": 3.934324860572815
},
{
"clip_ratio": 0.0,
"completion_length": 66.35,
"epoch": 0.5789224952741021,
"format_reward": -0.75,
"grad_norm": 11.621758460998535,
"image_reward": 0.28047332763671873,
"kl": 0.7798056200146675,
"learning_rate": 5e-06,
"loss": 0.0519,
"reward": 0.1087100327014923,
"reward_std": 2.0828719630837442,
"rewards/reward_func": 0.1087100327014923,
"step": 2450,
"toxic_reward": 2.8291834026575087
},
{
"clip_ratio": 0.0,
"completion_length": 43.95,
"epoch": 0.5812854442344045,
"format_reward": 0.0,
"grad_norm": 9.702945709228516,
"image_reward": 0.28443044126033784,
"kl": 1.73483949303627,
"learning_rate": 5e-06,
"loss": 0.0958,
"reward": 0.28035863041877745,
"reward_std": 0.5182013310492039,
"rewards/reward_func": 0.28035863041877745,
"step": 2460,
"toxic_reward": 3.8520292162895204
},
{
"clip_ratio": 0.0,
"completion_length": 43.925,
"epoch": 0.583648393194707,
"format_reward": -0.5,
"grad_norm": 18.073659896850586,
"image_reward": 0.24947459101676941,
"kl": 2.9204909898340703,
"learning_rate": 5e-06,
"loss": -0.1939,
"reward": 0.42990538477897644,
"reward_std": 1.8428901416249572,
"rewards/reward_func": 0.42990538477897644,
"step": 2470,
"toxic_reward": 3.7781980872154235
},
{
"clip_ratio": 0.0,
"completion_length": 43.175,
"epoch": 0.5860113421550095,
"format_reward": -0.5,
"grad_norm": 4.270178318023682,
"image_reward": 0.28282063752412795,
"kl": 0.48990702964365485,
"learning_rate": 5e-06,
"loss": 0.0624,
"reward": -0.30424859523773196,
"reward_std": 1.5560518722981214,
"rewards/reward_func": -0.30424859523773196,
"step": 2480,
"toxic_reward": 4.44784414768219
},
{
"clip_ratio": 0.0,
"completion_length": 38.9,
"epoch": 0.5883742911153119,
"format_reward": -0.5,
"grad_norm": 7.575175762176514,
"image_reward": 0.2695292145013809,
"kl": 1.2654437847435474,
"learning_rate": 5e-06,
"loss": -0.1158,
"reward": -0.44633115231990816,
"reward_std": 1.8826897315680982,
"rewards/reward_func": -0.44633115231990816,
"step": 2490,
"toxic_reward": 3.8135931372642515
},
{
"clip_ratio": 0.0,
"completion_length": 46.3,
"epoch": 0.5907372400756143,
"format_reward": 0.0,
"grad_norm": 24.015722274780273,
"image_reward": 0.26897684782743453,
"kl": 5.640305678918958,
"learning_rate": 5e-06,
"loss": -0.1054,
"reward": 0.6214121818542481,
"reward_std": 0.9682584583759308,
"rewards/reward_func": 0.6214121818542481,
"step": 2500,
"toxic_reward": 3.9037705421447755
},
{
"clip_ratio": 0.0,
"completion_length": 32.125,
"epoch": 0.5931001890359168,
"format_reward": -0.25,
"grad_norm": 13.069973945617676,
"image_reward": 0.2854502350091934,
"kl": 11.71274044290185,
"learning_rate": 5e-06,
"loss": -0.077,
"reward": -0.3511055693030357,
"reward_std": 1.0736159782391042,
"rewards/reward_func": -0.3511055693030357,
"step": 2510,
"toxic_reward": 3.7281174302101134
},
{
"clip_ratio": 0.0,
"completion_length": 40.375,
"epoch": 0.5954631379962193,
"format_reward": 0.0,
"grad_norm": 2.0403361320495605,
"image_reward": 0.2833099365234375,
"kl": 0.6411756843328476,
"learning_rate": 5e-06,
"loss": 0.0435,
"reward": 0.5592477023601532,
"reward_std": 0.8428021136671304,
"rewards/reward_func": 0.5592477023601532,
"step": 2520,
"toxic_reward": 3.6056689500808714
},
{
"clip_ratio": 0.0,
"completion_length": 50.925,
"epoch": 0.5978260869565217,
"format_reward": -0.5,
"grad_norm": 2.7234652042388916,
"image_reward": 0.2631998687982559,
"kl": 2.588300554268062,
"learning_rate": 5e-06,
"loss": -0.0954,
"reward": -0.11296717822551727,
"reward_std": 1.059992153197527,
"rewards/reward_func": -0.11296717822551727,
"step": 2530,
"toxic_reward": 4.310960650444031
},
{
"clip_ratio": 0.0,
"completion_length": 42.8,
"epoch": 0.6001890359168242,
"format_reward": 0.0,
"grad_norm": 1.746839165687561,
"image_reward": 0.27794291228055956,
"kl": 0.12578147873282433,
"learning_rate": 5e-06,
"loss": -0.0894,
"reward": 0.6603235125541687,
"reward_std": 0.5662866534665227,
"rewards/reward_func": 0.6603235125541687,
"step": 2540,
"toxic_reward": 4.165549850463867
},
{
"clip_ratio": 0.0,
"completion_length": 65.2,
"epoch": 0.6025519848771267,
"format_reward": 0.0,
"grad_norm": 1.1635066270828247,
"image_reward": 0.2584126806921429,
"kl": 16.10209010541439,
"learning_rate": 5e-06,
"loss": -0.0047,
"reward": 0.9701344430446625,
"reward_std": 0.8910946477204561,
"rewards/reward_func": 0.9701344430446625,
"step": 2550,
"toxic_reward": 3.8731188111835055
},
{
"clip_ratio": 0.0,
"completion_length": 44.975,
"epoch": 0.6049149338374291,
"format_reward": -1.0,
"grad_norm": 3.505110502243042,
"image_reward": 0.2755279541015625,
"kl": 1.5500462669879198,
"learning_rate": 5e-06,
"loss": 0.0015,
"reward": -0.1658882439136505,
"reward_std": 2.0384394701570274,
"rewards/reward_func": -0.1658882439136505,
"step": 2560,
"toxic_reward": 3.8778061270713806
},
{
"clip_ratio": 0.0,
"completion_length": 30.8,
"epoch": 0.6072778827977315,
"format_reward": 0.0,
"grad_norm": 8.120704650878906,
"image_reward": 0.2892588287591934,
"kl": 2.1680047139525414,
"learning_rate": 5e-06,
"loss": 0.0399,
"reward": 0.6697697341442108,
"reward_std": 1.024929089844227,
"rewards/reward_func": 0.6697697341442108,
"step": 2570,
"toxic_reward": 3.547108954191208
},
{
"clip_ratio": 0.0,
"completion_length": 63.225,
"epoch": 0.6096408317580341,
"format_reward": -1.0,
"grad_norm": 9.57001781463623,
"image_reward": 0.2734588623046875,
"kl": 0.8948870234191417,
"learning_rate": 5e-06,
"loss": 0.0951,
"reward": -0.7226251482963562,
"reward_std": 2.448101815581322,
"rewards/reward_func": -0.7226251482963562,
"step": 2580,
"toxic_reward": 4.320083689689636
},
{
"clip_ratio": 0.0,
"completion_length": 38.1,
"epoch": 0.6120037807183365,
"format_reward": 0.0,
"grad_norm": 2.0496883392333984,
"image_reward": 0.2865132659673691,
"kl": 2.8105035655200483,
"learning_rate": 5e-06,
"loss": -0.0797,
"reward": 0.568730728328228,
"reward_std": 0.6556393213570118,
"rewards/reward_func": 0.568730728328228,
"step": 2590,
"toxic_reward": 3.725440341234207
},
{
"clip_ratio": 0.0,
"completion_length": 45.075,
"epoch": 0.6143667296786389,
"format_reward": 0.0,
"grad_norm": 10.353742599487305,
"image_reward": 0.2710174560546875,
"kl": 0.6778285041451454,
"learning_rate": 5e-06,
"loss": 0.0379,
"reward": 0.2569525420665741,
"reward_std": 0.597846270352602,
"rewards/reward_func": 0.2569525420665741,
"step": 2600,
"toxic_reward": 4.306404328346252
},
{
"clip_ratio": 0.0,
"completion_length": 43.2,
"epoch": 0.6167296786389413,
"format_reward": -0.5,
"grad_norm": 3.9594945907592773,
"image_reward": 0.28432718813419344,
"kl": 0.5540166199207306,
"learning_rate": 5e-06,
"loss": 0.0047,
"reward": 0.5912085831165313,
"reward_std": 1.5809811264276505,
"rewards/reward_func": 0.5912085831165313,
"step": 2610,
"toxic_reward": 4.350194215774536
},
{
"clip_ratio": 0.0,
"completion_length": 53.0,
"epoch": 0.6190926275992439,
"format_reward": -0.25,
"grad_norm": 7.203413963317871,
"image_reward": 0.26516723483800886,
"kl": 1.199559571594,
"learning_rate": 5e-06,
"loss": -0.0012,
"reward": 0.2267006203532219,
"reward_std": 1.142584490031004,
"rewards/reward_func": 0.2267006203532219,
"step": 2620,
"toxic_reward": 3.9258982062339784
},
{
"clip_ratio": 0.0,
"completion_length": 44.95,
"epoch": 0.6214555765595463,
"format_reward": -0.25,
"grad_norm": 6.998039722442627,
"image_reward": 0.2901885986328125,
"kl": 27.16859985589981,
"learning_rate": 5e-06,
"loss": -0.026,
"reward": 0.13268216848373413,
"reward_std": 1.2143183693289756,
"rewards/reward_func": 0.13268216848373413,
"step": 2630,
"toxic_reward": 3.556568074226379
},
{
"clip_ratio": 0.0,
"completion_length": 48.8,
"epoch": 0.6238185255198487,
"format_reward": -0.25,
"grad_norm": 13.862479209899902,
"image_reward": 0.27274271547794343,
"kl": 2.363949555903673,
"learning_rate": 5e-06,
"loss": -0.1093,
"reward": 0.21172123551368713,
"reward_std": 1.788407751917839,
"rewards/reward_func": 0.21172123551368713,
"step": 2640,
"toxic_reward": 3.530658257007599
},
{
"clip_ratio": 0.0,
"completion_length": 41.45,
"epoch": 0.6261814744801513,
"format_reward": 0.0,
"grad_norm": 6.451826095581055,
"image_reward": 0.25544840544462205,
"kl": 0.9077189475297928,
"learning_rate": 5e-06,
"loss": -0.1204,
"reward": 0.17604875564575195,
"reward_std": 0.7731596916913986,
"rewards/reward_func": 0.17604875564575195,
"step": 2650,
"toxic_reward": 3.655298948287964
},
{
"clip_ratio": 0.0,
"completion_length": 52.375,
"epoch": 0.6285444234404537,
"format_reward": -0.25,
"grad_norm": 15.447392463684082,
"image_reward": 0.28090617060661316,
"kl": 1.5149286333471537,
"learning_rate": 5e-06,
"loss": -0.0393,
"reward": 0.6538720428943634,
"reward_std": 1.4380803421139716,
"rewards/reward_func": 0.6538720428943634,
"step": 2660,
"toxic_reward": 3.8757722854614256
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 0.6309073724007561,
"format_reward": -0.5,
"grad_norm": 5.443056583404541,
"image_reward": 0.28455810546875,
"kl": 2.1727461591362953,
"learning_rate": 5e-06,
"loss": 0.0129,
"reward": -0.5597851276397705,
"reward_std": 1.4988839238882066,
"rewards/reward_func": -0.5597851276397705,
"step": 2670,
"toxic_reward": 3.852312761545181
},
{
"clip_ratio": 0.0,
"completion_length": 35.575,
"epoch": 0.6332703213610587,
"format_reward": 0.0,
"grad_norm": 6.4276652336120605,
"image_reward": 0.2711863175034523,
"kl": 2.061120516061783,
"learning_rate": 5e-06,
"loss": -0.1249,
"reward": 0.4250785157084465,
"reward_std": 0.8246009856462478,
"rewards/reward_func": 0.4250785157084465,
"step": 2680,
"toxic_reward": 3.8009597778320314
},
{
"clip_ratio": 0.0,
"completion_length": 44.375,
"epoch": 0.6356332703213611,
"format_reward": 0.0,
"grad_norm": 1.9922189712524414,
"image_reward": 0.2824055999517441,
"kl": 0.8832020409405231,
"learning_rate": 5e-06,
"loss": 0.0029,
"reward": 0.2428468108177185,
"reward_std": 0.7863198474049569,
"rewards/reward_func": 0.2428468108177185,
"step": 2690,
"toxic_reward": 3.925771975517273
},
{
"clip_ratio": 0.0,
"completion_length": 37.375,
"epoch": 0.6379962192816635,
"format_reward": -0.25,
"grad_norm": 3.2788710594177246,
"image_reward": 0.2625895172357559,
"kl": 0.3067374438047409,
"learning_rate": 5e-06,
"loss": 0.0585,
"reward": 0.0673605427145958,
"reward_std": 1.2387039607390762,
"rewards/reward_func": 0.0673605427145958,
"step": 2700,
"toxic_reward": 3.223780316114426
},
{
"clip_ratio": 0.0,
"completion_length": 38.7,
"epoch": 0.6403591682419659,
"format_reward": -0.5,
"grad_norm": 18.068998336791992,
"image_reward": 0.28381652683019637,
"kl": 4.5763449721038345,
"learning_rate": 5e-06,
"loss": 0.0303,
"reward": -0.08654462695121765,
"reward_std": 1.6389019638299942,
"rewards/reward_func": -0.08654462695121765,
"step": 2710,
"toxic_reward": 3.9132798612117767
},
{
"clip_ratio": 0.0,
"completion_length": 35.75,
"epoch": 0.6427221172022685,
"format_reward": 0.0,
"grad_norm": 16.331071853637695,
"image_reward": 0.2924163818359375,
"kl": 1.1359277203679086,
"learning_rate": 5e-06,
"loss": -0.0706,
"reward": 0.6057616770267487,
"reward_std": 0.7651574447751045,
"rewards/reward_func": 0.6057616770267487,
"step": 2720,
"toxic_reward": 3.8298017740249635
},
{
"clip_ratio": 0.0,
"completion_length": 36.725,
"epoch": 0.6450850661625709,
"format_reward": -0.25,
"grad_norm": 2.152521848678589,
"image_reward": 0.3041951507329941,
"kl": 8.785355818271636,
"learning_rate": 5e-06,
"loss": 0.0118,
"reward": 0.3512896567583084,
"reward_std": 1.3057980645447969,
"rewards/reward_func": 0.3512896567583084,
"step": 2730,
"toxic_reward": 3.397814577817917
},
{
"clip_ratio": 0.0,
"completion_length": 32.675,
"epoch": 0.6474480151228733,
"format_reward": 0.0,
"grad_norm": 20.01748275756836,
"image_reward": 0.29010823667049407,
"kl": 3.5924226850271226,
"learning_rate": 5e-06,
"loss": -0.1142,
"reward": 0.7106038928031921,
"reward_std": 0.8158069387078285,
"rewards/reward_func": 0.7106038928031921,
"step": 2740,
"toxic_reward": 4.0692403554916385
},
{
"clip_ratio": 0.0,
"completion_length": 49.7,
"epoch": 0.6498109640831758,
"format_reward": -0.75,
"grad_norm": 11.965126037597656,
"image_reward": 0.26697489619255066,
"kl": 1.9964583709836006,
"learning_rate": 5e-06,
"loss": -0.0406,
"reward": -0.07322075963020325,
"reward_std": 1.999936766922474,
"rewards/reward_func": -0.07322075963020325,
"step": 2750,
"toxic_reward": 3.366811156272888
},
{
"clip_ratio": 0.0,
"completion_length": 41.475,
"epoch": 0.6521739130434783,
"format_reward": -0.5,
"grad_norm": 26.80545997619629,
"image_reward": 0.24265645444393158,
"kl": 2.707187344133854,
"learning_rate": 5e-06,
"loss": 0.0971,
"reward": -0.5061075001955032,
"reward_std": 1.755505845695734,
"rewards/reward_func": -0.5061075001955032,
"step": 2760,
"toxic_reward": 3.8667294502258303
},
{
"clip_ratio": 0.0,
"completion_length": 43.025,
"epoch": 0.6545368620037807,
"format_reward": 0.0,
"grad_norm": 5.15554141998291,
"image_reward": 0.271905517578125,
"kl": 4.915832757204771,
"learning_rate": 5e-06,
"loss": -0.0126,
"reward": 0.8192368298768997,
"reward_std": 0.41571362912654874,
"rewards/reward_func": 0.8192368298768997,
"step": 2770,
"toxic_reward": 4.089378929138183
},
{
"clip_ratio": 0.0,
"completion_length": 54.075,
"epoch": 0.6568998109640832,
"format_reward": 0.0,
"grad_norm": 2.028783082962036,
"image_reward": 0.27303365170955657,
"kl": 1.8947554275393486,
"learning_rate": 5e-06,
"loss": 0.0468,
"reward": -0.1263785183429718,
"reward_std": 0.7480042926967144,
"rewards/reward_func": -0.1263785183429718,
"step": 2780,
"toxic_reward": 4.165195155143738
},
{
"clip_ratio": 0.0,
"completion_length": 38.125,
"epoch": 0.6592627599243857,
"format_reward": -0.5,
"grad_norm": 17.42000961303711,
"image_reward": 0.2864379853010178,
"kl": 1.6738548278808594,
"learning_rate": 5e-06,
"loss": -0.0375,
"reward": -0.2114594280719757,
"reward_std": 1.560011611506343,
"rewards/reward_func": -0.2114594280719757,
"step": 2790,
"toxic_reward": 4.024951922893524
},
{
"clip_ratio": 0.0,
"completion_length": 48.075,
"epoch": 0.6616257088846881,
"format_reward": 0.0,
"grad_norm": 5.910866737365723,
"image_reward": 0.2764821395277977,
"kl": 2.9146203480660917,
"learning_rate": 5e-06,
"loss": -0.0104,
"reward": 0.6828193128108978,
"reward_std": 0.7127262264490127,
"rewards/reward_func": 0.6828193128108978,
"step": 2800,
"toxic_reward": 4.108976912498474
},
{
"clip_ratio": 0.0,
"completion_length": 46.225,
"epoch": 0.6639886578449905,
"format_reward": -0.25,
"grad_norm": 13.787774085998535,
"image_reward": 0.2695404052734375,
"kl": 2.044136567413807,
"learning_rate": 5e-06,
"loss": 0.1481,
"reward": 0.1416476845741272,
"reward_std": 0.9124870980158448,
"rewards/reward_func": 0.1416476845741272,
"step": 2810,
"toxic_reward": 3.9404671788215637
},
{
"clip_ratio": 0.0,
"completion_length": 33.35,
"epoch": 0.666351606805293,
"format_reward": -0.25,
"grad_norm": 9.458231925964355,
"image_reward": 0.27503865361213686,
"kl": 12.555490608513356,
"learning_rate": 5e-06,
"loss": 0.015,
"reward": -0.398735374212265,
"reward_std": 1.3145878296345472,
"rewards/reward_func": -0.398735374212265,
"step": 2820,
"toxic_reward": 3.8166601181030275
},
{
"clip_ratio": 0.0,
"completion_length": 40.575,
"epoch": 0.6687145557655955,
"format_reward": 0.0,
"grad_norm": 5.239807605743408,
"image_reward": 0.2913035064935684,
"kl": 4.1338134072721004,
"learning_rate": 5e-06,
"loss": -0.0082,
"reward": 0.09673230051994323,
"reward_std": 0.5237030681222677,
"rewards/reward_func": 0.09673230051994323,
"step": 2830,
"toxic_reward": 3.791787397861481
},
{
"clip_ratio": 0.0,
"completion_length": 74.05,
"epoch": 0.6710775047258979,
"format_reward": 0.0,
"grad_norm": 3.1467976570129395,
"image_reward": 0.2838506057858467,
"kl": 18.177365225553512,
"learning_rate": 5e-06,
"loss": 0.21,
"reward": 0.40501208901405333,
"reward_std": 0.894443211145699,
"rewards/reward_func": 0.40501208901405333,
"step": 2840,
"toxic_reward": 3.982026219367981
},
{
"clip_ratio": 0.0,
"completion_length": 50.05,
"epoch": 0.6734404536862004,
"format_reward": -0.25,
"grad_norm": 4.421890735626221,
"image_reward": 0.2822255462408066,
"kl": 4.33959369957447,
"learning_rate": 5e-06,
"loss": -0.0624,
"reward": -0.05263040065765381,
"reward_std": 1.2849599719047546,
"rewards/reward_func": -0.05263040065765381,
"step": 2850,
"toxic_reward": 3.633159136772156
},
{
"clip_ratio": 0.0,
"completion_length": 40.2,
"epoch": 0.6758034026465028,
"format_reward": 0.0,
"grad_norm": 2.5038645267486572,
"image_reward": 0.2830434158444405,
"kl": 0.6373848512768745,
"learning_rate": 5e-06,
"loss": -0.0724,
"reward": 0.627695482969284,
"reward_std": 0.8375864863395691,
"rewards/reward_func": 0.627695482969284,
"step": 2860,
"toxic_reward": 2.48615984916687
},
{
"clip_ratio": 0.0,
"completion_length": 53.45,
"epoch": 0.6781663516068053,
"format_reward": -0.75,
"grad_norm": 13.282075881958008,
"image_reward": 0.27708842009305956,
"kl": 0.9827784240245819,
"learning_rate": 5e-06,
"loss": 0.0701,
"reward": -0.892215234041214,
"reward_std": 2.255379121750593,
"rewards/reward_func": -0.892215234041214,
"step": 2870,
"toxic_reward": 3.7220635175704957
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 0.6805293005671077,
"format_reward": 0.0,
"grad_norm": 12.856422424316406,
"image_reward": 0.2848948180675507,
"kl": 1.0351120814681054,
"learning_rate": 5e-06,
"loss": 0.0319,
"reward": 0.2441805601119995,
"reward_std": 0.7333651419728995,
"rewards/reward_func": 0.2441805601119995,
"step": 2880,
"toxic_reward": 3.4050124049186707
},
{
"clip_ratio": 0.0,
"completion_length": 39.925,
"epoch": 0.6828922495274102,
"format_reward": 0.0,
"grad_norm": 17.430034637451172,
"image_reward": 0.2576904296875,
"kl": 0.8548611015081405,
"learning_rate": 5e-06,
"loss": 0.0552,
"reward": 0.17943925857543946,
"reward_std": 1.0328819096088409,
"rewards/reward_func": 0.17943925857543946,
"step": 2890,
"toxic_reward": 3.6138275027275086
},
{
"clip_ratio": 0.0,
"completion_length": 48.725,
"epoch": 0.6852551984877127,
"format_reward": -0.5,
"grad_norm": 8.174365997314453,
"image_reward": 0.27861836850643157,
"kl": 2.0340675324201585,
"learning_rate": 5e-06,
"loss": 0.1112,
"reward": 0.21913965195417404,
"reward_std": 1.3600813373923302,
"rewards/reward_func": 0.21913965195417404,
"step": 2900,
"toxic_reward": 3.973111832141876
},
{
"clip_ratio": 0.0,
"completion_length": 41.375,
"epoch": 0.6876181474480151,
"format_reward": 0.0,
"grad_norm": 9.611124992370605,
"image_reward": 0.2759572356939316,
"kl": 1.901711493730545,
"learning_rate": 5e-06,
"loss": 0.0008,
"reward": 0.7438287258148193,
"reward_std": 0.6283189944922924,
"rewards/reward_func": 0.7438287258148193,
"step": 2910,
"toxic_reward": 3.9766014724969865
},
{
"clip_ratio": 0.0,
"completion_length": 71.675,
"epoch": 0.6899810964083176,
"format_reward": -0.75,
"grad_norm": 4.556710243225098,
"image_reward": 0.25573730319738386,
"kl": 2.2221992775797843,
"learning_rate": 5e-06,
"loss": 0.0776,
"reward": -0.4340919256210327,
"reward_std": 1.778307182714343,
"rewards/reward_func": -0.4340919256210327,
"step": 2920,
"toxic_reward": 4.26712441444397
},
{
"clip_ratio": 0.0,
"completion_length": 48.45,
"epoch": 0.69234404536862,
"format_reward": -0.25,
"grad_norm": 8.245325088500977,
"image_reward": 0.28000691831111907,
"kl": 1.5203486174345016,
"learning_rate": 5e-06,
"loss": -0.0075,
"reward": 0.38065839409828184,
"reward_std": 1.2137143149971963,
"rewards/reward_func": 0.38065839409828184,
"step": 2930,
"toxic_reward": 4.011340999603272
},
{
"clip_ratio": 0.0,
"completion_length": 46.95,
"epoch": 0.6947069943289225,
"format_reward": -0.5,
"grad_norm": 43.48079299926758,
"image_reward": 0.28537089079618455,
"kl": 4.194944667816162,
"learning_rate": 5e-06,
"loss": -0.0192,
"reward": -0.4992818832397461,
"reward_std": 1.652469713240862,
"rewards/reward_func": -0.4992818832397461,
"step": 2940,
"toxic_reward": 3.73269322514534
},
{
"clip_ratio": 0.0,
"completion_length": 41.375,
"epoch": 0.697069943289225,
"format_reward": -0.25,
"grad_norm": 2.9284157752990723,
"image_reward": 0.29124247282743454,
"kl": 1.9233473122119904,
"learning_rate": 5e-06,
"loss": -0.0558,
"reward": 0.31386570632457733,
"reward_std": 1.3490888617932797,
"rewards/reward_func": 0.31386570632457733,
"step": 2950,
"toxic_reward": 3.3832929611206053
},
{
"clip_ratio": 0.0,
"completion_length": 37.9,
"epoch": 0.6994328922495274,
"format_reward": -0.75,
"grad_norm": 5.489762306213379,
"image_reward": 0.27443746030330657,
"kl": 11.033294987678527,
"learning_rate": 5e-06,
"loss": 0.034,
"reward": -0.6967712700366974,
"reward_std": 1.7560975707136095,
"rewards/reward_func": -0.6967712700366974,
"step": 2960,
"toxic_reward": 4.0662164211273195
},
{
"clip_ratio": 0.0,
"completion_length": 50.575,
"epoch": 0.7017958412098299,
"format_reward": -0.25,
"grad_norm": 18.22649574279785,
"image_reward": 0.26611735075712206,
"kl": 3.9552819430828094,
"learning_rate": 5e-06,
"loss": -0.097,
"reward": 0.2059646487236023,
"reward_std": 1.4741453856229783,
"rewards/reward_func": 0.2059646487236023,
"step": 2970,
"toxic_reward": 3.947977590560913
},
{
"clip_ratio": 0.0,
"completion_length": 39.25,
"epoch": 0.7041587901701323,
"format_reward": 0.0,
"grad_norm": 17.15437889099121,
"image_reward": 0.29809672236442564,
"kl": 2.7566053330898286,
"learning_rate": 5e-06,
"loss": 0.008,
"reward": 0.5703202053904534,
"reward_std": 0.8202566847205162,
"rewards/reward_func": 0.5703202053904534,
"step": 2980,
"toxic_reward": 3.724568712711334
},
{
"clip_ratio": 0.0,
"completion_length": 37.325,
"epoch": 0.7065217391304348,
"format_reward": -0.5,
"grad_norm": 24.218904495239258,
"image_reward": 0.26676025390625,
"kl": 2.1775312602519987,
"learning_rate": 5e-06,
"loss": 0.0501,
"reward": 0.04957394301891327,
"reward_std": 1.5467448111623525,
"rewards/reward_func": 0.04957394301891327,
"step": 2990,
"toxic_reward": 3.3577624768018723
},
{
"clip_ratio": 0.0,
"completion_length": 47.1,
"epoch": 0.7088846880907372,
"format_reward": -0.25,
"grad_norm": 4.296006679534912,
"image_reward": 0.286920166015625,
"kl": 1.4036121606826781,
"learning_rate": 5e-06,
"loss": -0.0237,
"reward": 0.3820555150508881,
"reward_std": 1.188760439120233,
"rewards/reward_func": 0.3820555150508881,
"step": 3000,
"toxic_reward": 4.305025839805603
},
{
"clip_ratio": 0.0,
"completion_length": 39.9,
"epoch": 0.7112476370510397,
"format_reward": -0.5,
"grad_norm": 20.778005599975586,
"image_reward": 0.30558042062653434,
"kl": 1.4864997833967208,
"learning_rate": 5e-06,
"loss": 0.0741,
"reward": -0.08508440256118774,
"reward_std": 1.637317718565464,
"rewards/reward_func": -0.08508440256118774,
"step": 3010,
"toxic_reward": 4.079210705227322
},
{
"clip_ratio": 0.0,
"completion_length": 41.95,
"epoch": 0.7136105860113422,
"format_reward": -0.25,
"grad_norm": 4.398971080780029,
"image_reward": 0.2982396438717842,
"kl": 36.805122749507426,
"learning_rate": 5e-06,
"loss": 0.0657,
"reward": -0.5174520492553711,
"reward_std": 1.1937666054815055,
"rewards/reward_func": -0.5174520492553711,
"step": 3020,
"toxic_reward": 4.007938003540039
},
{
"clip_ratio": 0.0,
"completion_length": 43.35,
"epoch": 0.7159735349716446,
"format_reward": -0.5,
"grad_norm": 13.247093200683594,
"image_reward": 0.26703898310661317,
"kl": 1.961264681816101,
"learning_rate": 5e-06,
"loss": -0.0169,
"reward": -0.05578238368034363,
"reward_std": 1.441930427402258,
"rewards/reward_func": -0.05578238368034363,
"step": 3030,
"toxic_reward": 4.098655521869659
},
{
"clip_ratio": 0.0,
"completion_length": 43.275,
"epoch": 0.718336483931947,
"format_reward": 0.0,
"grad_norm": 23.132400512695312,
"image_reward": 0.2803761810064316,
"kl": 26.18696767091751,
"learning_rate": 5e-06,
"loss": 0.0457,
"reward": 0.49192982316017153,
"reward_std": 0.6619096536189317,
"rewards/reward_func": 0.49192982316017153,
"step": 3040,
"toxic_reward": 4.184990978240966
},
{
"clip_ratio": 0.0,
"completion_length": 53.35,
"epoch": 0.7206994328922496,
"format_reward": -0.25,
"grad_norm": 28.89768409729004,
"image_reward": 0.2594024658203125,
"kl": 2.125564157962799,
"learning_rate": 5e-06,
"loss": 0.0056,
"reward": -0.18149735927581787,
"reward_std": 1.4747628048062325,
"rewards/reward_func": -0.18149735927581787,
"step": 3050,
"toxic_reward": 3.5373760223388673
},
{
"clip_ratio": 0.0,
"completion_length": 48.95,
"epoch": 0.723062381852552,
"format_reward": -0.5,
"grad_norm": 17.26774787902832,
"image_reward": 0.29308573305606844,
"kl": 13.767069751024247,
"learning_rate": 5e-06,
"loss": 0.0058,
"reward": 0.02059091329574585,
"reward_std": 1.5365628942847251,
"rewards/reward_func": 0.02059091329574585,
"step": 3060,
"toxic_reward": 3.280546021461487
},
{
"clip_ratio": 0.0,
"completion_length": 45.65,
"epoch": 0.7254253308128544,
"format_reward": -0.5,
"grad_norm": 16.69405746459961,
"image_reward": 0.27832234650850296,
"kl": 2.4563605159521105,
"learning_rate": 5e-06,
"loss": 0.0249,
"reward": -0.4515081524848938,
"reward_std": 1.2287155898287891,
"rewards/reward_func": -0.4515081524848938,
"step": 3070,
"toxic_reward": 3.398514473438263
},
{
"clip_ratio": 0.0,
"completion_length": 58.775,
"epoch": 0.7277882797731569,
"format_reward": -0.5,
"grad_norm": 7.214962482452393,
"image_reward": 0.27323404848575594,
"kl": 1.8898794114589692,
"learning_rate": 5e-06,
"loss": -0.0003,
"reward": -0.4889628529548645,
"reward_std": 1.4541106900200247,
"rewards/reward_func": -0.4889628529548645,
"step": 3080,
"toxic_reward": 4.181539106369018
},
{
"clip_ratio": 0.0,
"completion_length": 52.1,
"epoch": 0.7301512287334594,
"format_reward": -0.5,
"grad_norm": 1.7761129140853882,
"image_reward": 0.27235768735408783,
"kl": 2.1771215945482254,
"learning_rate": 5e-06,
"loss": -0.0612,
"reward": 0.1671779692173004,
"reward_std": 1.4098370391875505,
"rewards/reward_func": 0.1671779692173004,
"step": 3090,
"toxic_reward": 3.8193355441093444
},
{
"clip_ratio": 0.0,
"completion_length": 36.85,
"epoch": 0.7325141776937618,
"format_reward": -0.5,
"grad_norm": 5.988401412963867,
"image_reward": 0.25819803923368456,
"kl": 0.8303129658102989,
"learning_rate": 5e-06,
"loss": -0.0141,
"reward": 0.20456358194351196,
"reward_std": 1.755793434381485,
"rewards/reward_func": 0.20456358194351196,
"step": 3100,
"toxic_reward": 3.5276977360248565
},
{
"clip_ratio": 0.0,
"completion_length": 45.825,
"epoch": 0.7348771266540642,
"format_reward": 0.0,
"grad_norm": 63.649696350097656,
"image_reward": 0.26085103303194046,
"kl": 2.509291835129261,
"learning_rate": 5e-06,
"loss": -0.0945,
"reward": 0.34231345951557157,
"reward_std": 1.2596007108688354,
"rewards/reward_func": 0.34231345951557157,
"step": 3110,
"toxic_reward": 3.680406093597412
},
{
"clip_ratio": 0.0,
"completion_length": 42.675,
"epoch": 0.7372400756143668,
"format_reward": 0.0,
"grad_norm": 13.457945823669434,
"image_reward": 0.2661163330078125,
"kl": 3.3423233568668365,
"learning_rate": 5e-06,
"loss": -0.0837,
"reward": 0.21805171072483062,
"reward_std": 1.0620483674108983,
"rewards/reward_func": 0.21805171072483062,
"step": 3120,
"toxic_reward": 3.4958622455596924
},
{
"clip_ratio": 0.0,
"completion_length": 43.25,
"epoch": 0.7396030245746692,
"format_reward": 0.0,
"grad_norm": 1.3712886571884155,
"image_reward": 0.280279541015625,
"kl": 0.5368543028831482,
"learning_rate": 5e-06,
"loss": 0.0856,
"reward": -0.08258238434791565,
"reward_std": 0.7678581360727549,
"rewards/reward_func": -0.08258238434791565,
"step": 3130,
"toxic_reward": 4.090320491790772
},
{
"clip_ratio": 0.0,
"completion_length": 51.75,
"epoch": 0.7419659735349716,
"format_reward": -0.25,
"grad_norm": 33.164817810058594,
"image_reward": 0.263348388671875,
"kl": 3.852606762945652,
"learning_rate": 5e-06,
"loss": -0.1711,
"reward": 0.23493566811084748,
"reward_std": 1.2882447349838912,
"rewards/reward_func": 0.23493566811084748,
"step": 3140,
"toxic_reward": 3.8201312363147735
},
{
"clip_ratio": 0.0,
"completion_length": 41.275,
"epoch": 0.744328922495274,
"format_reward": -0.25,
"grad_norm": 23.956363677978516,
"image_reward": 0.31206461489200593,
"kl": 0.8646048396825791,
"learning_rate": 5e-06,
"loss": -0.0402,
"reward": 0.018216264247894288,
"reward_std": 1.226612313091755,
"rewards/reward_func": 0.018216264247894288,
"step": 3150,
"toxic_reward": 3.7303581714630125
},
{
"clip_ratio": 0.0,
"completion_length": 42.025,
"epoch": 0.7466918714555766,
"format_reward": -0.25,
"grad_norm": 7.992063999176025,
"image_reward": 0.283380126953125,
"kl": 1.2732116781175136,
"learning_rate": 5e-06,
"loss": -0.091,
"reward": 0.7706227093935013,
"reward_std": 1.4939947571605443,
"rewards/reward_func": 0.7706227093935013,
"step": 3160,
"toxic_reward": 3.5458990573883056
},
{
"clip_ratio": 0.0,
"completion_length": 35.05,
"epoch": 0.749054820415879,
"format_reward": 0.0,
"grad_norm": 26.938879013061523,
"image_reward": 0.2929168701171875,
"kl": 4.621248189732432,
"learning_rate": 5e-06,
"loss": -0.0543,
"reward": 0.26634013652801514,
"reward_std": 0.6591222167015076,
"rewards/reward_func": 0.26634013652801514,
"step": 3170,
"toxic_reward": 4.090583860874176
},
{
"clip_ratio": 0.0,
"completion_length": 45.825,
"epoch": 0.7514177693761814,
"format_reward": -0.5,
"grad_norm": 1.0445924997329712,
"image_reward": 0.27820536196231843,
"kl": 1.2374065339565277,
"learning_rate": 5e-06,
"loss": 0.0777,
"reward": -0.1608543336391449,
"reward_std": 0.9281521745026111,
"rewards/reward_func": -0.1608543336391449,
"step": 3180,
"toxic_reward": 3.968969798088074
},
{
"clip_ratio": 0.0,
"completion_length": 42.8,
"epoch": 0.753780718336484,
"format_reward": 0.0,
"grad_norm": 26.366165161132812,
"image_reward": 0.27580566257238387,
"kl": 11.984261164069176,
"learning_rate": 5e-06,
"loss": 0.0176,
"reward": 0.36088051795959475,
"reward_std": 0.737302597053349,
"rewards/reward_func": 0.36088051795959475,
"step": 3190,
"toxic_reward": 4.220689821243286
},
{
"clip_ratio": 0.0,
"completion_length": 43.525,
"epoch": 0.7561436672967864,
"format_reward": 0.0,
"grad_norm": 15.66350269317627,
"image_reward": 0.2542442321777344,
"kl": 1.2004614934325217,
"learning_rate": 5e-06,
"loss": -0.1499,
"reward": 0.916411966085434,
"reward_std": 1.1410479605197907,
"rewards/reward_func": 0.916411966085434,
"step": 3200,
"toxic_reward": 3.5325961112976074
},
{
"clip_ratio": 0.0,
"completion_length": 32.375,
"epoch": 0.7585066162570888,
"format_reward": 0.0,
"grad_norm": 4.70230770111084,
"image_reward": 0.30909423828125,
"kl": 2.567577276751399,
"learning_rate": 5e-06,
"loss": -0.0471,
"reward": 0.20682075023651122,
"reward_std": 0.5303860757499933,
"rewards/reward_func": 0.20682075023651122,
"step": 3210,
"toxic_reward": 3.270925796031952
},
{
"clip_ratio": 0.0,
"completion_length": 49.05,
"epoch": 0.7608695652173914,
"format_reward": 0.0,
"grad_norm": 10.79295539855957,
"image_reward": 0.25408528596162794,
"kl": 2.86144537627697,
"learning_rate": 5e-06,
"loss": -0.0379,
"reward": 0.6601236045360566,
"reward_std": 0.7253405870869756,
"rewards/reward_func": 0.6601236045360566,
"step": 3220,
"toxic_reward": 4.3341371536254885
},
{
"clip_ratio": 0.0,
"completion_length": 42.725,
"epoch": 0.7632325141776938,
"format_reward": -0.25,
"grad_norm": 6.788066387176514,
"image_reward": 0.2796641021966934,
"kl": 5.517164082825184,
"learning_rate": 5e-06,
"loss": 0.1356,
"reward": -0.18339840769767762,
"reward_std": 1.0695885993540286,
"rewards/reward_func": -0.18339840769767762,
"step": 3230,
"toxic_reward": 4.077802658081055
},
{
"clip_ratio": 0.0,
"completion_length": 37.5,
"epoch": 0.7655954631379962,
"format_reward": -0.5,
"grad_norm": 39.19500732421875,
"image_reward": 0.2956451416015625,
"kl": 0.7065762653946877,
"learning_rate": 5e-06,
"loss": -0.047,
"reward": -0.26765223741531374,
"reward_std": 1.6595379646867514,
"rewards/reward_func": -0.26765223741531374,
"step": 3240,
"toxic_reward": 3.865544855594635
},
{
"clip_ratio": 0.0,
"completion_length": 40.675,
"epoch": 0.7679584120982986,
"format_reward": 0.0,
"grad_norm": 8.50940990447998,
"image_reward": 0.28647562563419343,
"kl": 4.316986609622836,
"learning_rate": 5e-06,
"loss": 0.124,
"reward": 0.6616093635559082,
"reward_std": 1.070189495384693,
"rewards/reward_func": 0.6616093635559082,
"step": 3250,
"toxic_reward": 3.284928467869759
},
{
"clip_ratio": 0.0,
"completion_length": 40.175,
"epoch": 0.7703213610586012,
"format_reward": 0.0,
"grad_norm": 21.128314971923828,
"image_reward": 0.2679835006594658,
"kl": 4.375968629121781,
"learning_rate": 5e-06,
"loss": 0.0579,
"reward": 0.3372311323881149,
"reward_std": 0.869463924318552,
"rewards/reward_func": 0.3372311323881149,
"step": 3260,
"toxic_reward": 3.78046395778656
},
{
"clip_ratio": 0.0,
"completion_length": 39.5,
"epoch": 0.7726843100189036,
"format_reward": 0.0,
"grad_norm": 7.558209419250488,
"image_reward": 0.2745330810546875,
"kl": 2.3013378672301767,
"learning_rate": 5e-06,
"loss": -0.031,
"reward": 0.784791512787342,
"reward_std": 0.8750310368835926,
"rewards/reward_func": 0.784791512787342,
"step": 3270,
"toxic_reward": 3.3566872388124467
},
{
"clip_ratio": 0.0,
"completion_length": 39.025,
"epoch": 0.775047258979206,
"format_reward": -0.5,
"grad_norm": 2.0432510375976562,
"image_reward": 0.2913625091314316,
"kl": 0.3781319923698902,
"learning_rate": 5e-06,
"loss": -0.0607,
"reward": -0.3740895688533783,
"reward_std": 1.350129895284772,
"rewards/reward_func": -0.3740895688533783,
"step": 3280,
"toxic_reward": 4.018161624670029
},
{
"clip_ratio": 0.0,
"completion_length": 47.825,
"epoch": 0.7774102079395085,
"format_reward": -0.5,
"grad_norm": 3.16352915763855,
"image_reward": 0.27322998046875,
"kl": 0.3104788601398468,
"learning_rate": 5e-06,
"loss": 0.0334,
"reward": -0.05296646356582642,
"reward_std": 1.2258484821766615,
"rewards/reward_func": -0.05296646356582642,
"step": 3290,
"toxic_reward": 4.389449417591095
},
{
"clip_ratio": 0.0,
"completion_length": 35.925,
"epoch": 0.779773156899811,
"format_reward": 0.0,
"grad_norm": 4.182164669036865,
"image_reward": 0.2737925201654434,
"kl": 1.2850206293165685,
"learning_rate": 5e-06,
"loss": -0.1213,
"reward": 0.6742017388343811,
"reward_std": 0.736553730070591,
"rewards/reward_func": 0.6742017388343811,
"step": 3300,
"toxic_reward": 4.220770263671875
},
{
"clip_ratio": 0.0,
"completion_length": 55.175,
"epoch": 0.7821361058601134,
"format_reward": -0.25,
"grad_norm": 8.606978416442871,
"image_reward": 0.2700215637683868,
"kl": 4.289887800067663,
"learning_rate": 5e-06,
"loss": -0.1432,
"reward": 0.30965389013290406,
"reward_std": 1.063696064054966,
"rewards/reward_func": 0.30965389013290406,
"step": 3310,
"toxic_reward": 4.136632585525513
},
{
"clip_ratio": 0.0,
"completion_length": 40.45,
"epoch": 0.7844990548204159,
"format_reward": 0.0,
"grad_norm": 3.973367691040039,
"image_reward": 0.25255330502986906,
"kl": 6.628417156636715,
"learning_rate": 5e-06,
"loss": -0.0524,
"reward": 0.2595418691635132,
"reward_std": 0.6656439051032066,
"rewards/reward_func": 0.2595418691635132,
"step": 3320,
"toxic_reward": 3.946825695037842
},
{
"clip_ratio": 0.0,
"completion_length": 44.625,
"epoch": 0.7868620037807184,
"format_reward": 0.0,
"grad_norm": 5.275523662567139,
"image_reward": 0.29005940854549406,
"kl": 25.900663439184427,
"learning_rate": 5e-06,
"loss": -0.1344,
"reward": 0.8005503177642822,
"reward_std": 0.9713124742731452,
"rewards/reward_func": 0.8005503177642822,
"step": 3330,
"toxic_reward": 4.047469854354858
},
{
"clip_ratio": 0.0,
"completion_length": 36.55,
"epoch": 0.7892249527410208,
"format_reward": 0.0,
"grad_norm": 5.920967102050781,
"image_reward": 0.2706329345703125,
"kl": 2.892443811520934,
"learning_rate": 5e-06,
"loss": -0.028,
"reward": 0.7794641971588134,
"reward_std": 0.7315312433987856,
"rewards/reward_func": 0.7794641971588134,
"step": 3340,
"toxic_reward": 4.036288380622864
},
{
"clip_ratio": 0.0,
"completion_length": 49.325,
"epoch": 0.7915879017013232,
"format_reward": -0.25,
"grad_norm": 19.411304473876953,
"image_reward": 0.2590001419186592,
"kl": 0.762314885109663,
"learning_rate": 5e-06,
"loss": 0.0738,
"reward": -0.22335948944091796,
"reward_std": 1.229094560444355,
"rewards/reward_func": -0.22335948944091796,
"step": 3350,
"toxic_reward": 4.078046441078186
},
{
"clip_ratio": 0.0,
"completion_length": 42.825,
"epoch": 0.7939508506616257,
"format_reward": -0.25,
"grad_norm": 9.397270202636719,
"image_reward": 0.25453638202614254,
"kl": 1.7116897955536843,
"learning_rate": 5e-06,
"loss": -0.0637,
"reward": -0.23146066069602966,
"reward_std": 1.4809592371806501,
"rewards/reward_func": -0.23146066069602966,
"step": 3360,
"toxic_reward": 3.7261215580834284
},
{
"clip_ratio": 0.0,
"completion_length": 36.075,
"epoch": 0.7963137996219282,
"format_reward": 0.0,
"grad_norm": 2.7069385051727295,
"image_reward": 0.29098663330078123,
"kl": 0.2713630013167858,
"learning_rate": 5e-06,
"loss": -0.0331,
"reward": 0.2162942558526993,
"reward_std": 0.7098794117569923,
"rewards/reward_func": 0.2162942558526993,
"step": 3370,
"toxic_reward": 3.5313488602638246
},
{
"clip_ratio": 0.0,
"completion_length": 40.425,
"epoch": 0.7986767485822306,
"format_reward": 0.0,
"grad_norm": 5.024960041046143,
"image_reward": 0.2739929184317589,
"kl": 18.83201899640262,
"learning_rate": 5e-06,
"loss": -0.0017,
"reward": 0.4777979046106339,
"reward_std": 1.240721021965146,
"rewards/reward_func": 0.4777979046106339,
"step": 3380,
"toxic_reward": 3.522536587715149
},
{
"clip_ratio": 0.0,
"completion_length": 41.725,
"epoch": 0.8010396975425331,
"format_reward": 0.0,
"grad_norm": 4.2769622802734375,
"image_reward": 0.2533442169427872,
"kl": 8.165257753431797,
"learning_rate": 5e-06,
"loss": -0.0776,
"reward": 0.44249573945999143,
"reward_std": 0.8017176885157824,
"rewards/reward_func": 0.44249573945999143,
"step": 3390,
"toxic_reward": 4.304618096351623
},
{
"clip_ratio": 0.0,
"completion_length": 33.85,
"epoch": 0.8034026465028355,
"format_reward": -0.75,
"grad_norm": 6.779478549957275,
"image_reward": 0.254600016772747,
"kl": 3.2404680982232095,
"learning_rate": 5e-06,
"loss": -0.0622,
"reward": 1.1459296941757202,
"reward_std": 2.3130407273769378,
"rewards/reward_func": 1.1459296941757202,
"step": 3400,
"toxic_reward": 3.946528363227844
},
{
"clip_ratio": 0.0,
"completion_length": 49.475,
"epoch": 0.805765595463138,
"format_reward": -0.75,
"grad_norm": 9.082526206970215,
"image_reward": 0.2917332977056503,
"kl": 1.1568331263959408,
"learning_rate": 5e-06,
"loss": -0.087,
"reward": -0.22108137607574463,
"reward_std": 2.109957142919302,
"rewards/reward_func": -0.22108137607574463,
"step": 3410,
"toxic_reward": 3.467973506450653
},
{
"clip_ratio": 0.0,
"completion_length": 46.225,
"epoch": 0.8081285444234405,
"format_reward": 0.0,
"grad_norm": 4.748640537261963,
"image_reward": 0.2629201263189316,
"kl": 1.277670707181096,
"learning_rate": 5e-06,
"loss": 0.0912,
"reward": 0.5041390061378479,
"reward_std": 1.1238155417144298,
"rewards/reward_func": 0.5041390061378479,
"step": 3420,
"toxic_reward": 3.6773669004440306
},
{
"clip_ratio": 0.0,
"completion_length": 54.875,
"epoch": 0.8104914933837429,
"format_reward": -0.25,
"grad_norm": 6.2929182052612305,
"image_reward": 0.2664311736822128,
"kl": 0.41112807476893065,
"learning_rate": 5e-06,
"loss": -0.0185,
"reward": 0.25773588865995406,
"reward_std": 1.1975380808115006,
"rewards/reward_func": 0.25773588865995406,
"step": 3430,
"toxic_reward": 3.5475049674510957
},
{
"clip_ratio": 0.0,
"completion_length": 31.2,
"epoch": 0.8128544423440454,
"format_reward": 0.0,
"grad_norm": 2.984248399734497,
"image_reward": 0.29365132600069044,
"kl": 6.331273209676146,
"learning_rate": 5e-06,
"loss": -0.0094,
"reward": -0.1625719666481018,
"reward_std": 0.864103776961565,
"rewards/reward_func": -0.1625719666481018,
"step": 3440,
"toxic_reward": 3.899219441413879
},
{
"clip_ratio": 0.0,
"completion_length": 59.075,
"epoch": 0.8152173913043478,
"format_reward": 0.0,
"grad_norm": 5.733253479003906,
"image_reward": 0.28086344301700594,
"kl": 9.025772982649505,
"learning_rate": 5e-06,
"loss": -0.0245,
"reward": 0.4977319598197937,
"reward_std": 0.6485220491886139,
"rewards/reward_func": 0.4977319598197937,
"step": 3450,
"toxic_reward": 3.7944631457328795
},
{
"clip_ratio": 0.0,
"completion_length": 33.25,
"epoch": 0.8175803402646503,
"format_reward": -0.25,
"grad_norm": 5.605562686920166,
"image_reward": 0.281341552734375,
"kl": 0.665616973862052,
"learning_rate": 5e-06,
"loss": -0.0068,
"reward": -0.26609439849853517,
"reward_std": 1.4688232390210032,
"rewards/reward_func": -0.26609439849853517,
"step": 3460,
"toxic_reward": 3.4837970972061156
},
{
"clip_ratio": 0.0,
"completion_length": 52.475,
"epoch": 0.8199432892249527,
"format_reward": -0.5,
"grad_norm": 2.6239664554595947,
"image_reward": 0.2684331268072128,
"kl": 1.5078430883586407,
"learning_rate": 5e-06,
"loss": 0.1555,
"reward": -0.12674018144607543,
"reward_std": 1.4365263484418391,
"rewards/reward_func": -0.12674018144607543,
"step": 3470,
"toxic_reward": 4.5540220737457275
},
{
"clip_ratio": 0.0,
"completion_length": 49.975,
"epoch": 0.8223062381852552,
"format_reward": -0.25,
"grad_norm": 8.734126091003418,
"image_reward": 0.2604085296392441,
"kl": 24.937382932007313,
"learning_rate": 5e-06,
"loss": -0.0011,
"reward": -0.20797204971313477,
"reward_std": 0.9569237198680639,
"rewards/reward_func": -0.20797204971313477,
"step": 3480,
"toxic_reward": 4.407214689254761
},
{
"clip_ratio": 0.0,
"completion_length": 60.625,
"epoch": 0.8246691871455577,
"format_reward": -0.25,
"grad_norm": 1.5907628536224365,
"image_reward": 0.2842885345220566,
"kl": 0.051335761044174436,
"learning_rate": 5e-06,
"loss": 0.0047,
"reward": 0.9079252362251282,
"reward_std": 1.1933536015450954,
"rewards/reward_func": 0.9079252362251282,
"step": 3490,
"toxic_reward": 4.199088740348816
},
{
"clip_ratio": 0.0,
"completion_length": 39.75,
"epoch": 0.8270321361058601,
"format_reward": -1.0,
"grad_norm": 1.5189018249511719,
"image_reward": 0.2938863128423691,
"kl": 6.8780351031571625,
"learning_rate": 5e-06,
"loss": 0.0792,
"reward": -0.6177874624729156,
"reward_std": 1.9702259879559278,
"rewards/reward_func": -0.6177874624729156,
"step": 3500,
"toxic_reward": 3.7184417486190795
},
{
"clip_ratio": 0.0,
"completion_length": 44.075,
"epoch": 0.8293950850661626,
"format_reward": -0.25,
"grad_norm": 0.5691888928413391,
"image_reward": 0.2792378753423691,
"kl": 0.0625603836029768,
"learning_rate": 5e-06,
"loss": -0.0245,
"reward": 0.39415156543254853,
"reward_std": 0.7689090168103576,
"rewards/reward_func": 0.39415156543254853,
"step": 3510,
"toxic_reward": 4.210167169570923
},
{
"clip_ratio": 0.0,
"completion_length": 48.675,
"epoch": 0.831758034026465,
"format_reward": 0.0,
"grad_norm": 2.8700907230377197,
"image_reward": 0.263861083984375,
"kl": 0.3225065166130662,
"learning_rate": 5e-06,
"loss": -0.0609,
"reward": 0.6327012300491333,
"reward_std": 0.980434575676918,
"rewards/reward_func": 0.6327012300491333,
"step": 3520,
"toxic_reward": 3.8261560261249543
},
{
"clip_ratio": 0.0,
"completion_length": 40.05,
"epoch": 0.8341209829867675,
"format_reward": 0.0,
"grad_norm": 1.0346537828445435,
"image_reward": 0.284771728515625,
"kl": 0.40898411339148877,
"learning_rate": 5e-06,
"loss": -0.0307,
"reward": 0.30759164690971375,
"reward_std": 0.6451162457466125,
"rewards/reward_func": 0.30759164690971375,
"step": 3530,
"toxic_reward": 4.171144628524781
},
{
"clip_ratio": 0.0,
"completion_length": 55.3,
"epoch": 0.8364839319470699,
"format_reward": -0.5,
"grad_norm": 5.951425075531006,
"image_reward": 0.3035013824701309,
"kl": 11.781208837591112,
"learning_rate": 5e-06,
"loss": 0.0257,
"reward": -0.1725111722946167,
"reward_std": 1.8102335507050156,
"rewards/reward_func": -0.1725111722946167,
"step": 3540,
"toxic_reward": 3.670738685131073
},
{
"clip_ratio": 0.0,
"completion_length": 68.975,
"epoch": 0.8388468809073724,
"format_reward": 0.0,
"grad_norm": 14.631609916687012,
"image_reward": 0.2706085205078125,
"kl": 4.56192576661706,
"learning_rate": 5e-06,
"loss": -0.033,
"reward": 1.4811566695570946,
"reward_std": 0.9509499605745078,
"rewards/reward_func": 1.4811566695570946,
"step": 3550,
"toxic_reward": 3.4709715723991392
},
{
"clip_ratio": 0.0,
"completion_length": 41.225,
"epoch": 0.8412098298676749,
"format_reward": -0.5,
"grad_norm": 7.567039489746094,
"image_reward": 0.2595652252435684,
"kl": 10.069495621696115,
"learning_rate": 5e-06,
"loss": -0.057,
"reward": 0.034914278984069826,
"reward_std": 2.0578875496983526,
"rewards/reward_func": 0.034914278984069826,
"step": 3560,
"toxic_reward": 3.711397814750671
},
{
"clip_ratio": 0.0,
"completion_length": 44.9,
"epoch": 0.8435727788279773,
"format_reward": 0.0,
"grad_norm": 2.5686023235321045,
"image_reward": 0.27446797788143157,
"kl": 1.5964453139342367,
"learning_rate": 5e-06,
"loss": -0.031,
"reward": 0.41478089690208436,
"reward_std": 0.6242949636653066,
"rewards/reward_func": 0.41478089690208436,
"step": 3570,
"toxic_reward": 4.057631134986877
},
{
"clip_ratio": 0.0,
"completion_length": 38.725,
"epoch": 0.8459357277882797,
"format_reward": -0.75,
"grad_norm": 18.62441062927246,
"image_reward": 0.26212361752986907,
"kl": 18.3546858407557,
"learning_rate": 5e-06,
"loss": -0.0125,
"reward": 0.3845840930938721,
"reward_std": 2.4349112689495085,
"rewards/reward_func": 0.3845840930938721,
"step": 3580,
"toxic_reward": 3.7492689728736877
},
{
"clip_ratio": 0.0,
"completion_length": 46.15,
"epoch": 0.8482986767485823,
"format_reward": -0.25,
"grad_norm": 1.5018891096115112,
"image_reward": 0.25630950927734375,
"kl": 1.5399845570325852,
"learning_rate": 5e-06,
"loss": 0.0064,
"reward": -0.4008509755134583,
"reward_std": 1.2334194054827095,
"rewards/reward_func": -0.4008509755134583,
"step": 3590,
"toxic_reward": 3.9793298959732057
},
{
"clip_ratio": 0.0,
"completion_length": 49.475,
"epoch": 0.8506616257088847,
"format_reward": 0.0,
"grad_norm": 1.118828296661377,
"image_reward": 0.279705810546875,
"kl": 2.2166069228202105,
"learning_rate": 5e-06,
"loss": -0.0439,
"reward": 0.722964608669281,
"reward_std": 0.7349236082285643,
"rewards/reward_func": 0.722964608669281,
"step": 3600,
"toxic_reward": 4.429630327224731
},
{
"clip_ratio": 0.0,
"completion_length": 47.8,
"epoch": 0.8530245746691871,
"format_reward": 0.0,
"grad_norm": 2.9072647094726562,
"image_reward": 0.2988444000482559,
"kl": 0.4259108882397413,
"learning_rate": 5e-06,
"loss": -0.139,
"reward": 0.36530678868293764,
"reward_std": 1.0169117324054242,
"rewards/reward_func": 0.36530678868293764,
"step": 3610,
"toxic_reward": 3.706578254699707
},
{
"clip_ratio": 0.0,
"completion_length": 34.05,
"epoch": 0.8553875236294896,
"format_reward": 0.0,
"grad_norm": 5.574492931365967,
"image_reward": 0.26971537321805955,
"kl": 0.8386783060617745,
"learning_rate": 5e-06,
"loss": 0.0196,
"reward": -0.043644605576992034,
"reward_std": 0.7492304600775241,
"rewards/reward_func": -0.043644605576992034,
"step": 3620,
"toxic_reward": 3.7889950960874557
},
{
"clip_ratio": 0.0,
"completion_length": 48.2,
"epoch": 0.8577504725897921,
"format_reward": 0.0,
"grad_norm": 0.6372764110565186,
"image_reward": 0.2799346923828125,
"kl": 0.09604998417198658,
"learning_rate": 5e-06,
"loss": -0.0245,
"reward": 0.31555656492710116,
"reward_std": 0.5240693692117929,
"rewards/reward_func": 0.31555656492710116,
"step": 3630,
"toxic_reward": 3.9185105204582213
},
{
"clip_ratio": 0.0,
"completion_length": 54.775,
"epoch": 0.8601134215500945,
"format_reward": 0.0,
"grad_norm": 1.1808196306228638,
"image_reward": 0.2841166198253632,
"kl": 29.371498390100896,
"learning_rate": 5e-06,
"loss": -0.0516,
"reward": 0.26911270916461943,
"reward_std": 0.5647319633513689,
"rewards/reward_func": 0.26911270916461943,
"step": 3640,
"toxic_reward": 3.410293960571289
},
{
"clip_ratio": 0.0,
"completion_length": 39.35,
"epoch": 0.8624763705103969,
"format_reward": -0.25,
"grad_norm": 0.7336105108261108,
"image_reward": 0.26638386994600294,
"kl": 2.7957767372950912,
"learning_rate": 5e-06,
"loss": 0.0919,
"reward": 0.25321381688117983,
"reward_std": 1.543316999450326,
"rewards/reward_func": 0.25321381688117983,
"step": 3650,
"toxic_reward": 3.6566759824752806
},
{
"clip_ratio": 0.0,
"completion_length": 46.75,
"epoch": 0.8648393194706995,
"format_reward": -0.25,
"grad_norm": 0.6029968857765198,
"image_reward": 0.2995513916015625,
"kl": 2.5597430652938784,
"learning_rate": 5e-06,
"loss": -0.0974,
"reward": 0.32535398602485655,
"reward_std": 1.3309460416436196,
"rewards/reward_func": 0.32535398602485655,
"step": 3660,
"toxic_reward": 3.65915470123291
},
{
"clip_ratio": 0.0,
"completion_length": 35.275,
"epoch": 0.8672022684310019,
"format_reward": -0.75,
"grad_norm": 3.770862102508545,
"image_reward": 0.291839599609375,
"kl": 24.679713291302324,
"learning_rate": 5e-06,
"loss": -0.0464,
"reward": -0.6877359867095947,
"reward_std": 2.206609180383384,
"rewards/reward_func": -0.6877359867095947,
"step": 3670,
"toxic_reward": 3.764638936519623
},
{
"clip_ratio": 0.0,
"completion_length": 44.9,
"epoch": 0.8695652173913043,
"format_reward": -0.25,
"grad_norm": 0.5417113304138184,
"image_reward": 0.26164347380399705,
"kl": 0.35583615899085996,
"learning_rate": 5e-06,
"loss": 0.0668,
"reward": -0.214414319396019,
"reward_std": 1.3576272014528512,
"rewards/reward_func": -0.214414319396019,
"step": 3680,
"toxic_reward": 3.65915904045105
},
{
"clip_ratio": 0.0,
"completion_length": 38.125,
"epoch": 0.8719281663516069,
"format_reward": 0.0,
"grad_norm": 0.4714978039264679,
"image_reward": 0.278389485180378,
"kl": 2.4852739069610834,
"learning_rate": 5e-06,
"loss": -0.0177,
"reward": 0.1318028151988983,
"reward_std": 0.8923286706209183,
"rewards/reward_func": 0.1318028151988983,
"step": 3690,
"toxic_reward": 3.508782708644867
},
{
"clip_ratio": 0.0,
"completion_length": 47.05,
"epoch": 0.8742911153119093,
"format_reward": 0.0,
"grad_norm": 0.8036189079284668,
"image_reward": 0.246942138671875,
"kl": 17.79970283471048,
"learning_rate": 5e-06,
"loss": -0.0525,
"reward": 0.44554237723350526,
"reward_std": 0.8977296775206923,
"rewards/reward_func": 0.44554237723350526,
"step": 3700,
"toxic_reward": 3.5204819679260253
},
{
"clip_ratio": 0.0,
"completion_length": 37.15,
"epoch": 0.8766540642722117,
"format_reward": 0.0,
"grad_norm": 0.9322050213813782,
"image_reward": 0.2607330322265625,
"kl": 4.579995289538056,
"learning_rate": 5e-06,
"loss": -0.0918,
"reward": 0.7257406830787658,
"reward_std": 0.7061707813292741,
"rewards/reward_func": 0.7257406830787658,
"step": 3710,
"toxic_reward": 3.967698335647583
},
{
"clip_ratio": 0.0,
"completion_length": 32.9,
"epoch": 0.8790170132325141,
"format_reward": -0.5,
"grad_norm": 0.7602401971817017,
"image_reward": 0.277716064453125,
"kl": 5.202583113871515,
"learning_rate": 5e-06,
"loss": 0.0093,
"reward": -0.1386810451745987,
"reward_std": 1.5367558933794498,
"rewards/reward_func": -0.1386810451745987,
"step": 3720,
"toxic_reward": 3.343963861465454
},
{
"clip_ratio": 0.0,
"completion_length": 46.45,
"epoch": 0.8813799621928167,
"format_reward": 0.0,
"grad_norm": 0.49207255244255066,
"image_reward": 0.2617726638913155,
"kl": 3.0696171432733537,
"learning_rate": 5e-06,
"loss": 0.0096,
"reward": 0.8290068447589874,
"reward_std": 0.6912821188569069,
"rewards/reward_func": 0.8290068447589874,
"step": 3730,
"toxic_reward": 4.308743190765381
},
{
"clip_ratio": 0.0,
"completion_length": 47.125,
"epoch": 0.8837429111531191,
"format_reward": 0.0,
"grad_norm": 0.5754015445709229,
"image_reward": 0.25755615234375,
"kl": 1.463007004186511,
"learning_rate": 5e-06,
"loss": -0.074,
"reward": 1.1725465416908265,
"reward_std": 0.7939416155219078,
"rewards/reward_func": 1.1725465416908265,
"step": 3740,
"toxic_reward": 3.892818683385849
},
{
"clip_ratio": 0.0,
"completion_length": 45.65,
"epoch": 0.8861058601134215,
"format_reward": -0.5,
"grad_norm": 0.3917323350906372,
"image_reward": 0.2610259994864464,
"kl": 3.9046508548781276,
"learning_rate": 5e-06,
"loss": -0.0114,
"reward": 0.1690664052963257,
"reward_std": 1.9762837937101723,
"rewards/reward_func": 0.1690664052963257,
"step": 3750,
"toxic_reward": 3.7723870635032655
},
{
"clip_ratio": 0.0,
"completion_length": 45.45,
"epoch": 0.888468809073724,
"format_reward": 0.0,
"grad_norm": 0.6322398781776428,
"image_reward": 0.27677764892578127,
"kl": 1.8196211833506823,
"learning_rate": 5e-06,
"loss": -0.12,
"reward": 0.42850649207830427,
"reward_std": 0.5486618679948151,
"rewards/reward_func": 0.42850649207830427,
"step": 3760,
"toxic_reward": 3.4346215546131136
},
{
"clip_ratio": 0.0,
"completion_length": 44.75,
"epoch": 0.8908317580340265,
"format_reward": 0.0,
"grad_norm": 0.3245849013328552,
"image_reward": 0.29166819155216217,
"kl": 10.705555348284543,
"learning_rate": 5e-06,
"loss": -0.0603,
"reward": 0.061820387840270996,
"reward_std": 1.08290204256773,
"rewards/reward_func": 0.061820387840270996,
"step": 3770,
"toxic_reward": 2.949862742424011
},
{
"clip_ratio": 0.0,
"completion_length": 47.0,
"epoch": 0.8931947069943289,
"format_reward": -0.25,
"grad_norm": 0.3298509418964386,
"image_reward": 0.290167236328125,
"kl": 0.07300702948123217,
"learning_rate": 5e-06,
"loss": -0.0171,
"reward": 0.06625822186470032,
"reward_std": 1.0081432062666864,
"rewards/reward_func": 0.06625822186470032,
"step": 3780,
"toxic_reward": 4.22382138967514
},
{
"clip_ratio": 0.0,
"completion_length": 38.4,
"epoch": 0.8955576559546313,
"format_reward": 0.0,
"grad_norm": 0.698654055595398,
"image_reward": 0.27091064453125,
"kl": 4.801618622988462,
"learning_rate": 5e-06,
"loss": -0.0591,
"reward": 0.3187494039535522,
"reward_std": 0.5140533071011305,
"rewards/reward_func": 0.3187494039535522,
"step": 3790,
"toxic_reward": 4.416417121887207
},
{
"clip_ratio": 0.0,
"completion_length": 46.6,
"epoch": 0.8979206049149339,
"format_reward": -0.25,
"grad_norm": 0.6394158601760864,
"image_reward": 0.26355692744255066,
"kl": 3.265846297331154,
"learning_rate": 5e-06,
"loss": -0.0384,
"reward": -0.14046210050582886,
"reward_std": 1.0342714745551347,
"rewards/reward_func": -0.14046210050582886,
"step": 3800,
"toxic_reward": 4.3116097211837765
},
{
"clip_ratio": 0.0,
"completion_length": 50.7,
"epoch": 0.9002835538752363,
"format_reward": -0.5,
"grad_norm": 0.7541901469230652,
"image_reward": 0.2673909515142441,
"kl": 0.7993329163640738,
"learning_rate": 5e-06,
"loss": 0.0777,
"reward": 0.010242342948913574,
"reward_std": 1.4442682154476643,
"rewards/reward_func": 0.010242342948913574,
"step": 3810,
"toxic_reward": 4.425883173942566
},
{
"clip_ratio": 0.0,
"completion_length": 42.225,
"epoch": 0.9026465028355387,
"format_reward": 0.0,
"grad_norm": 0.8831507563591003,
"image_reward": 0.29705912470817564,
"kl": 3.6087327402085068,
"learning_rate": 5e-06,
"loss": -0.111,
"reward": 0.8021630614995956,
"reward_std": 0.8431573905050754,
"rewards/reward_func": 0.8021630614995956,
"step": 3820,
"toxic_reward": 3.6668890714645386
},
{
"clip_ratio": 0.0,
"completion_length": 48.85,
"epoch": 0.9050094517958412,
"format_reward": 0.0,
"grad_norm": 1.166309118270874,
"image_reward": 0.27442220151424407,
"kl": 3.696834401600063,
"learning_rate": 5e-06,
"loss": 0.023,
"reward": 0.46357709765434263,
"reward_std": 0.5384013399481773,
"rewards/reward_func": 0.46357709765434263,
"step": 3830,
"toxic_reward": 4.282819819450379
},
{
"clip_ratio": 0.0,
"completion_length": 45.275,
"epoch": 0.9073724007561437,
"format_reward": -0.25,
"grad_norm": 2.2214293479919434,
"image_reward": 0.29029541015625,
"kl": 6.355313093215227,
"learning_rate": 5e-06,
"loss": 0.0587,
"reward": 0.36757221817970276,
"reward_std": 1.1468286462128163,
"rewards/reward_func": 0.36757221817970276,
"step": 3840,
"toxic_reward": 3.8713893949985505
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 0.9097353497164461,
"format_reward": 0.0,
"grad_norm": 0.7023747563362122,
"image_reward": 0.2795267730951309,
"kl": 0.12285411208868027,
"learning_rate": 5e-06,
"loss": 0.0277,
"reward": 0.6907171040773392,
"reward_std": 0.8528184913098812,
"rewards/reward_func": 0.6907171040773392,
"step": 3850,
"toxic_reward": 3.9646514534950255
},
{
"clip_ratio": 0.0,
"completion_length": 54.0,
"epoch": 0.9120982986767486,
"format_reward": -0.25,
"grad_norm": 0.6574695706367493,
"image_reward": 0.277626545727253,
"kl": 3.223006421420723,
"learning_rate": 5e-06,
"loss": 0.1019,
"reward": 0.17425565123558046,
"reward_std": 1.0788604862987996,
"rewards/reward_func": 0.17425565123558046,
"step": 3860,
"toxic_reward": 3.892214322090149
},
{
"clip_ratio": 0.0,
"completion_length": 40.575,
"epoch": 0.9144612476370511,
"format_reward": 0.0,
"grad_norm": 1.6060093641281128,
"image_reward": 0.25700276643037795,
"kl": 1.728565347008407,
"learning_rate": 5e-06,
"loss": -0.0095,
"reward": 0.2703657388687134,
"reward_std": 0.8089243900030851,
"rewards/reward_func": 0.2703657388687134,
"step": 3870,
"toxic_reward": 4.175320339202881
},
{
"clip_ratio": 0.0,
"completion_length": 44.275,
"epoch": 0.9168241965973535,
"format_reward": 0.0,
"grad_norm": 2.603025436401367,
"image_reward": 0.2737335205078125,
"kl": 0.7518249765969813,
"learning_rate": 5e-06,
"loss": -0.0306,
"reward": 0.8955561727285385,
"reward_std": 1.1253668650984765,
"rewards/reward_func": 0.8955561727285385,
"step": 3880,
"toxic_reward": 3.5497735261917116
},
{
"clip_ratio": 0.0,
"completion_length": 44.875,
"epoch": 0.9191871455576559,
"format_reward": -0.25,
"grad_norm": 0.6174436211585999,
"image_reward": 0.2833251953125,
"kl": 1.3900917531922459,
"learning_rate": 5e-06,
"loss": 0.0533,
"reward": 0.8824085891246796,
"reward_std": 1.2487390112131833,
"rewards/reward_func": 0.8824085891246796,
"step": 3890,
"toxic_reward": 3.764987659454346
},
{
"clip_ratio": 0.0,
"completion_length": 50.1,
"epoch": 0.9215500945179584,
"format_reward": -0.5,
"grad_norm": 0.8587064146995544,
"image_reward": 0.25449015349149706,
"kl": 3.2844431857578456,
"learning_rate": 5e-06,
"loss": 0.0836,
"reward": 0.17285645604133607,
"reward_std": 1.4729075387120247,
"rewards/reward_func": 0.17285645604133607,
"step": 3900,
"toxic_reward": 4.319640278816223
},
{
"clip_ratio": 0.0,
"completion_length": 54.65,
"epoch": 0.9239130434782609,
"format_reward": -0.25,
"grad_norm": 0.7836766242980957,
"image_reward": 0.2760904937982559,
"kl": 0.04128519091755152,
"learning_rate": 5e-06,
"loss": 0.0218,
"reward": -0.14393893480300904,
"reward_std": 1.2086152411997317,
"rewards/reward_func": -0.14393893480300904,
"step": 3910,
"toxic_reward": 3.988687515258789
},
{
"clip_ratio": 0.0,
"completion_length": 55.525,
"epoch": 0.9262759924385633,
"format_reward": 0.0,
"grad_norm": 1.0223326683044434,
"image_reward": 0.237677001953125,
"kl": 0.10622669160366058,
"learning_rate": 5e-06,
"loss": 0.1241,
"reward": 0.8052110552787781,
"reward_std": 0.809264022950083,
"rewards/reward_func": 0.8052110552787781,
"step": 3920,
"toxic_reward": 4.316140675544739
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 0.9286389413988658,
"format_reward": -0.25,
"grad_norm": 1.2948088645935059,
"image_reward": 0.27791646122932434,
"kl": 2.2056565455161037,
"learning_rate": 5e-06,
"loss": 0.0904,
"reward": 0.5610605776309967,
"reward_std": 0.9484948962926865,
"rewards/reward_func": 0.5610605776309967,
"step": 3930,
"toxic_reward": 4.4695143699646
},
{
"clip_ratio": 0.0,
"completion_length": 47.825,
"epoch": 0.9310018903591682,
"format_reward": 0.0,
"grad_norm": 1.0040950775146484,
"image_reward": 0.27231852263212203,
"kl": 2.655760496482253,
"learning_rate": 5e-06,
"loss": -0.0755,
"reward": 0.263138085603714,
"reward_std": 0.4817726358771324,
"rewards/reward_func": 0.263138085603714,
"step": 3940,
"toxic_reward": 4.636347913742066
},
{
"clip_ratio": 0.0,
"completion_length": 54.25,
"epoch": 0.9333648393194707,
"format_reward": -0.25,
"grad_norm": 0.39709585905075073,
"image_reward": 0.2637420654296875,
"kl": 0.1439337281510234,
"learning_rate": 5e-06,
"loss": -0.0772,
"reward": 0.04962950348854065,
"reward_std": 0.781620041653514,
"rewards/reward_func": 0.04962950348854065,
"step": 3950,
"toxic_reward": 4.751176500320435
},
{
"clip_ratio": 0.0,
"completion_length": 45.9,
"epoch": 0.9357277882797732,
"format_reward": -0.25,
"grad_norm": 0.8190930485725403,
"image_reward": 0.2555938705801964,
"kl": 5.330091013200581,
"learning_rate": 5e-06,
"loss": -0.0093,
"reward": -0.16106579303741456,
"reward_std": 1.2331121437251569,
"rewards/reward_func": -0.16106579303741456,
"step": 3960,
"toxic_reward": 4.007374119758606
},
{
"clip_ratio": 0.0,
"completion_length": 45.75,
"epoch": 0.9380907372400756,
"format_reward": 0.0,
"grad_norm": 1.0821632146835327,
"image_reward": 0.3061696380376816,
"kl": 6.141950584948063,
"learning_rate": 5e-06,
"loss": 0.1307,
"reward": 0.625621622800827,
"reward_std": 0.8008190289139747,
"rewards/reward_func": 0.625621622800827,
"step": 3970,
"toxic_reward": 3.468269979953766
},
{
"clip_ratio": 0.0,
"completion_length": 47.625,
"epoch": 0.9404536862003781,
"format_reward": -0.75,
"grad_norm": 1.1129677295684814,
"image_reward": 0.26193033903837204,
"kl": 0.6634119726717472,
"learning_rate": 5e-06,
"loss": -0.0429,
"reward": 0.13726072907447814,
"reward_std": 2.353568767011166,
"rewards/reward_func": 0.13726072907447814,
"step": 3980,
"toxic_reward": 4.0013970851898195
},
{
"clip_ratio": 0.0,
"completion_length": 48.35,
"epoch": 0.9428166351606805,
"format_reward": 0.0,
"grad_norm": 0.7701426148414612,
"image_reward": 0.28350016176700593,
"kl": 0.1994122840464115,
"learning_rate": 5e-06,
"loss": -0.0421,
"reward": 0.7691244065761567,
"reward_std": 0.9025557667016983,
"rewards/reward_func": 0.7691244065761567,
"step": 3990,
"toxic_reward": 4.340523219108581
},
{
"clip_ratio": 0.0,
"completion_length": 46.225,
"epoch": 0.945179584120983,
"format_reward": 0.0,
"grad_norm": 0.46611157059669495,
"image_reward": 0.26962890625,
"kl": 0.047311073541641234,
"learning_rate": 5e-06,
"loss": -0.1003,
"reward": 1.3654770731925965,
"reward_std": 0.657595872040838,
"rewards/reward_func": 1.3654770731925965,
"step": 4000,
"toxic_reward": 3.765986955165863
},
{
"clip_ratio": 0.0,
"completion_length": 55.675,
"epoch": 0.9475425330812854,
"format_reward": -0.25,
"grad_norm": 0.730478048324585,
"image_reward": 0.2595326751470566,
"kl": 2.0693125385791062,
"learning_rate": 5e-06,
"loss": -0.0531,
"reward": 0.16633399724960327,
"reward_std": 1.2444878976792098,
"rewards/reward_func": 0.16633399724960327,
"step": 4010,
"toxic_reward": 3.9091518998146055
},
{
"clip_ratio": 0.0,
"completion_length": 41.775,
"epoch": 0.9499054820415879,
"format_reward": -0.25,
"grad_norm": 0.7307797074317932,
"image_reward": 0.2784047439694405,
"kl": 1.5403530787676574,
"learning_rate": 5e-06,
"loss": 0.01,
"reward": 0.15964727997779846,
"reward_std": 1.2297844395041466,
"rewards/reward_func": 0.15964727997779846,
"step": 4020,
"toxic_reward": 4.325857400894165
},
{
"clip_ratio": 0.0,
"completion_length": 53.275,
"epoch": 0.9522684310018904,
"format_reward": -0.5,
"grad_norm": 1.158098816871643,
"image_reward": 0.240879312902689,
"kl": 1.8537536807358266,
"learning_rate": 5e-06,
"loss": 0.1782,
"reward": 0.5329648047685623,
"reward_std": 1.5547814331948757,
"rewards/reward_func": 0.5329648047685623,
"step": 4030,
"toxic_reward": 3.8254613667726516
},
{
"clip_ratio": 0.0,
"completion_length": 53.95,
"epoch": 0.9546313799621928,
"format_reward": 0.0,
"grad_norm": 0.5303730964660645,
"image_reward": 0.25118484497070315,
"kl": 0.187329238653183,
"learning_rate": 5e-06,
"loss": 0.0541,
"reward": 0.27067047357559204,
"reward_std": 0.7333962991833687,
"rewards/reward_func": 0.27067047357559204,
"step": 4040,
"toxic_reward": 4.245214033126831
},
{
"clip_ratio": 0.0,
"completion_length": 49.5,
"epoch": 0.9569943289224953,
"format_reward": -0.25,
"grad_norm": 0.8333770632743835,
"image_reward": 0.264398193359375,
"kl": 7.662982761859894,
"learning_rate": 5e-06,
"loss": 0.0304,
"reward": 0.26739619076251986,
"reward_std": 1.3646116882562638,
"rewards/reward_func": 0.26739619076251986,
"step": 4050,
"toxic_reward": 3.6070310473442078
},
{
"clip_ratio": 0.0,
"completion_length": 42.275,
"epoch": 0.9593572778827977,
"format_reward": -0.25,
"grad_norm": 1.021411657333374,
"image_reward": 0.281744384765625,
"kl": 2.6961711190640925,
"learning_rate": 5e-06,
"loss": 0.0528,
"reward": 0.1087444543838501,
"reward_std": 0.9241739958524704,
"rewards/reward_func": 0.1087444543838501,
"step": 4060,
"toxic_reward": 4.196173495054245
},
{
"clip_ratio": 0.0,
"completion_length": 42.575,
"epoch": 0.9617202268431002,
"format_reward": -0.25,
"grad_norm": 0.7931532859802246,
"image_reward": 0.25716959685087204,
"kl": 5.984370514377952,
"learning_rate": 5e-06,
"loss": -0.1524,
"reward": 0.09075822830200195,
"reward_std": 1.12701465934515,
"rewards/reward_func": 0.09075822830200195,
"step": 4070,
"toxic_reward": 4.376713454723358
},
{
"clip_ratio": 0.0,
"completion_length": 44.375,
"epoch": 0.9640831758034026,
"format_reward": -0.5,
"grad_norm": 0.7085260152816772,
"image_reward": 0.2577000930905342,
"kl": 1.8822400705888866,
"learning_rate": 5e-06,
"loss": 0.0404,
"reward": 0.17331230640411377,
"reward_std": 1.6633539475500583,
"rewards/reward_func": 0.17331230640411377,
"step": 4080,
"toxic_reward": 4.294411969184876
},
{
"clip_ratio": 0.0,
"completion_length": 42.8,
"epoch": 0.9664461247637051,
"format_reward": 0.0,
"grad_norm": 1.0063364505767822,
"image_reward": 0.250091552734375,
"kl": 0.14847910068929196,
"learning_rate": 5e-06,
"loss": -0.0732,
"reward": 1.0602999448776245,
"reward_std": 0.6203169705346226,
"rewards/reward_func": 1.0602999448776245,
"step": 4090,
"toxic_reward": 4.171542119979859
},
{
"clip_ratio": 0.0,
"completion_length": 52.85,
"epoch": 0.9688090737240076,
"format_reward": -0.25,
"grad_norm": 0.6620392203330994,
"image_reward": 0.2424652099609375,
"kl": 0.36983290296047927,
"learning_rate": 5e-06,
"loss": -0.0911,
"reward": 0.6753372728824616,
"reward_std": 1.2773339383304119,
"rewards/reward_func": 0.6753372728824616,
"step": 4100,
"toxic_reward": 4.280330467224121
},
{
"clip_ratio": 0.0,
"completion_length": 48.9,
"epoch": 0.97117202268431,
"format_reward": 0.0,
"grad_norm": 0.8530160188674927,
"image_reward": 0.2524658203125,
"kl": 4.212855443544686,
"learning_rate": 5e-06,
"loss": 0.0813,
"reward": 0.6060003638267517,
"reward_std": 0.9568195153027773,
"rewards/reward_func": 0.6060003638267517,
"step": 4110,
"toxic_reward": 3.8215681195259092
},
{
"clip_ratio": 0.0,
"completion_length": 44.85,
"epoch": 0.9735349716446124,
"format_reward": -0.5,
"grad_norm": 1.7192955017089844,
"image_reward": 0.28581949770450593,
"kl": 10.378714705258608,
"learning_rate": 5e-06,
"loss": 0.0248,
"reward": 0.10166561603546143,
"reward_std": 1.791293729841709,
"rewards/reward_func": 0.10166561603546143,
"step": 4120,
"toxic_reward": 3.5509902030229568
},
{
"clip_ratio": 0.0,
"completion_length": 55.725,
"epoch": 0.975897920604915,
"format_reward": 0.0,
"grad_norm": 0.8529999852180481,
"image_reward": 0.2569305419921875,
"kl": 8.308781201578677,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.7349396765232086,
"reward_std": 0.4486356295645237,
"rewards/reward_func": 0.7349396765232086,
"step": 4130,
"toxic_reward": 4.5674937725067135
},
{
"clip_ratio": 0.0,
"completion_length": 50.925,
"epoch": 0.9782608695652174,
"format_reward": 0.0,
"grad_norm": 0.9192355275154114,
"image_reward": 0.273162841796875,
"kl": 2.731711974926293,
"learning_rate": 5e-06,
"loss": -0.0006,
"reward": -0.20568010210990906,
"reward_std": 0.6350222621113062,
"rewards/reward_func": -0.20568010210990906,
"step": 4140,
"toxic_reward": 4.01632958650589
},
{
"clip_ratio": 0.0,
"completion_length": 44.875,
"epoch": 0.9806238185255198,
"format_reward": 0.0,
"grad_norm": 0.7154003977775574,
"image_reward": 0.25846659392118454,
"kl": 3.1543860264122485,
"learning_rate": 5e-06,
"loss": -0.0384,
"reward": 0.15666076242923738,
"reward_std": 1.1065492704510689,
"rewards/reward_func": 0.15666076242923738,
"step": 4150,
"toxic_reward": 3.2047137916088104
},
{
"clip_ratio": 0.0,
"completion_length": 44.225,
"epoch": 0.9829867674858223,
"format_reward": -0.25,
"grad_norm": 0.6323632001876831,
"image_reward": 0.2768702179193497,
"kl": 4.070834948495031,
"learning_rate": 5e-06,
"loss": 0.1427,
"reward": 0.21166958212852477,
"reward_std": 1.1970111442729832,
"rewards/reward_func": 0.21166958212852477,
"step": 4160,
"toxic_reward": 4.128625917434692
},
{
"clip_ratio": 0.0,
"completion_length": 46.425,
"epoch": 0.9853497164461248,
"format_reward": 0.0,
"grad_norm": 0.5803432464599609,
"image_reward": 0.257720947265625,
"kl": 1.2115541946142911,
"learning_rate": 5e-06,
"loss": -0.0251,
"reward": 0.26483882069587705,
"reward_std": 0.8841663489118219,
"rewards/reward_func": 0.26483882069587705,
"step": 4170,
"toxic_reward": 3.953411507606506
},
{
"clip_ratio": 0.0,
"completion_length": 43.05,
"epoch": 0.9877126654064272,
"format_reward": -0.25,
"grad_norm": 1.0321141481399536,
"image_reward": 0.2662984222173691,
"kl": 12.658018402941526,
"learning_rate": 5e-06,
"loss": 0.0031,
"reward": 0.850147670507431,
"reward_std": 1.0917948484420776,
"rewards/reward_func": 0.850147670507431,
"step": 4180,
"toxic_reward": 4.424872517585754
},
{
"clip_ratio": 0.0,
"completion_length": 33.775,
"epoch": 0.9900756143667296,
"format_reward": -0.25,
"grad_norm": 2.934152603149414,
"image_reward": 0.28038330078125,
"kl": 0.22834131643176078,
"learning_rate": 5e-06,
"loss": -0.12,
"reward": -0.05527897924184799,
"reward_std": 1.090353344194591,
"rewards/reward_func": -0.05527897924184799,
"step": 4190,
"toxic_reward": 4.094451707601547
},
{
"clip_ratio": 0.0,
"completion_length": 47.175,
"epoch": 0.9924385633270322,
"format_reward": -0.5,
"grad_norm": 1.2237070798873901,
"image_reward": 0.28351847380399703,
"kl": 11.847508652508258,
"learning_rate": 5e-06,
"loss": -0.078,
"reward": -0.27686416208744047,
"reward_std": 1.4433475863188505,
"rewards/reward_func": -0.27686416208744047,
"step": 4200,
"toxic_reward": 4.032291853427887
},
{
"clip_ratio": 0.0,
"completion_length": 40.3,
"epoch": 0.9948015122873346,
"format_reward": 0.0,
"grad_norm": 1.00357985496521,
"image_reward": 0.26925506591796877,
"kl": 6.287641528248787,
"learning_rate": 5e-06,
"loss": -0.0509,
"reward": 0.26841793656349183,
"reward_std": 0.7431968785822392,
"rewards/reward_func": 0.26841793656349183,
"step": 4210,
"toxic_reward": 3.797722101211548
},
{
"clip_ratio": 0.0,
"completion_length": 54.4,
"epoch": 0.997164461247637,
"format_reward": -0.5,
"grad_norm": 1.412477731704712,
"image_reward": 0.2857859283685684,
"kl": 0.20840035788714886,
"learning_rate": 5e-06,
"loss": -0.01,
"reward": 0.49851550459861754,
"reward_std": 1.4509758695960044,
"rewards/reward_func": 0.49851550459861754,
"step": 4220,
"toxic_reward": 4.4275671482086185
},
{
"clip_ratio": 0.0,
"completion_length": 51.575,
"epoch": 0.9995274102079396,
"format_reward": 0.0,
"grad_norm": 1.3644284009933472,
"image_reward": 0.2857421875,
"kl": 14.352099673077465,
"learning_rate": 5e-06,
"loss": -0.1247,
"reward": 0.5606966435909271,
"reward_std": 0.5899959981441498,
"rewards/reward_func": 0.5606966435909271,
"step": 4230,
"toxic_reward": 4.210422110557556
},
{
"clip_ratio": 0.0,
"completion_length": 45.975,
"epoch": 1.001890359168242,
"format_reward": -0.25,
"grad_norm": 4.210316181182861,
"image_reward": 0.28163655698299406,
"kl": 6.0347686521708965,
"learning_rate": 5e-06,
"loss": -0.0289,
"reward": 0.11643823981285095,
"reward_std": 1.107480544038117,
"rewards/reward_func": 0.11643823981285095,
"step": 4240,
"toxic_reward": 3.994339680671692
},
{
"clip_ratio": 0.0,
"completion_length": 54.3,
"epoch": 1.0042533081285445,
"format_reward": 0.0,
"grad_norm": 3.516270637512207,
"image_reward": 0.2680999755859375,
"kl": 1.3430524323135615,
"learning_rate": 5e-06,
"loss": 0.0258,
"reward": 1.2558865308761598,
"reward_std": 0.9449932537972927,
"rewards/reward_func": 1.2558865308761598,
"step": 4250,
"toxic_reward": 4.3164361953735355
},
{
"clip_ratio": 0.0,
"completion_length": 53.1,
"epoch": 1.0066162570888468,
"format_reward": 0.0,
"grad_norm": 2.1102194786071777,
"image_reward": 0.25366058349609377,
"kl": 0.149768141284585,
"learning_rate": 5e-06,
"loss": -0.0742,
"reward": 0.06790508627891541,
"reward_std": 0.6080379813909531,
"rewards/reward_func": 0.06790508627891541,
"step": 4260,
"toxic_reward": 4.386375617980957
},
{
"clip_ratio": 0.0,
"completion_length": 52.325,
"epoch": 1.0089792060491494,
"format_reward": -0.25,
"grad_norm": 4.862875461578369,
"image_reward": 0.2832529693841934,
"kl": 2.053416795656085,
"learning_rate": 5e-06,
"loss": -0.0907,
"reward": 0.30691148042678834,
"reward_std": 1.6981020882725715,
"rewards/reward_func": 0.30691148042678834,
"step": 4270,
"toxic_reward": 3.5000792026519774
},
{
"clip_ratio": 0.0,
"completion_length": 49.05,
"epoch": 1.011342155009452,
"format_reward": -0.25,
"grad_norm": 3.5172159671783447,
"image_reward": 0.2263885498046875,
"kl": 0.14171482473611832,
"learning_rate": 5e-06,
"loss": -0.0082,
"reward": 0.5368665099143982,
"reward_std": 1.4538173630833626,
"rewards/reward_func": 0.5368665099143982,
"step": 4280,
"toxic_reward": 4.501732063293457
},
{
"clip_ratio": 0.0,
"completion_length": 50.1,
"epoch": 1.0137051039697542,
"format_reward": 0.0,
"grad_norm": 0.6869735717773438,
"image_reward": 0.266131591796875,
"kl": 17.507234007120132,
"learning_rate": 5e-06,
"loss": -0.0165,
"reward": 0.595378065109253,
"reward_std": 0.6132703861221671,
"rewards/reward_func": 0.595378065109253,
"step": 4290,
"toxic_reward": 4.223924076557159
},
{
"clip_ratio": 0.0,
"completion_length": 51.8,
"epoch": 1.0160680529300568,
"format_reward": -0.5,
"grad_norm": 7.046621322631836,
"image_reward": 0.2617146819829941,
"kl": 0.1885729007422924,
"learning_rate": 5e-06,
"loss": -0.0513,
"reward": 0.02859283685684204,
"reward_std": 1.7149874530732632,
"rewards/reward_func": 0.02859283685684204,
"step": 4300,
"toxic_reward": 4.371392369270325
},
{
"clip_ratio": 0.0,
"completion_length": 40.6,
"epoch": 1.018431001890359,
"format_reward": -0.25,
"grad_norm": 4.381049156188965,
"image_reward": 0.2654388427734375,
"kl": 1.2927639432251454,
"learning_rate": 5e-06,
"loss": -0.0052,
"reward": -0.0640803337097168,
"reward_std": 1.4367546334862709,
"rewards/reward_func": -0.0640803337097168,
"step": 4310,
"toxic_reward": 4.212549781799316
},
{
"clip_ratio": 0.0,
"completion_length": 51.9,
"epoch": 1.0207939508506616,
"format_reward": 0.0,
"grad_norm": 1.0743227005004883,
"image_reward": 0.255714924633503,
"kl": 16.378241488710046,
"learning_rate": 5e-06,
"loss": -0.1016,
"reward": 0.3336408376693726,
"reward_std": 0.6735909695737063,
"rewards/reward_func": 0.3336408376693726,
"step": 4320,
"toxic_reward": 4.5154483914375305
},
{
"clip_ratio": 0.0,
"completion_length": 49.4,
"epoch": 1.0231568998109641,
"format_reward": -0.25,
"grad_norm": 1.6660760641098022,
"image_reward": 0.24780476838350296,
"kl": 11.463303370773792,
"learning_rate": 5e-06,
"loss": -0.0995,
"reward": -0.0025091707706451417,
"reward_std": 1.6661910176277162,
"rewards/reward_func": -0.0025091707706451417,
"step": 4330,
"toxic_reward": 3.852264070510864
},
{
"clip_ratio": 0.0,
"completion_length": 42.4,
"epoch": 1.0255198487712665,
"format_reward": 0.0,
"grad_norm": 4.4377121925354,
"image_reward": 0.2486114501953125,
"kl": 0.12634929567575454,
"learning_rate": 5e-06,
"loss": 0.0148,
"reward": 0.518048095703125,
"reward_std": 1.0762871712446214,
"rewards/reward_func": 0.518048095703125,
"step": 4340,
"toxic_reward": 4.010181951522827
},
{
"clip_ratio": 0.0,
"completion_length": 47.875,
"epoch": 1.027882797731569,
"format_reward": -0.25,
"grad_norm": 1.9050147533416748,
"image_reward": 0.2529998779296875,
"kl": 0.20360449738800526,
"learning_rate": 5e-06,
"loss": 0.0104,
"reward": -0.05591415464878082,
"reward_std": 1.2363329231739044,
"rewards/reward_func": -0.05591415464878082,
"step": 4350,
"toxic_reward": 4.434300184249878
},
{
"clip_ratio": 0.0,
"completion_length": 48.9,
"epoch": 1.0302457466918715,
"format_reward": -0.25,
"grad_norm": 2.783447742462158,
"image_reward": 0.25158691257238386,
"kl": 0.14820914287120104,
"learning_rate": 5e-06,
"loss": 0.0709,
"reward": 0.2038910448551178,
"reward_std": 1.113127877563238,
"rewards/reward_func": 0.2038910448551178,
"step": 4360,
"toxic_reward": 4.226523244380951
},
{
"clip_ratio": 0.0,
"completion_length": 58.85,
"epoch": 1.0326086956521738,
"format_reward": -0.25,
"grad_norm": 0.6769667863845825,
"image_reward": 0.2698781341314316,
"kl": 1.560221792012453,
"learning_rate": 5e-06,
"loss": 0.0453,
"reward": -0.25547429323196413,
"reward_std": 1.2353890612721443,
"rewards/reward_func": -0.25547429323196413,
"step": 4370,
"toxic_reward": 4.451306319236755
},
{
"clip_ratio": 0.0,
"completion_length": 45.325,
"epoch": 1.0349716446124764,
"format_reward": -0.5,
"grad_norm": 1.5327138900756836,
"image_reward": 0.25219675749540327,
"kl": 8.099627137556672,
"learning_rate": 5e-06,
"loss": -0.0101,
"reward": -0.2395196735858917,
"reward_std": 1.6747881084680558,
"rewards/reward_func": -0.2395196735858917,
"step": 4380,
"toxic_reward": 4.392533135414124
},
{
"clip_ratio": 0.0,
"completion_length": 55.875,
"epoch": 1.037334593572779,
"format_reward": 0.0,
"grad_norm": 1.2079089879989624,
"image_reward": 0.2677134186029434,
"kl": 20.563808789849283,
"learning_rate": 5e-06,
"loss": 0.0416,
"reward": 0.5019214197993278,
"reward_std": 1.3155438639223576,
"rewards/reward_func": 0.5019214197993278,
"step": 4390,
"toxic_reward": 3.744106537103653
},
{
"clip_ratio": 0.0,
"completion_length": 45.575,
"epoch": 1.0396975425330812,
"format_reward": 0.0,
"grad_norm": 12.300169944763184,
"image_reward": 0.266143798828125,
"kl": 0.33298523649573325,
"learning_rate": 5e-06,
"loss": 0.0325,
"reward": 0.3428509056568146,
"reward_std": 0.6832939319312572,
"rewards/reward_func": 0.3428509056568146,
"step": 4400,
"toxic_reward": 3.938971757888794
},
{
"clip_ratio": 0.0,
"completion_length": 41.575,
"epoch": 1.0420604914933838,
"format_reward": 0.0,
"grad_norm": 13.788394927978516,
"image_reward": 0.2602081298828125,
"kl": 0.14277449063956738,
"learning_rate": 5e-06,
"loss": 0.0799,
"reward": 0.5224148035049438,
"reward_std": 0.5329875692725181,
"rewards/reward_func": 0.5224148035049438,
"step": 4410,
"toxic_reward": 4.6161150455474855
},
{
"clip_ratio": 0.0,
"completion_length": 52.45,
"epoch": 1.0444234404536863,
"format_reward": 0.0,
"grad_norm": 9.009355545043945,
"image_reward": 0.2745361328125,
"kl": 0.1908944919705391,
"learning_rate": 5e-06,
"loss": 0.0428,
"reward": 0.5507814303040505,
"reward_std": 0.7364906007423997,
"rewards/reward_func": 0.5507814303040505,
"step": 4420,
"toxic_reward": 3.738240921497345
},
{
"clip_ratio": 0.0,
"completion_length": 53.375,
"epoch": 1.0467863894139886,
"format_reward": -0.25,
"grad_norm": 0.9596161842346191,
"image_reward": 0.2600412994623184,
"kl": 0.2493920259177685,
"learning_rate": 5e-06,
"loss": 0.0546,
"reward": 0.3896596789360046,
"reward_std": 1.1463438659906386,
"rewards/reward_func": 0.3896596789360046,
"step": 4430,
"toxic_reward": 4.299828362464905
},
{
"clip_ratio": 0.0,
"completion_length": 45.6,
"epoch": 1.0491493383742911,
"format_reward": -0.5,
"grad_norm": 16.468900680541992,
"image_reward": 0.2660593673586845,
"kl": 1.7080695651471616,
"learning_rate": 5e-06,
"loss": 0.0381,
"reward": 0.23900684118270873,
"reward_std": 1.4474023096263409,
"rewards/reward_func": 0.23900684118270873,
"step": 4440,
"toxic_reward": 4.323360848426819
},
{
"clip_ratio": 0.0,
"completion_length": 43.175,
"epoch": 1.0515122873345937,
"format_reward": 0.0,
"grad_norm": 9.211010932922363,
"image_reward": 0.23970438539981842,
"kl": 2.5637484416365623,
"learning_rate": 5e-06,
"loss": 0.1362,
"reward": 1.0162243604660035,
"reward_std": 0.6415727452374995,
"rewards/reward_func": 1.0162243604660035,
"step": 4450,
"toxic_reward": 4.387140679359436
},
{
"clip_ratio": 0.0,
"completion_length": 52.875,
"epoch": 1.053875236294896,
"format_reward": -0.25,
"grad_norm": 7.96478796005249,
"image_reward": 0.279913330078125,
"kl": 1.5326927796006202,
"learning_rate": 5e-06,
"loss": 0.135,
"reward": -0.0030475854873657227,
"reward_std": 1.1649701196700335,
"rewards/reward_func": -0.0030475854873657227,
"step": 4460,
"toxic_reward": 4.197524422407151
},
{
"clip_ratio": 0.0,
"completion_length": 50.475,
"epoch": 1.0562381852551985,
"format_reward": 0.0,
"grad_norm": 18.094940185546875,
"image_reward": 0.25467529296875,
"kl": 0.3941259577870369,
"learning_rate": 5e-06,
"loss": -0.0321,
"reward": 0.24867143034934996,
"reward_std": 0.6323847549967467,
"rewards/reward_func": 0.24867143034934996,
"step": 4470,
"toxic_reward": 4.357023143768311
},
{
"clip_ratio": 0.0,
"completion_length": 48.6,
"epoch": 1.0586011342155008,
"format_reward": 0.0,
"grad_norm": 6.004316329956055,
"image_reward": 0.249798583984375,
"kl": 0.6100661933422089,
"learning_rate": 5e-06,
"loss": 0.054,
"reward": 0.3194288432598114,
"reward_std": 1.00972272567451,
"rewards/reward_func": 0.3194288432598114,
"step": 4480,
"toxic_reward": 3.6232224822044374
},
{
"clip_ratio": 0.0,
"completion_length": 41.275,
"epoch": 1.0609640831758034,
"format_reward": 0.0,
"grad_norm": 16.553150177001953,
"image_reward": 0.2777862548828125,
"kl": 1.2566918075084685,
"learning_rate": 5e-06,
"loss": 0.0033,
"reward": 0.3464995056390762,
"reward_std": 1.0610926449298859,
"rewards/reward_func": 0.3464995056390762,
"step": 4490,
"toxic_reward": 3.4315811157226563
},
{
"clip_ratio": 0.0,
"completion_length": 46.5,
"epoch": 1.063327032136106,
"format_reward": 0.0,
"grad_norm": 21.91239356994629,
"image_reward": 0.26402740478515624,
"kl": 2.7657025068998338,
"learning_rate": 5e-06,
"loss": -0.0088,
"reward": -0.017888635396957397,
"reward_std": 0.36575160175561905,
"rewards/reward_func": -0.017888635396957397,
"step": 4500,
"toxic_reward": 4.492252993583679
},
{
"clip_ratio": 0.0,
"completion_length": 52.175,
"epoch": 1.0656899810964082,
"format_reward": 0.0,
"grad_norm": 1.70862877368927,
"image_reward": 0.2537200927734375,
"kl": 0.9243695795536041,
"learning_rate": 5e-06,
"loss": -0.148,
"reward": 0.2666252374649048,
"reward_std": 0.8290498301386833,
"rewards/reward_func": 0.2666252374649048,
"step": 4510,
"toxic_reward": 4.025203084945678
},
{
"clip_ratio": 0.0,
"completion_length": 58.625,
"epoch": 1.0680529300567108,
"format_reward": 0.0,
"grad_norm": 4.76298189163208,
"image_reward": 0.2544342041015625,
"kl": 1.2314461708068847,
"learning_rate": 5e-06,
"loss": 0.0379,
"reward": 0.1297641634941101,
"reward_std": 0.7050925550982357,
"rewards/reward_func": 0.1297641634941101,
"step": 4520,
"toxic_reward": 4.568165302276611
},
{
"clip_ratio": 0.0,
"completion_length": 36.825,
"epoch": 1.0704158790170133,
"format_reward": 0.0,
"grad_norm": 8.698065757751465,
"image_reward": 0.274078369140625,
"kl": 1.325176051259041,
"learning_rate": 5e-06,
"loss": -0.0398,
"reward": 0.10026351213455201,
"reward_std": 0.812692479044199,
"rewards/reward_func": 0.10026351213455201,
"step": 4530,
"toxic_reward": 3.9686192631721497
},
{
"clip_ratio": 0.0,
"completion_length": 48.3,
"epoch": 1.0727788279773156,
"format_reward": 0.0,
"grad_norm": 19.877777099609375,
"image_reward": 0.2792388916015625,
"kl": 0.9205800026655198,
"learning_rate": 5e-06,
"loss": 0.1066,
"reward": 0.5690743923187256,
"reward_std": 0.6653784658759833,
"rewards/reward_func": 0.5690743923187256,
"step": 4540,
"toxic_reward": 3.964191234111786
},
{
"clip_ratio": 0.0,
"completion_length": 42.525,
"epoch": 1.0751417769376181,
"format_reward": -0.25,
"grad_norm": 2.515148878097534,
"image_reward": 0.23823343813419343,
"kl": 2.8265303134918214,
"learning_rate": 5e-06,
"loss": -0.1035,
"reward": 0.6135944664478302,
"reward_std": 1.5456651039421558,
"rewards/reward_func": 0.6135944664478302,
"step": 4550,
"toxic_reward": 4.405286359786987
},
{
"clip_ratio": 0.0,
"completion_length": 39.85,
"epoch": 1.0775047258979207,
"format_reward": 0.0,
"grad_norm": 16.40328025817871,
"image_reward": 0.26103515625,
"kl": 2.2788069248199463,
"learning_rate": 5e-06,
"loss": 0.0091,
"reward": 0.10576534271240234,
"reward_std": 0.39959471523761747,
"rewards/reward_func": 0.10576534271240234,
"step": 4560,
"toxic_reward": 4.633650445938111
},
{
"clip_ratio": 0.0,
"completion_length": 44.7,
"epoch": 1.079867674858223,
"format_reward": -0.5,
"grad_norm": 7.5780229568481445,
"image_reward": 0.26860554963350297,
"kl": 4.05745484828949,
"learning_rate": 5e-06,
"loss": 0.0631,
"reward": 0.6903072118759155,
"reward_std": 1.8377123966813087,
"rewards/reward_func": 0.6903072118759155,
"step": 4570,
"toxic_reward": 4.404474878311158
},
{
"clip_ratio": 0.0,
"completion_length": 69.25,
"epoch": 1.0822306238185255,
"format_reward": -0.25,
"grad_norm": 2.605886936187744,
"image_reward": 0.26128031462430956,
"kl": 1.1529910147190094,
"learning_rate": 5e-06,
"loss": 0.0061,
"reward": -0.4064223051071167,
"reward_std": 1.0936089092865586,
"rewards/reward_func": -0.4064223051071167,
"step": 4580,
"toxic_reward": 4.17680971622467
},
{
"clip_ratio": 0.0,
"completion_length": 56.225,
"epoch": 1.084593572778828,
"format_reward": 0.0,
"grad_norm": 2.201918601989746,
"image_reward": 0.26739501953125,
"kl": 0.7634936004877091,
"learning_rate": 5e-06,
"loss": 0.0832,
"reward": 0.2786406099796295,
"reward_std": 0.7699430305510759,
"rewards/reward_func": 0.2786406099796295,
"step": 4590,
"toxic_reward": 4.203878152370453
},
{
"clip_ratio": 0.0,
"completion_length": 47.75,
"epoch": 1.0869565217391304,
"format_reward": 0.0,
"grad_norm": 4.329306125640869,
"image_reward": 0.2691864013671875,
"kl": 1.415566897392273,
"learning_rate": 5e-06,
"loss": 0.0091,
"reward": 0.031065577268600465,
"reward_std": 0.9241972327232361,
"rewards/reward_func": 0.031065577268600465,
"step": 4600,
"toxic_reward": 3.4664461970329286
},
{
"clip_ratio": 0.0,
"completion_length": 41.825,
"epoch": 1.089319470699433,
"format_reward": 0.0,
"grad_norm": 15.773272514343262,
"image_reward": 0.23636678010225295,
"kl": 3.1093257188797,
"learning_rate": 5e-06,
"loss": -0.0048,
"reward": 0.6077887773513794,
"reward_std": 0.9942519944161177,
"rewards/reward_func": 0.6077887773513794,
"step": 4610,
"toxic_reward": 4.182659006118774
},
{
"clip_ratio": 0.0,
"completion_length": 45.6,
"epoch": 1.0916824196597354,
"format_reward": -0.25,
"grad_norm": 4.842249870300293,
"image_reward": 0.2884033203125,
"kl": 0.8512112647294998,
"learning_rate": 5e-06,
"loss": 0.0029,
"reward": 0.2840136528015137,
"reward_std": 1.34358575232327,
"rewards/reward_func": 0.2840136528015137,
"step": 4620,
"toxic_reward": 4.202986550331116
},
{
"clip_ratio": 0.0,
"completion_length": 49.0,
"epoch": 1.0940453686200378,
"format_reward": 0.0,
"grad_norm": 12.111598014831543,
"image_reward": 0.26724853515625,
"kl": 0.5160227678716183,
"learning_rate": 5e-06,
"loss": -0.1139,
"reward": 0.41757542341947557,
"reward_std": 0.6193137221038342,
"rewards/reward_func": 0.41757542341947557,
"step": 4630,
"toxic_reward": 3.6429463922977448
},
{
"clip_ratio": 0.0,
"completion_length": 56.475,
"epoch": 1.0964083175803403,
"format_reward": 0.0,
"grad_norm": 5.654786586761475,
"image_reward": 0.2576507568359375,
"kl": 0.6431491911411286,
"learning_rate": 5e-06,
"loss": -0.0209,
"reward": 0.4444656491279602,
"reward_std": 0.8271868824958801,
"rewards/reward_func": 0.4444656491279602,
"step": 4640,
"toxic_reward": 3.751771080493927
},
{
"clip_ratio": 0.0,
"completion_length": 45.225,
"epoch": 1.0987712665406426,
"format_reward": 0.0,
"grad_norm": 3.850701332092285,
"image_reward": 0.25439249724149704,
"kl": 3.444407519698143,
"learning_rate": 5e-06,
"loss": 0.0854,
"reward": 0.643358188867569,
"reward_std": 0.8931491523981094,
"rewards/reward_func": 0.643358188867569,
"step": 4650,
"toxic_reward": 4.321841323375702
},
{
"clip_ratio": 0.0,
"completion_length": 41.575,
"epoch": 1.1011342155009451,
"format_reward": 0.0,
"grad_norm": 6.3553853034973145,
"image_reward": 0.2528656005859375,
"kl": 0.5195316299796104,
"learning_rate": 5e-06,
"loss": -0.0275,
"reward": 0.4554763913154602,
"reward_std": 0.8332011103630066,
"rewards/reward_func": 0.4554763913154602,
"step": 4660,
"toxic_reward": 4.327652913331986
},
{
"clip_ratio": 0.0,
"completion_length": 37.925,
"epoch": 1.1034971644612477,
"format_reward": 0.0,
"grad_norm": 7.020429611206055,
"image_reward": 0.24530232697725296,
"kl": 1.074008372426033,
"learning_rate": 5e-06,
"loss": 0.1332,
"reward": 0.08963438272476196,
"reward_std": 1.1591505765914918,
"rewards/reward_func": 0.08963438272476196,
"step": 4670,
"toxic_reward": 3.8221271514892576
},
{
"clip_ratio": 0.0,
"completion_length": 45.675,
"epoch": 1.10586011342155,
"format_reward": 0.0,
"grad_norm": 1.8656316995620728,
"image_reward": 0.23163909912109376,
"kl": 1.6141413852572442,
"learning_rate": 5e-06,
"loss": 0.0496,
"reward": 0.4311521232128143,
"reward_std": 0.39210873320698736,
"rewards/reward_func": 0.4311521232128143,
"step": 4680,
"toxic_reward": 4.517966604232788
},
{
"clip_ratio": 0.0,
"completion_length": 54.2,
"epoch": 1.1082230623818525,
"format_reward": 0.0,
"grad_norm": 13.881125450134277,
"image_reward": 0.2380462646484375,
"kl": 0.6565065160393715,
"learning_rate": 5e-06,
"loss": -0.0278,
"reward": 0.3582367777824402,
"reward_std": 1.0096068516373635,
"rewards/reward_func": 0.3582367777824402,
"step": 4690,
"toxic_reward": 3.9490260004997255
},
{
"clip_ratio": 0.0,
"completion_length": 37.2,
"epoch": 1.110586011342155,
"format_reward": -0.25,
"grad_norm": 6.552460193634033,
"image_reward": 0.24650166779756547,
"kl": 2.4836434960365295,
"learning_rate": 5e-06,
"loss": -0.055,
"reward": 0.850802743434906,
"reward_std": 1.5017553605139256,
"rewards/reward_func": 0.850802743434906,
"step": 4700,
"toxic_reward": 4.000320458412171
},
{
"clip_ratio": 0.0,
"completion_length": 43.35,
"epoch": 1.1129489603024574,
"format_reward": 0.0,
"grad_norm": 10.16213607788086,
"image_reward": 0.2479156494140625,
"kl": 9.667583072185517,
"learning_rate": 5e-06,
"loss": 0.0035,
"reward": 0.7445069432258606,
"reward_std": 0.7123569492250681,
"rewards/reward_func": 0.7445069432258606,
"step": 4710,
"toxic_reward": 4.515477871894836
},
{
"clip_ratio": 0.0,
"completion_length": 44.825,
"epoch": 1.11531190926276,
"format_reward": 0.0,
"grad_norm": 11.000924110412598,
"image_reward": 0.2601796478033066,
"kl": 1.5108904749155045,
"learning_rate": 5e-06,
"loss": 0.0823,
"reward": 0.5056971669197082,
"reward_std": 0.6825690733268857,
"rewards/reward_func": 0.5056971669197082,
"step": 4720,
"toxic_reward": 4.039038109779358
},
{
"clip_ratio": 0.0,
"completion_length": 44.9,
"epoch": 1.1176748582230625,
"format_reward": -0.25,
"grad_norm": 10.222740173339844,
"image_reward": 0.25692138671875,
"kl": 0.35422504395246507,
"learning_rate": 5e-06,
"loss": 0.0093,
"reward": 0.08535944372415542,
"reward_std": 1.423003512620926,
"rewards/reward_func": 0.08535944372415542,
"step": 4730,
"toxic_reward": 3.4993788480758665
},
{
"clip_ratio": 0.0,
"completion_length": 41.55,
"epoch": 1.1200378071833648,
"format_reward": -0.25,
"grad_norm": 2.3486738204956055,
"image_reward": 0.24800923615694045,
"kl": 0.38842023983597757,
"learning_rate": 5e-06,
"loss": -0.0107,
"reward": 0.1211450919508934,
"reward_std": 1.3706756496801973,
"rewards/reward_func": 0.1211450919508934,
"step": 4740,
"toxic_reward": 3.2782628774642943
},
{
"clip_ratio": 0.0,
"completion_length": 51.325,
"epoch": 1.1224007561436673,
"format_reward": 0.0,
"grad_norm": 4.218822479248047,
"image_reward": 0.251934814453125,
"kl": 1.4006462961435318,
"learning_rate": 5e-06,
"loss": 0.0701,
"reward": 0.9292663365602494,
"reward_std": 0.874046965315938,
"rewards/reward_func": 0.9292663365602494,
"step": 4750,
"toxic_reward": 4.4716246843338014
},
{
"clip_ratio": 0.0,
"completion_length": 53.45,
"epoch": 1.1247637051039698,
"format_reward": 0.0,
"grad_norm": 13.548481941223145,
"image_reward": 0.276861572265625,
"kl": 0.5785035833716392,
"learning_rate": 5e-06,
"loss": -0.0177,
"reward": 0.1986662968993187,
"reward_std": 0.7839731447398662,
"rewards/reward_func": 0.1986662968993187,
"step": 4760,
"toxic_reward": 4.128668719530106
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 1.1271266540642721,
"format_reward": 0.0,
"grad_norm": 10.347504615783691,
"image_reward": 0.2632904052734375,
"kl": 0.28924584165215494,
"learning_rate": 5e-06,
"loss": -0.0536,
"reward": 0.40365022569894793,
"reward_std": 0.6283778937533497,
"rewards/reward_func": 0.40365022569894793,
"step": 4770,
"toxic_reward": 3.78736280053854
},
{
"clip_ratio": 0.0,
"completion_length": 51.775,
"epoch": 1.1294896030245747,
"format_reward": -0.75,
"grad_norm": 29.38702964782715,
"image_reward": 0.27183634638786314,
"kl": 8.101900951564312,
"learning_rate": 5e-06,
"loss": -0.0473,
"reward": -0.11305050253868103,
"reward_std": 2.1815814077854156,
"rewards/reward_func": -0.11305050253868103,
"step": 4780,
"toxic_reward": 3.949468755722046
},
{
"clip_ratio": 0.0,
"completion_length": 48.7,
"epoch": 1.1318525519848772,
"format_reward": -0.75,
"grad_norm": 8.19861125946045,
"image_reward": 0.2731597885489464,
"kl": 5.514032608270645,
"learning_rate": 5e-06,
"loss": 0.0809,
"reward": -0.5878833532333374,
"reward_std": 1.700104326196015,
"rewards/reward_func": -0.5878833532333374,
"step": 4790,
"toxic_reward": 4.362279486656189
},
{
"clip_ratio": 0.0,
"completion_length": 55.575,
"epoch": 1.1342155009451795,
"format_reward": 0.0,
"grad_norm": 25.879568099975586,
"image_reward": 0.272625732421875,
"kl": 0.41564694195985796,
"learning_rate": 5e-06,
"loss": 0.0466,
"reward": 0.5246647775173188,
"reward_std": 0.5603986160829664,
"rewards/reward_func": 0.5246647775173188,
"step": 4800,
"toxic_reward": 4.4845054864883425
},
{
"clip_ratio": 0.0,
"completion_length": 38.175,
"epoch": 1.136578449905482,
"format_reward": -0.25,
"grad_norm": 6.490880966186523,
"image_reward": 0.27078043669462204,
"kl": 0.39700448513031006,
"learning_rate": 5e-06,
"loss": 0.0442,
"reward": 0.10371096134185791,
"reward_std": 1.3051490228623153,
"rewards/reward_func": 0.10371096134185791,
"step": 4810,
"toxic_reward": 4.362593126296997
},
{
"clip_ratio": 0.0,
"completion_length": 42.025,
"epoch": 1.1389413988657844,
"format_reward": 0.0,
"grad_norm": 10.680285453796387,
"image_reward": 0.2447296142578125,
"kl": 0.43964013159275056,
"learning_rate": 5e-06,
"loss": -0.1088,
"reward": 0.43211621046066284,
"reward_std": 0.5677682287991047,
"rewards/reward_func": 0.43211621046066284,
"step": 4820,
"toxic_reward": 4.520205068588257
},
{
"clip_ratio": 0.0,
"completion_length": 34.225,
"epoch": 1.141304347826087,
"format_reward": -0.5,
"grad_norm": 7.846988201141357,
"image_reward": 0.24936320036649703,
"kl": 0.3737114042043686,
"learning_rate": 5e-06,
"loss": 0.1324,
"reward": -0.08488219976425171,
"reward_std": 1.6377468653023244,
"rewards/reward_func": -0.08488219976425171,
"step": 4830,
"toxic_reward": 3.979211616516113
},
{
"clip_ratio": 0.0,
"completion_length": 45.95,
"epoch": 1.1436672967863895,
"format_reward": -0.25,
"grad_norm": 13.332221031188965,
"image_reward": 0.27100830078125,
"kl": 0.39419813454151154,
"learning_rate": 5e-06,
"loss": -0.0752,
"reward": 0.9029350757598877,
"reward_std": 1.455178501456976,
"rewards/reward_func": 0.9029350757598877,
"step": 4840,
"toxic_reward": 3.6434609413146974
},
{
"clip_ratio": 0.0,
"completion_length": 45.125,
"epoch": 1.146030245746692,
"format_reward": 0.0,
"grad_norm": 3.2025651931762695,
"image_reward": 0.281640625,
"kl": 7.567617936432361,
"learning_rate": 5e-06,
"loss": -0.0926,
"reward": 0.7164658069610595,
"reward_std": 0.6624833345413208,
"rewards/reward_func": 0.7164658069610595,
"step": 4850,
"toxic_reward": 3.8413574934005736
},
{
"clip_ratio": 0.0,
"completion_length": 48.975,
"epoch": 1.1483931947069943,
"format_reward": 0.0,
"grad_norm": 9.695226669311523,
"image_reward": 0.24990997314453126,
"kl": 0.28165399581193923,
"learning_rate": 5e-06,
"loss": 0.0188,
"reward": -0.12237508296966552,
"reward_std": 0.6198875203728675,
"rewards/reward_func": -0.12237508296966552,
"step": 4860,
"toxic_reward": 4.485203766822815
},
{
"clip_ratio": 0.0,
"completion_length": 40.45,
"epoch": 1.1507561436672968,
"format_reward": 0.0,
"grad_norm": 2.5677099227905273,
"image_reward": 0.2783660888671875,
"kl": 0.29035804942250254,
"learning_rate": 5e-06,
"loss": -0.0618,
"reward": 0.050651901960372926,
"reward_std": 1.2044988840818405,
"rewards/reward_func": 0.050651901960372926,
"step": 4870,
"toxic_reward": 3.6688124537467957
},
{
"clip_ratio": 0.0,
"completion_length": 51.9,
"epoch": 1.1531190926275992,
"format_reward": 0.0,
"grad_norm": 4.83213996887207,
"image_reward": 0.2756866455078125,
"kl": 5.533606587722898,
"learning_rate": 5e-06,
"loss": -0.1798,
"reward": 0.5692965686321259,
"reward_std": 1.0450827227905393,
"rewards/reward_func": 0.5692965686321259,
"step": 4880,
"toxic_reward": 3.439787745475769
},
{
"clip_ratio": 0.0,
"completion_length": 42.375,
"epoch": 1.1554820415879017,
"format_reward": 0.0,
"grad_norm": 1.0500257015228271,
"image_reward": 0.258331298828125,
"kl": 0.11663263067603111,
"learning_rate": 5e-06,
"loss": 0.0311,
"reward": 0.4637997090816498,
"reward_std": 0.9648044936358928,
"rewards/reward_func": 0.4637997090816498,
"step": 4890,
"toxic_reward": 4.382505106925964
},
{
"clip_ratio": 0.0,
"completion_length": 46.5,
"epoch": 1.1578449905482042,
"format_reward": 0.0,
"grad_norm": 7.189860820770264,
"image_reward": 0.2593902587890625,
"kl": 0.15962190218269826,
"learning_rate": 5e-06,
"loss": 0.0056,
"reward": 0.5262487173080445,
"reward_std": 1.052651860564947,
"rewards/reward_func": 0.5262487173080445,
"step": 4900,
"toxic_reward": 4.166365385055542
},
{
"clip_ratio": 0.0,
"completion_length": 48.575,
"epoch": 1.1602079395085065,
"format_reward": 0.0,
"grad_norm": 1.5136041641235352,
"image_reward": 0.277166748046875,
"kl": 0.14849806036800145,
"learning_rate": 5e-06,
"loss": -0.0034,
"reward": 0.22453336119651796,
"reward_std": 0.5165121786296367,
"rewards/reward_func": 0.22453336119651796,
"step": 4910,
"toxic_reward": 3.900139307975769
},
{
"clip_ratio": 0.0,
"completion_length": 47.55,
"epoch": 1.162570888468809,
"format_reward": 0.0,
"grad_norm": 0.6905107498168945,
"image_reward": 0.26757049560546875,
"kl": 0.17389641776680947,
"learning_rate": 5e-06,
"loss": 0.0273,
"reward": 0.2769235372543335,
"reward_std": 0.8026977114379406,
"rewards/reward_func": 0.2769235372543335,
"step": 4920,
"toxic_reward": 4.421657228469849
},
{
"clip_ratio": 0.0,
"completion_length": 41.9,
"epoch": 1.1649338374291116,
"format_reward": 0.0,
"grad_norm": 0.7038688063621521,
"image_reward": 0.23498077392578126,
"kl": 0.1468098048120737,
"learning_rate": 5e-06,
"loss": -0.0593,
"reward": 0.6022326171398162,
"reward_std": 0.8370201224461198,
"rewards/reward_func": 0.6022326171398162,
"step": 4930,
"toxic_reward": 4.272796273231506
},
{
"clip_ratio": 0.0,
"completion_length": 43.575,
"epoch": 1.167296786389414,
"format_reward": 0.0,
"grad_norm": 3.3997626304626465,
"image_reward": 0.2218317672610283,
"kl": 10.733999550715088,
"learning_rate": 5e-06,
"loss": -0.0239,
"reward": 0.5406073331832886,
"reward_std": 1.1294488459825516,
"rewards/reward_func": 0.5406073331832886,
"step": 4940,
"toxic_reward": 4.010496520996094
},
{
"clip_ratio": 0.0,
"completion_length": 42.75,
"epoch": 1.1696597353497165,
"format_reward": 0.0,
"grad_norm": 13.437244415283203,
"image_reward": 0.260540771484375,
"kl": 0.4533839326351881,
"learning_rate": 5e-06,
"loss": 0.0308,
"reward": 0.6349693357944488,
"reward_std": 0.9300125196576119,
"rewards/reward_func": 0.6349693357944488,
"step": 4950,
"toxic_reward": 3.9825836658477782
},
{
"clip_ratio": 0.0,
"completion_length": 48.275,
"epoch": 1.172022684310019,
"format_reward": 0.0,
"grad_norm": 5.643482208251953,
"image_reward": 0.25181121826171876,
"kl": 0.7526591405272484,
"learning_rate": 5e-06,
"loss": 0.0702,
"reward": 0.2168402910232544,
"reward_std": 0.8874317653477192,
"rewards/reward_func": 0.2168402910232544,
"step": 4960,
"toxic_reward": 4.36525526046753
},
{
"clip_ratio": 0.0,
"completion_length": 52.125,
"epoch": 1.1743856332703213,
"format_reward": -0.25,
"grad_norm": 2.1135120391845703,
"image_reward": 0.27721354067325593,
"kl": 1.7936469875276089,
"learning_rate": 5e-06,
"loss": 0.0616,
"reward": -0.168658310174942,
"reward_std": 1.076946148276329,
"rewards/reward_func": -0.168658310174942,
"step": 4970,
"toxic_reward": 4.423034191131592
},
{
"clip_ratio": 0.0,
"completion_length": 41.2,
"epoch": 1.1767485822306238,
"format_reward": 0.0,
"grad_norm": 1.0600641965866089,
"image_reward": 0.22789459228515624,
"kl": 3.3630725659430025,
"learning_rate": 5e-06,
"loss": 0.0228,
"reward": 0.7056062936782836,
"reward_std": 0.9683291807770729,
"rewards/reward_func": 0.7056062936782836,
"step": 4980,
"toxic_reward": 4.235305881500244
},
{
"clip_ratio": 0.0,
"completion_length": 42.025,
"epoch": 1.1791115311909262,
"format_reward": -0.25,
"grad_norm": 1.4251501560211182,
"image_reward": 0.25722147673368456,
"kl": 0.2127727370709181,
"learning_rate": 5e-06,
"loss": -0.0369,
"reward": 0.7460228025913238,
"reward_std": 1.3902123406529427,
"rewards/reward_func": 0.7460228025913238,
"step": 4990,
"toxic_reward": 4.187235593795776
},
{
"clip_ratio": 0.0,
"completion_length": 43.85,
"epoch": 1.1814744801512287,
"format_reward": 0.0,
"grad_norm": 0.9059237837791443,
"image_reward": 0.275982666015625,
"kl": 0.1094449780881405,
"learning_rate": 5e-06,
"loss": 0.0122,
"reward": -0.013343071937561036,
"reward_std": 0.8927877993322909,
"rewards/reward_func": -0.013343071937561036,
"step": 5000,
"toxic_reward": 4.172649383544922
},
{
"clip_ratio": 0.0,
"completion_length": 54.475,
"epoch": 1.1838374291115312,
"format_reward": -0.25,
"grad_norm": 0.676426887512207,
"image_reward": 0.25437113344669343,
"kl": 0.306893527135253,
"learning_rate": 5e-06,
"loss": 0.0597,
"reward": 0.5460815012454987,
"reward_std": 1.3148551121354104,
"rewards/reward_func": 0.5460815012454987,
"step": 5010,
"toxic_reward": 4.169591236114502
},
{
"clip_ratio": 0.0,
"completion_length": 41.5,
"epoch": 1.1862003780718338,
"format_reward": -0.25,
"grad_norm": 1.0359044075012207,
"image_reward": 0.25831960141658783,
"kl": 0.11788953803479671,
"learning_rate": 5e-06,
"loss": 0.0199,
"reward": 0.008247452974319457,
"reward_std": 1.6603192906826734,
"rewards/reward_func": 0.008247452974319457,
"step": 5020,
"toxic_reward": 4.008079314231873
},
{
"clip_ratio": 0.0,
"completion_length": 41.7,
"epoch": 1.188563327032136,
"format_reward": -0.25,
"grad_norm": 4.407492637634277,
"image_reward": 0.25005086213350297,
"kl": 0.16296980381011963,
"learning_rate": 5e-06,
"loss": 0.013,
"reward": 0.45398043394088744,
"reward_std": 1.4666540574282407,
"rewards/reward_func": 0.45398043394088744,
"step": 5030,
"toxic_reward": 3.9480291843414306
},
{
"clip_ratio": 0.0,
"completion_length": 45.05,
"epoch": 1.1909262759924386,
"format_reward": -0.25,
"grad_norm": 1.3405718803405762,
"image_reward": 0.26083475798368455,
"kl": 1.2570629265159368,
"learning_rate": 5e-06,
"loss": -0.0608,
"reward": 0.24062097072601318,
"reward_std": 1.1102397807873785,
"rewards/reward_func": 0.24062097072601318,
"step": 5040,
"toxic_reward": 4.282037019729614
},
{
"clip_ratio": 0.0,
"completion_length": 52.475,
"epoch": 1.193289224952741,
"format_reward": -0.25,
"grad_norm": 1.0421810150146484,
"image_reward": 0.2481842041015625,
"kl": 0.4886137153953314,
"learning_rate": 5e-06,
"loss": 0.0154,
"reward": 0.35102577805519103,
"reward_std": 1.4176109634339809,
"rewards/reward_func": 0.35102577805519103,
"step": 5050,
"toxic_reward": 4.56660737991333
},
{
"clip_ratio": 0.0,
"completion_length": 49.775,
"epoch": 1.1956521739130435,
"format_reward": 0.0,
"grad_norm": 2.4514474868774414,
"image_reward": 0.2724589020013809,
"kl": 17.261842382885515,
"learning_rate": 5e-06,
"loss": 0.0799,
"reward": 0.2634397208690643,
"reward_std": 0.6655941482633352,
"rewards/reward_func": 0.2634397208690643,
"step": 5060,
"toxic_reward": 4.50599045753479
},
{
"clip_ratio": 0.0,
"completion_length": 42.675,
"epoch": 1.198015122873346,
"format_reward": -0.25,
"grad_norm": 4.386458396911621,
"image_reward": 0.2706837967038155,
"kl": 0.41296282410621643,
"learning_rate": 5e-06,
"loss": -0.0407,
"reward": 0.3763133823871613,
"reward_std": 1.3990098256617785,
"rewards/reward_func": 0.3763133823871613,
"step": 5070,
"toxic_reward": 3.8180208444595336
},
{
"clip_ratio": 0.0,
"completion_length": 43.55,
"epoch": 1.2003780718336483,
"format_reward": 0.0,
"grad_norm": 3.708019495010376,
"image_reward": 0.2523040771484375,
"kl": 0.13607071787118913,
"learning_rate": 5e-06,
"loss": -0.0338,
"reward": 0.09913046360015869,
"reward_std": 0.64256557286717,
"rewards/reward_func": 0.09913046360015869,
"step": 5080,
"toxic_reward": 4.473843407630921
},
{
"clip_ratio": 0.0,
"completion_length": 47.85,
"epoch": 1.2027410207939508,
"format_reward": 0.0,
"grad_norm": 0.9288604855537415,
"image_reward": 0.24779205322265624,
"kl": 0.20878240577876567,
"learning_rate": 5e-06,
"loss": 0.041,
"reward": 0.5819396436214447,
"reward_std": 0.7615427184849978,
"rewards/reward_func": 0.5819396436214447,
"step": 5090,
"toxic_reward": 4.673109149932861
},
{
"clip_ratio": 0.0,
"completion_length": 46.825,
"epoch": 1.2051039697542534,
"format_reward": -0.75,
"grad_norm": 1.3514373302459717,
"image_reward": 0.24433186948299407,
"kl": 0.2980830356478691,
"learning_rate": 5e-06,
"loss": -0.0794,
"reward": -0.4275161147117615,
"reward_std": 2.30497971996665,
"rewards/reward_func": -0.4275161147117615,
"step": 5100,
"toxic_reward": 4.221112084388733
},
{
"clip_ratio": 0.0,
"completion_length": 34.775,
"epoch": 1.2074669187145557,
"format_reward": 0.0,
"grad_norm": 1.3809269666671753,
"image_reward": 0.2739410400390625,
"kl": 4.718816532939672,
"learning_rate": 5e-06,
"loss": -0.1045,
"reward": 0.23022666573524475,
"reward_std": 0.9735932052135468,
"rewards/reward_func": 0.23022666573524475,
"step": 5110,
"toxic_reward": 3.8542242765426638
},
{
"clip_ratio": 0.0,
"completion_length": 51.125,
"epoch": 1.2098298676748582,
"format_reward": -0.75,
"grad_norm": 1.045753836631775,
"image_reward": 0.243878173828125,
"kl": 0.15604666136205197,
"learning_rate": 5e-06,
"loss": 0.0351,
"reward": -0.7461395561695099,
"reward_std": 2.103620085120201,
"rewards/reward_func": -0.7461395561695099,
"step": 5120,
"toxic_reward": 4.052614498138428
},
{
"clip_ratio": 0.0,
"completion_length": 37.3,
"epoch": 1.2121928166351608,
"format_reward": 0.0,
"grad_norm": 2.4709815979003906,
"image_reward": 0.25801239013671873,
"kl": 0.1505513045936823,
"learning_rate": 5e-06,
"loss": -0.073,
"reward": 0.789186455309391,
"reward_std": 1.0413845662027597,
"rewards/reward_func": 0.789186455309391,
"step": 5130,
"toxic_reward": 3.886520874500275
},
{
"clip_ratio": 0.0,
"completion_length": 38.575,
"epoch": 1.214555765595463,
"format_reward": 0.0,
"grad_norm": 3.3710083961486816,
"image_reward": 0.2646331787109375,
"kl": 0.1121824998408556,
"learning_rate": 5e-06,
"loss": 0.1077,
"reward": 0.6361587151885033,
"reward_std": 0.6423972092568875,
"rewards/reward_func": 0.6361587151885033,
"step": 5140,
"toxic_reward": 4.181642347574234
},
{
"clip_ratio": 0.0,
"completion_length": 51.625,
"epoch": 1.2169187145557656,
"format_reward": 0.0,
"grad_norm": 1.4757941961288452,
"image_reward": 0.2700469970703125,
"kl": 0.808637504093349,
"learning_rate": 5e-06,
"loss": 0.0188,
"reward": 0.2734032437205315,
"reward_std": 0.8411962412297725,
"rewards/reward_func": 0.2734032437205315,
"step": 5150,
"toxic_reward": 3.6458971202373505
},
{
"clip_ratio": 0.0,
"completion_length": 51.725,
"epoch": 1.2192816635160681,
"format_reward": -0.25,
"grad_norm": 0.66521817445755,
"image_reward": 0.2667388916015625,
"kl": 0.4425561033189297,
"learning_rate": 5e-06,
"loss": -0.0435,
"reward": 0.35035309493541716,
"reward_std": 1.6248657763004304,
"rewards/reward_func": 0.35035309493541716,
"step": 5160,
"toxic_reward": 3.7190463662147524
},
{
"clip_ratio": 0.0,
"completion_length": 53.475,
"epoch": 1.2216446124763705,
"format_reward": 0.0,
"grad_norm": 8.705077171325684,
"image_reward": 0.259429931640625,
"kl": 0.7811562133952975,
"learning_rate": 5e-06,
"loss": 0.0244,
"reward": 0.3771729826927185,
"reward_std": 1.326733610033989,
"rewards/reward_func": 0.3771729826927185,
"step": 5170,
"toxic_reward": 3.6760028123855593
},
{
"clip_ratio": 0.0,
"completion_length": 46.975,
"epoch": 1.224007561436673,
"format_reward": 0.0,
"grad_norm": 14.75157356262207,
"image_reward": 0.22822214663028717,
"kl": 45.12481062971055,
"learning_rate": 5e-06,
"loss": -0.0908,
"reward": 0.6647323310375214,
"reward_std": 1.0134072445333004,
"rewards/reward_func": 0.6647323310375214,
"step": 5180,
"toxic_reward": 4.210642290115357
},
{
"clip_ratio": 0.0,
"completion_length": 50.4,
"epoch": 1.2263705103969755,
"format_reward": -0.25,
"grad_norm": 13.71264934539795,
"image_reward": 0.23838348388671876,
"kl": 0.2953592788428068,
"learning_rate": 5e-06,
"loss": -0.0801,
"reward": 0.4181412994861603,
"reward_std": 1.2768891528248787,
"rewards/reward_func": 0.4181412994861603,
"step": 5190,
"toxic_reward": 4.502067589759827
},
{
"clip_ratio": 0.0,
"completion_length": 55.625,
"epoch": 1.2287334593572778,
"format_reward": 0.0,
"grad_norm": 7.2092390060424805,
"image_reward": 0.2596160888671875,
"kl": 0.15071408227086067,
"learning_rate": 5e-06,
"loss": 0.0359,
"reward": 0.46884081363677976,
"reward_std": 0.9004301078617573,
"rewards/reward_func": 0.46884081363677976,
"step": 5200,
"toxic_reward": 4.537938523292541
},
{
"clip_ratio": 0.0,
"completion_length": 37.55,
"epoch": 1.2310964083175804,
"format_reward": 0.0,
"grad_norm": 1.4807243347167969,
"image_reward": 0.25061492919921874,
"kl": 0.39518592432141303,
"learning_rate": 5e-06,
"loss": 0.0062,
"reward": 0.7565897464752197,
"reward_std": 0.6514241144061088,
"rewards/reward_func": 0.7565897464752197,
"step": 5210,
"toxic_reward": 4.779706335067749
},
{
"clip_ratio": 0.0,
"completion_length": 37.15,
"epoch": 1.2334593572778827,
"format_reward": 0.0,
"grad_norm": 12.918940544128418,
"image_reward": 0.266143798828125,
"kl": 1.6246791556477547,
"learning_rate": 5e-06,
"loss": -0.0669,
"reward": 1.0145411103963853,
"reward_std": 0.7731746513396501,
"rewards/reward_func": 1.0145411103963853,
"step": 5220,
"toxic_reward": 3.9364122271537783
},
{
"clip_ratio": 0.0,
"completion_length": 47.8,
"epoch": 1.2358223062381852,
"format_reward": -0.5,
"grad_norm": 8.1648530960083,
"image_reward": 0.256298828125,
"kl": 0.3491713672876358,
"learning_rate": 5e-06,
"loss": -0.0294,
"reward": 0.18980904817581176,
"reward_std": 1.4395622819662095,
"rewards/reward_func": 0.18980904817581176,
"step": 5230,
"toxic_reward": 4.15175496339798
},
{
"clip_ratio": 0.0,
"completion_length": 43.675,
"epoch": 1.2381852551984878,
"format_reward": -0.25,
"grad_norm": 7.493502140045166,
"image_reward": 0.2956329345703125,
"kl": 3.9262495055794715,
"learning_rate": 5e-06,
"loss": 0.0129,
"reward": 0.19967559576034546,
"reward_std": 1.4724704299122096,
"rewards/reward_func": 0.19967559576034546,
"step": 5240,
"toxic_reward": 3.676086974143982
},
{
"clip_ratio": 0.0,
"completion_length": 47.95,
"epoch": 1.24054820415879,
"format_reward": 0.0,
"grad_norm": 7.836026668548584,
"image_reward": 0.262060546875,
"kl": 0.5677594847977161,
"learning_rate": 5e-06,
"loss": -0.0137,
"reward": 1.0836671590805054,
"reward_std": 0.9185017041862011,
"rewards/reward_func": 1.0836671590805054,
"step": 5250,
"toxic_reward": 4.442173409461975
},
{
"clip_ratio": 0.0,
"completion_length": 35.925,
"epoch": 1.2429111531190926,
"format_reward": -0.25,
"grad_norm": 17.290130615234375,
"image_reward": 0.256195068359375,
"kl": 0.3261503577232361,
"learning_rate": 5e-06,
"loss": -0.0665,
"reward": 0.4270883619785309,
"reward_std": 1.5899662226438522,
"rewards/reward_func": 0.4270883619785309,
"step": 5260,
"toxic_reward": 3.6384164452552796
},
{
"clip_ratio": 0.0,
"completion_length": 39.425,
"epoch": 1.2452741020793952,
"format_reward": 0.0,
"grad_norm": 19.655460357666016,
"image_reward": 0.271282958984375,
"kl": 1.0409250572323798,
"learning_rate": 5e-06,
"loss": -0.0411,
"reward": 0.7604422211647034,
"reward_std": 0.6456888254731894,
"rewards/reward_func": 0.7604422211647034,
"step": 5270,
"toxic_reward": 3.7677977979183197
},
{
"clip_ratio": 0.0,
"completion_length": 59.775,
"epoch": 1.2476370510396975,
"format_reward": -0.25,
"grad_norm": 0.5878366827964783,
"image_reward": 0.2589070647954941,
"kl": 0.16051149740815163,
"learning_rate": 5e-06,
"loss": 0.0852,
"reward": 0.39556344896554946,
"reward_std": 1.1551922081038355,
"rewards/reward_func": 0.39556344896554946,
"step": 5280,
"toxic_reward": 3.390644001960754
},
{
"clip_ratio": 0.0,
"completion_length": 42.275,
"epoch": 1.25,
"format_reward": 0.0,
"grad_norm": 8.206055641174316,
"image_reward": 0.275860595703125,
"kl": 0.4969006285071373,
"learning_rate": 5e-06,
"loss": -0.115,
"reward": 0.4857667863368988,
"reward_std": 0.8739027962088585,
"rewards/reward_func": 0.4857667863368988,
"step": 5290,
"toxic_reward": 4.016790902614593
},
{
"clip_ratio": 0.0,
"completion_length": 48.7,
"epoch": 1.2523629489603025,
"format_reward": 0.0,
"grad_norm": 3.513704299926758,
"image_reward": 0.24937744140625,
"kl": 0.14417755380272865,
"learning_rate": 5e-06,
"loss": 0.0317,
"reward": 0.48732776641845704,
"reward_std": 0.8942459903657436,
"rewards/reward_func": 0.48732776641845704,
"step": 5300,
"toxic_reward": 4.074605274200439
},
{
"clip_ratio": 0.0,
"completion_length": 46.7,
"epoch": 1.2547258979206048,
"format_reward": 0.0,
"grad_norm": 3.694108724594116,
"image_reward": 0.2640960693359375,
"kl": 0.21989786028862,
"learning_rate": 5e-06,
"loss": 0.0552,
"reward": 0.20011116266250611,
"reward_std": 0.9783342686016112,
"rewards/reward_func": 0.20011116266250611,
"step": 5310,
"toxic_reward": 3.337161436676979
},
{
"clip_ratio": 0.0,
"completion_length": 61.15,
"epoch": 1.2570888468809074,
"format_reward": -0.25,
"grad_norm": 1.8417941331863403,
"image_reward": 0.23155619353055953,
"kl": 4.248336365818977,
"learning_rate": 5e-06,
"loss": 0.0877,
"reward": 0.23556498885154725,
"reward_std": 0.9007356996648014,
"rewards/reward_func": 0.23556498885154725,
"step": 5320,
"toxic_reward": 4.5890906810760494
},
{
"clip_ratio": 0.0,
"completion_length": 50.475,
"epoch": 1.2594517958412097,
"format_reward": 0.0,
"grad_norm": 4.823044300079346,
"image_reward": 0.2513310745358467,
"kl": 4.037289990484714,
"learning_rate": 5e-06,
"loss": 0.082,
"reward": 0.8450765609741211,
"reward_std": 0.8255521267652511,
"rewards/reward_func": 0.8450765609741211,
"step": 5330,
"toxic_reward": 4.287392568588257
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 1.2618147448015122,
"format_reward": 0.0,
"grad_norm": 4.315946578979492,
"image_reward": 0.2459075927734375,
"kl": 0.3113373316824436,
"learning_rate": 5e-06,
"loss": 0.0475,
"reward": 0.645756970345974,
"reward_std": 0.7255122657865286,
"rewards/reward_func": 0.645756970345974,
"step": 5340,
"toxic_reward": 4.189401495456695
},
{
"clip_ratio": 0.0,
"completion_length": 39.7,
"epoch": 1.2641776937618148,
"format_reward": 0.0,
"grad_norm": 3.4027810096740723,
"image_reward": 0.2551483154296875,
"kl": 0.4323126286268234,
"learning_rate": 5e-06,
"loss": -0.0048,
"reward": 0.5660954803228379,
"reward_std": 0.6791210256516933,
"rewards/reward_func": 0.5660954803228379,
"step": 5350,
"toxic_reward": 3.2965795576572416
},
{
"clip_ratio": 0.0,
"completion_length": 35.55,
"epoch": 1.2665406427221173,
"format_reward": 0.0,
"grad_norm": 1.9737337827682495,
"image_reward": 0.2604766845703125,
"kl": 0.8922965943813324,
"learning_rate": 5e-06,
"loss": 0.0886,
"reward": 0.37445068359375,
"reward_std": 0.7902419693768025,
"rewards/reward_func": 0.37445068359375,
"step": 5360,
"toxic_reward": 3.6073597192764284
},
{
"clip_ratio": 0.0,
"completion_length": 50.975,
"epoch": 1.2689035916824196,
"format_reward": -0.25,
"grad_norm": 5.368748188018799,
"image_reward": 0.2573964446783066,
"kl": 0.8937133550643921,
"learning_rate": 5e-06,
"loss": 0.0345,
"reward": 1.1729332506656647,
"reward_std": 1.3139135614037514,
"rewards/reward_func": 1.1729332506656647,
"step": 5370,
"toxic_reward": 4.428536581993103
},
{
"clip_ratio": 0.0,
"completion_length": 47.375,
"epoch": 1.2712665406427222,
"format_reward": 0.0,
"grad_norm": 2.3669607639312744,
"image_reward": 0.242498779296875,
"kl": 0.6131832510232925,
"learning_rate": 5e-06,
"loss": -0.1065,
"reward": 0.3124019861221313,
"reward_std": 0.8398781210184098,
"rewards/reward_func": 0.3124019861221313,
"step": 5380,
"toxic_reward": 3.9513532400131224
},
{
"clip_ratio": 0.0,
"completion_length": 62.325,
"epoch": 1.2736294896030245,
"format_reward": 0.0,
"grad_norm": 3.6428773403167725,
"image_reward": 0.2564788818359375,
"kl": 0.983223095536232,
"learning_rate": 5e-06,
"loss": -0.0332,
"reward": 0.8021515548229218,
"reward_std": 0.8680705142207443,
"rewards/reward_func": 0.8021515548229218,
"step": 5390,
"toxic_reward": 3.7623249292373657
},
{
"clip_ratio": 0.0,
"completion_length": 41.375,
"epoch": 1.275992438563327,
"format_reward": 0.0,
"grad_norm": 22.341930389404297,
"image_reward": 0.25401458740234373,
"kl": 0.7686945527791977,
"learning_rate": 5e-06,
"loss": 0.021,
"reward": 0.18261390328407287,
"reward_std": 0.39404432671144607,
"rewards/reward_func": 0.18261390328407287,
"step": 5400,
"toxic_reward": 3.986022639274597
},
{
"clip_ratio": 0.0,
"completion_length": 43.075,
"epoch": 1.2783553875236295,
"format_reward": -0.5,
"grad_norm": 11.878053665161133,
"image_reward": 0.24981587678194045,
"kl": 0.4886711150407791,
"learning_rate": 5e-06,
"loss": -0.0214,
"reward": 0.06913218498229981,
"reward_std": 1.5105109971016646,
"rewards/reward_func": 0.06913218498229981,
"step": 5410,
"toxic_reward": 4.118505048751831
},
{
"clip_ratio": 0.0,
"completion_length": 43.65,
"epoch": 1.280718336483932,
"format_reward": -0.25,
"grad_norm": 7.851999759674072,
"image_reward": 0.25881449431180953,
"kl": 0.6457854598760605,
"learning_rate": 5e-06,
"loss": 0.0958,
"reward": 0.12992151379585265,
"reward_std": 1.303325356543064,
"rewards/reward_func": 0.12992151379585265,
"step": 5420,
"toxic_reward": 4.377404046058655
},
{
"clip_ratio": 0.0,
"completion_length": 39.9,
"epoch": 1.2830812854442344,
"format_reward": -0.25,
"grad_norm": 25.7547550201416,
"image_reward": 0.2694793701171875,
"kl": 1.677524197101593,
"learning_rate": 5e-06,
"loss": 0.0859,
"reward": 0.34250465631484983,
"reward_std": 1.0538076907396317,
"rewards/reward_func": 0.34250465631484983,
"step": 5430,
"toxic_reward": 4.271343016624451
},
{
"clip_ratio": 0.0,
"completion_length": 46.875,
"epoch": 1.285444234404537,
"format_reward": 0.0,
"grad_norm": 1.9158964157104492,
"image_reward": 0.2666290283203125,
"kl": 0.5966441169381141,
"learning_rate": 5e-06,
"loss": -0.0566,
"reward": 0.4593892157077789,
"reward_std": 0.6576637156307698,
"rewards/reward_func": 0.4593892157077789,
"step": 5440,
"toxic_reward": 4.204905700683594
},
{
"clip_ratio": 0.0,
"completion_length": 47.525,
"epoch": 1.2878071833648392,
"format_reward": 0.0,
"grad_norm": 1.6007134914398193,
"image_reward": 0.2621429443359375,
"kl": 1.1984394997358323,
"learning_rate": 5e-06,
"loss": -0.0486,
"reward": 0.25084500312805175,
"reward_std": 1.5825427711009978,
"rewards/reward_func": 0.25084500312805175,
"step": 5450,
"toxic_reward": 3.685545027256012
},
{
"clip_ratio": 0.0,
"completion_length": 38.45,
"epoch": 1.2901701323251418,
"format_reward": -0.25,
"grad_norm": 25.30818748474121,
"image_reward": 0.2647552490234375,
"kl": 6.056701734662056,
"learning_rate": 5e-06,
"loss": 0.0599,
"reward": 0.25077282190322875,
"reward_std": 1.234234382212162,
"rewards/reward_func": 0.25077282190322875,
"step": 5460,
"toxic_reward": 4.593598937988281
},
{
"clip_ratio": 0.0,
"completion_length": 43.825,
"epoch": 1.2925330812854443,
"format_reward": 0.0,
"grad_norm": 16.34868812561035,
"image_reward": 0.2645416259765625,
"kl": 1.6588621526956557,
"learning_rate": 5e-06,
"loss": 0.0054,
"reward": 0.5463581264019013,
"reward_std": 0.7020838841795921,
"rewards/reward_func": 0.5463581264019013,
"step": 5470,
"toxic_reward": 4.367759561538696
},
{
"clip_ratio": 0.0,
"completion_length": 57.5,
"epoch": 1.2948960302457466,
"format_reward": 0.0,
"grad_norm": 19.92365264892578,
"image_reward": 0.235223388671875,
"kl": 4.412905436754227,
"learning_rate": 5e-06,
"loss": -0.0277,
"reward": 0.301082968711853,
"reward_std": 0.5573954021558165,
"rewards/reward_func": 0.301082968711853,
"step": 5480,
"toxic_reward": 4.563822269439697
},
{
"clip_ratio": 0.0,
"completion_length": 48.45,
"epoch": 1.2972589792060492,
"format_reward": -0.25,
"grad_norm": 15.039924621582031,
"image_reward": 0.2686960846185684,
"kl": 1.0792655169963836,
"learning_rate": 5e-06,
"loss": -0.0051,
"reward": -0.10340776294469833,
"reward_std": 1.0451893661171199,
"rewards/reward_func": -0.10340776294469833,
"step": 5490,
"toxic_reward": 3.9914595246315003
},
{
"clip_ratio": 0.0,
"completion_length": 49.675,
"epoch": 1.2996219281663515,
"format_reward": 0.0,
"grad_norm": 20.686878204345703,
"image_reward": 0.236834716796875,
"kl": 0.6705092936754227,
"learning_rate": 5e-06,
"loss": -0.0657,
"reward": 0.7118561029434204,
"reward_std": 0.6682203419506549,
"rewards/reward_func": 0.7118561029434204,
"step": 5500,
"toxic_reward": 4.56234884262085
},
{
"clip_ratio": 0.0,
"completion_length": 45.45,
"epoch": 1.301984877126654,
"format_reward": -0.25,
"grad_norm": 9.817633628845215,
"image_reward": 0.2553232818841934,
"kl": 0.9995080977678299,
"learning_rate": 5e-06,
"loss": -0.1257,
"reward": 0.27432467341423034,
"reward_std": 1.3118387231603266,
"rewards/reward_func": 0.27432467341423034,
"step": 5510,
"toxic_reward": 4.407331418991089
},
{
"clip_ratio": 0.0,
"completion_length": 51.6,
"epoch": 1.3043478260869565,
"format_reward": 0.0,
"grad_norm": 11.929862976074219,
"image_reward": 0.2562835693359375,
"kl": 11.621018621325494,
"learning_rate": 5e-06,
"loss": 0.0747,
"reward": 0.29978330433368683,
"reward_std": 0.5768878096714616,
"rewards/reward_func": 0.29978330433368683,
"step": 5520,
"toxic_reward": 3.9843369722366333
},
{
"clip_ratio": 0.0,
"completion_length": 44.375,
"epoch": 1.306710775047259,
"format_reward": -0.5,
"grad_norm": 54.56308364868164,
"image_reward": 0.25370279848575594,
"kl": 15.30106150507927,
"learning_rate": 5e-06,
"loss": 0.059,
"reward": -0.3965187072753906,
"reward_std": 1.5167736381292343,
"rewards/reward_func": -0.3965187072753906,
"step": 5530,
"toxic_reward": 4.435292959213257
},
{
"clip_ratio": 0.0,
"completion_length": 50.35,
"epoch": 1.3090737240075614,
"format_reward": 0.0,
"grad_norm": 3.027195930480957,
"image_reward": 0.30064697265625,
"kl": 1.0911591410636903,
"learning_rate": 5e-06,
"loss": -0.0513,
"reward": 0.3132080137729645,
"reward_std": 0.5429811116307974,
"rewards/reward_func": 0.3132080137729645,
"step": 5540,
"toxic_reward": 4.454129576683044
},
{
"clip_ratio": 0.0,
"completion_length": 42.2,
"epoch": 1.311436672967864,
"format_reward": 0.0,
"grad_norm": 14.916865348815918,
"image_reward": 0.2515268951654434,
"kl": 5.700361841917038,
"learning_rate": 5e-06,
"loss": 0.0014,
"reward": 0.014350098371505738,
"reward_std": 0.48063138537108896,
"rewards/reward_func": 0.014350098371505738,
"step": 5550,
"toxic_reward": 4.399230480194092
},
{
"clip_ratio": 0.0,
"completion_length": 43.975,
"epoch": 1.3137996219281662,
"format_reward": -0.25,
"grad_norm": 17.978458404541016,
"image_reward": 0.24458109587430954,
"kl": 4.160827812552452,
"learning_rate": 5e-06,
"loss": -0.0048,
"reward": 0.29164408445358275,
"reward_std": 0.9224476981908083,
"rewards/reward_func": 0.29164408445358275,
"step": 5560,
"toxic_reward": 4.361080431938172
},
{
"clip_ratio": 0.0,
"completion_length": 42.7,
"epoch": 1.3161625708884688,
"format_reward": 0.0,
"grad_norm": 2.0734810829162598,
"image_reward": 0.248583984375,
"kl": 2.431508493423462,
"learning_rate": 5e-06,
"loss": -0.0346,
"reward": 0.49899758100509645,
"reward_std": 0.9045591181144118,
"rewards/reward_func": 0.49899758100509645,
"step": 5570,
"toxic_reward": 4.267354512214661
},
{
"clip_ratio": 0.0,
"completion_length": 51.675,
"epoch": 1.3185255198487713,
"format_reward": -0.25,
"grad_norm": 7.6361165046691895,
"image_reward": 0.27701314240694047,
"kl": 1.473704105615616,
"learning_rate": 5e-06,
"loss": 0.0156,
"reward": 0.5548757612705231,
"reward_std": 0.9885425483807921,
"rewards/reward_func": 0.5548757612705231,
"step": 5580,
"toxic_reward": 4.615298962593078
},
{
"clip_ratio": 0.0,
"completion_length": 45.35,
"epoch": 1.3208884688090738,
"format_reward": 0.0,
"grad_norm": 8.7377290725708,
"image_reward": 0.2538330078125,
"kl": 197.4472616136074,
"learning_rate": 5e-06,
"loss": 0.1633,
"reward": 0.6491668224334717,
"reward_std": 0.6371353514492512,
"rewards/reward_func": 0.6491668224334717,
"step": 5590,
"toxic_reward": 4.438857316970825
},
{
"clip_ratio": 0.0,
"completion_length": 47.575,
"epoch": 1.3232514177693762,
"format_reward": 0.0,
"grad_norm": 16.95717430114746,
"image_reward": 0.23577117919921875,
"kl": 3.680394399166107,
"learning_rate": 5e-06,
"loss": -0.0084,
"reward": 0.1440478801727295,
"reward_std": 0.4425256311893463,
"rewards/reward_func": 0.1440478801727295,
"step": 5600,
"toxic_reward": 4.603517079353333
},
{
"clip_ratio": 0.0,
"completion_length": 55.25,
"epoch": 1.3256143667296787,
"format_reward": 0.0,
"grad_norm": 12.732531547546387,
"image_reward": 0.2568023681640625,
"kl": 2.256978714466095,
"learning_rate": 5e-06,
"loss": 0.0219,
"reward": 0.040336894989013675,
"reward_std": 0.6101976454257965,
"rewards/reward_func": 0.040336894989013675,
"step": 5610,
"toxic_reward": 4.473554587364196
},
{
"clip_ratio": 0.0,
"completion_length": 50.75,
"epoch": 1.327977315689981,
"format_reward": 0.0,
"grad_norm": 14.701716423034668,
"image_reward": 0.252197265625,
"kl": 8.072559344768525,
"learning_rate": 5e-06,
"loss": 0.0803,
"reward": 0.8639614999294281,
"reward_std": 1.0928052112460136,
"rewards/reward_func": 0.8639614999294281,
"step": 5620,
"toxic_reward": 3.9843992233276366
},
{
"clip_ratio": 0.0,
"completion_length": 46.8,
"epoch": 1.3303402646502835,
"format_reward": 0.0,
"grad_norm": 10.12986946105957,
"image_reward": 0.27263641357421875,
"kl": 3.3592694640159606,
"learning_rate": 5e-06,
"loss": 0.0652,
"reward": 0.7886571228504181,
"reward_std": 1.0737986475229264,
"rewards/reward_func": 0.7886571228504181,
"step": 5630,
"toxic_reward": 3.668324041366577
},
{
"clip_ratio": 0.0,
"completion_length": 37.95,
"epoch": 1.332703213610586,
"format_reward": 0.0,
"grad_norm": 23.82711410522461,
"image_reward": 0.2702301025390625,
"kl": 12.466990399360657,
"learning_rate": 5e-06,
"loss": -0.0401,
"reward": 0.7557259559631347,
"reward_std": 0.9376067817211151,
"rewards/reward_func": 0.7557259559631347,
"step": 5640,
"toxic_reward": 4.100273895263672
},
{
"clip_ratio": 0.0,
"completion_length": 46.625,
"epoch": 1.3350661625708884,
"format_reward": 0.0,
"grad_norm": 7.783689975738525,
"image_reward": 0.2736572265625,
"kl": 9.604325413703918,
"learning_rate": 5e-06,
"loss": 0.0135,
"reward": 0.21887901425361633,
"reward_std": 0.41371094444766643,
"rewards/reward_func": 0.21887901425361633,
"step": 5650,
"toxic_reward": 4.314438569545746
},
{
"clip_ratio": 0.0,
"completion_length": 54.125,
"epoch": 1.337429111531191,
"format_reward": 0.0,
"grad_norm": 8.773420333862305,
"image_reward": 0.2513946533203125,
"kl": 9.296885073184967,
"learning_rate": 5e-06,
"loss": -0.095,
"reward": 1.1378837168216704,
"reward_std": 0.818750386312604,
"rewards/reward_func": 1.1378837168216704,
"step": 5660,
"toxic_reward": 4.522894716262817
},
{
"clip_ratio": 0.0,
"completion_length": 50.9,
"epoch": 1.3397920604914935,
"format_reward": -0.25,
"grad_norm": 14.245692253112793,
"image_reward": 0.24806925505399705,
"kl": 6.753875517845154,
"learning_rate": 5e-06,
"loss": 0.021,
"reward": 0.6894584268331527,
"reward_std": 1.542804090678692,
"rewards/reward_func": 0.6894584268331527,
"step": 5670,
"toxic_reward": 3.9723486423492433
},
{
"clip_ratio": 0.0,
"completion_length": 45.075,
"epoch": 1.3421550094517958,
"format_reward": -0.25,
"grad_norm": 24.408653259277344,
"image_reward": 0.25640462189912794,
"kl": 11.201071047782898,
"learning_rate": 5e-06,
"loss": -0.0492,
"reward": -0.00045427381992340087,
"reward_std": 1.027926566079259,
"rewards/reward_func": -0.00045427381992340087,
"step": 5680,
"toxic_reward": 3.891611325740814
},
{
"clip_ratio": 0.0,
"completion_length": 44.65,
"epoch": 1.3445179584120983,
"format_reward": 0.0,
"grad_norm": 1.8934930562973022,
"image_reward": 0.25365397036075593,
"kl": 4.947464096546173,
"learning_rate": 5e-06,
"loss": -0.0784,
"reward": 0.5487861603498458,
"reward_std": 0.7702463563531637,
"rewards/reward_func": 0.5487861603498458,
"step": 5690,
"toxic_reward": 3.977373069524765
},
{
"clip_ratio": 0.0,
"completion_length": 49.5,
"epoch": 1.3468809073724008,
"format_reward": -0.25,
"grad_norm": 9.322084426879883,
"image_reward": 0.27449544221162797,
"kl": 2.6174604117870333,
"learning_rate": 5e-06,
"loss": -0.0094,
"reward": 0.39208410382270814,
"reward_std": 1.61805320084095,
"rewards/reward_func": 0.39208410382270814,
"step": 5700,
"toxic_reward": 4.174729800224304
},
{
"clip_ratio": 0.0,
"completion_length": 37.25,
"epoch": 1.3492438563327032,
"format_reward": 0.0,
"grad_norm": 12.4689302444458,
"image_reward": 0.2571044921875,
"kl": 3.102731728553772,
"learning_rate": 5e-06,
"loss": -0.0306,
"reward": 0.24812114238739014,
"reward_std": 0.6699782099574805,
"rewards/reward_func": 0.24812114238739014,
"step": 5710,
"toxic_reward": 3.692942750453949
},
{
"clip_ratio": 0.0,
"completion_length": 42.425,
"epoch": 1.3516068052930057,
"format_reward": 0.0,
"grad_norm": 16.464384078979492,
"image_reward": 0.2592987060546875,
"kl": 41.42341262102127,
"learning_rate": 5e-06,
"loss": -0.1787,
"reward": 0.9784101039171219,
"reward_std": 1.2197245783172548,
"rewards/reward_func": 0.9784101039171219,
"step": 5720,
"toxic_reward": 3.5939176797866823
},
{
"clip_ratio": 0.0,
"completion_length": 41.775,
"epoch": 1.353969754253308,
"format_reward": 0.0,
"grad_norm": 14.272177696228027,
"image_reward": 0.24337158203125,
"kl": 3.5139986366033553,
"learning_rate": 5e-06,
"loss": -0.0502,
"reward": 0.3250808596611023,
"reward_std": 0.6109479434788228,
"rewards/reward_func": 0.3250808596611023,
"step": 5730,
"toxic_reward": 4.485757279396057
},
{
"clip_ratio": 0.0,
"completion_length": 48.6,
"epoch": 1.3563327032136105,
"format_reward": -0.25,
"grad_norm": 8.131665229797363,
"image_reward": 0.2514506012201309,
"kl": 5.592804127931595,
"learning_rate": 5e-06,
"loss": -0.015,
"reward": 0.3052162408828735,
"reward_std": 1.201428510248661,
"rewards/reward_func": 0.3052162408828735,
"step": 5740,
"toxic_reward": 4.2779217004776005
},
{
"clip_ratio": 0.0,
"completion_length": 44.275,
"epoch": 1.358695652173913,
"format_reward": -0.5,
"grad_norm": 15.648195266723633,
"image_reward": 0.266064453125,
"kl": 1.6513773769140243,
"learning_rate": 5e-06,
"loss": 0.0092,
"reward": -0.20032901763916017,
"reward_std": 1.7222102746367454,
"rewards/reward_func": -0.20032901763916017,
"step": 5750,
"toxic_reward": 4.259865856170654
},
{
"clip_ratio": 0.0,
"completion_length": 56.0,
"epoch": 1.3610586011342156,
"format_reward": 0.0,
"grad_norm": 10.893685340881348,
"image_reward": 0.2588653564453125,
"kl": 4.073341834545135,
"learning_rate": 5e-06,
"loss": 0.013,
"reward": 0.916795802116394,
"reward_std": 0.8524092853069305,
"rewards/reward_func": 0.916795802116394,
"step": 5760,
"toxic_reward": 4.560049152374267
},
{
"clip_ratio": 0.0,
"completion_length": 52.75,
"epoch": 1.363421550094518,
"format_reward": 0.0,
"grad_norm": 3.932856798171997,
"image_reward": 0.2459381103515625,
"kl": 2.5305844336748122,
"learning_rate": 5e-06,
"loss": 0.0338,
"reward": 0.5017880856990814,
"reward_std": 0.7364757396280766,
"rewards/reward_func": 0.5017880856990814,
"step": 5770,
"toxic_reward": 4.69781801700592
},
{
"clip_ratio": 0.0,
"completion_length": 50.45,
"epoch": 1.3657844990548205,
"format_reward": 0.0,
"grad_norm": 1.3704707622528076,
"image_reward": 0.2677764892578125,
"kl": 1.8369466960430145,
"learning_rate": 5e-06,
"loss": 0.107,
"reward": 0.7046410620212555,
"reward_std": 0.9321951523423195,
"rewards/reward_func": 0.7046410620212555,
"step": 5780,
"toxic_reward": 4.073530220985413
},
{
"clip_ratio": 0.0,
"completion_length": 45.8,
"epoch": 1.3681474480151228,
"format_reward": -0.25,
"grad_norm": 2.7950003147125244,
"image_reward": 0.268048095703125,
"kl": 3.3737578272819517,
"learning_rate": 5e-06,
"loss": 0.0251,
"reward": 0.1121946096420288,
"reward_std": 1.2333336278796196,
"rewards/reward_func": 0.1121946096420288,
"step": 5790,
"toxic_reward": 4.301294279098511
},
{
"clip_ratio": 0.0,
"completion_length": 44.9,
"epoch": 1.3705103969754253,
"format_reward": -0.25,
"grad_norm": 10.600517272949219,
"image_reward": 0.24301045686006545,
"kl": 5.24166065454483,
"learning_rate": 5e-06,
"loss": -0.0188,
"reward": -0.04525191783905029,
"reward_std": 1.1580330106429755,
"rewards/reward_func": -0.04525191783905029,
"step": 5800,
"toxic_reward": 3.95846186876297
},
{
"clip_ratio": 0.0,
"completion_length": 52.6,
"epoch": 1.3728733459357279,
"format_reward": -0.5,
"grad_norm": 22.423450469970703,
"image_reward": 0.25951487123966216,
"kl": 12.250067234039307,
"learning_rate": 5e-06,
"loss": 0.0091,
"reward": 0.0036635279655456545,
"reward_std": 1.6421116095036268,
"rewards/reward_func": 0.0036635279655456545,
"step": 5810,
"toxic_reward": 4.407265400886535
},
{
"clip_ratio": 0.0,
"completion_length": 45.825,
"epoch": 1.3752362948960302,
"format_reward": 0.0,
"grad_norm": 3.5839083194732666,
"image_reward": 0.2497711181640625,
"kl": 7.638963532447815,
"learning_rate": 5e-06,
"loss": -0.0623,
"reward": 0.36217689514160156,
"reward_std": 1.057050895690918,
"rewards/reward_func": 0.36217689514160156,
"step": 5820,
"toxic_reward": 3.8359474897384644
},
{
"clip_ratio": 0.0,
"completion_length": 42.075,
"epoch": 1.3775992438563327,
"format_reward": 0.0,
"grad_norm": 18.257360458374023,
"image_reward": 0.242034912109375,
"kl": 1406.6461040258407,
"learning_rate": 5e-06,
"loss": 0.3409,
"reward": 0.35478733479976654,
"reward_std": 0.5706452172249555,
"rewards/reward_func": 0.35478733479976654,
"step": 5830,
"toxic_reward": 3.5973093271255494
},
{
"clip_ratio": 0.0,
"completion_length": 39.55,
"epoch": 1.3799621928166352,
"format_reward": 0.0,
"grad_norm": 4.36010217666626,
"image_reward": 0.24814300537109374,
"kl": 117.68144319057464,
"learning_rate": 5e-06,
"loss": 0.0637,
"reward": 0.3609376668930054,
"reward_std": 0.6294937739614397,
"rewards/reward_func": 0.3609376668930054,
"step": 5840,
"toxic_reward": 4.1664423704147335
},
{
"clip_ratio": 0.0,
"completion_length": 38.65,
"epoch": 1.3823251417769375,
"format_reward": 0.0,
"grad_norm": 14.234587669372559,
"image_reward": 0.254736328125,
"kl": 4.648911118507385,
"learning_rate": 5e-06,
"loss": -0.0132,
"reward": -0.4629164904356003,
"reward_std": 0.8635658169165253,
"rewards/reward_func": -0.4629164904356003,
"step": 5850,
"toxic_reward": 3.804247868061066
},
{
"clip_ratio": 0.0,
"completion_length": 41.825,
"epoch": 1.38468809073724,
"format_reward": -0.25,
"grad_norm": 9.249091148376465,
"image_reward": 0.25889790803194046,
"kl": 8.51909922361374,
"learning_rate": 5e-06,
"loss": 0.0656,
"reward": 0.20021165013313294,
"reward_std": 1.1463583020493389,
"rewards/reward_func": 0.20021165013313294,
"step": 5860,
"toxic_reward": 4.298932027816773
},
{
"clip_ratio": 0.0,
"completion_length": 47.2,
"epoch": 1.3870510396975426,
"format_reward": 0.0,
"grad_norm": 2.6423728466033936,
"image_reward": 0.2569976806640625,
"kl": 114.30452468395234,
"learning_rate": 5e-06,
"loss": 0.0235,
"reward": 0.7682538509368897,
"reward_std": 0.9905061937868596,
"rewards/reward_func": 0.7682538509368897,
"step": 5870,
"toxic_reward": 4.355731654167175
},
{
"clip_ratio": 0.0,
"completion_length": 57.75,
"epoch": 1.389413988657845,
"format_reward": 0.0,
"grad_norm": 6.193624496459961,
"image_reward": 0.244573974609375,
"kl": 27.17574143409729,
"learning_rate": 5e-06,
"loss": 0.0466,
"reward": 0.4181258499622345,
"reward_std": 0.7019964678213,
"rewards/reward_func": 0.4181258499622345,
"step": 5880,
"toxic_reward": 4.30544638633728
},
{
"clip_ratio": 0.0,
"completion_length": 51.875,
"epoch": 1.3917769376181475,
"format_reward": 0.0,
"grad_norm": 14.119263648986816,
"image_reward": 0.24458719789981842,
"kl": 22.515019488334655,
"learning_rate": 5e-06,
"loss": 0.022,
"reward": 0.2917088523507118,
"reward_std": 0.7304708318784833,
"rewards/reward_func": 0.2917088523507118,
"step": 5890,
"toxic_reward": 4.011698079109192
},
{
"clip_ratio": 0.0,
"completion_length": 49.95,
"epoch": 1.3941398865784498,
"format_reward": 0.0,
"grad_norm": 203.53358459472656,
"image_reward": 0.266705322265625,
"kl": 130.7643344759941,
"learning_rate": 5e-06,
"loss": -0.0925,
"reward": 0.8561399459838868,
"reward_std": 0.7673989269882441,
"rewards/reward_func": 0.8561399459838868,
"step": 5900,
"toxic_reward": 4.3426886081695555
},
{
"clip_ratio": 0.0,
"completion_length": 45.25,
"epoch": 1.3965028355387523,
"format_reward": 0.0,
"grad_norm": 1.8555150032043457,
"image_reward": 0.26331787109375,
"kl": 25.51781210899353,
"learning_rate": 5e-06,
"loss": -0.0759,
"reward": 0.14676390141248702,
"reward_std": 0.31099242605268956,
"rewards/reward_func": 0.14676390141248702,
"step": 5910,
"toxic_reward": 4.393804085254669
},
{
"clip_ratio": 0.0,
"completion_length": 44.375,
"epoch": 1.3988657844990549,
"format_reward": 0.0,
"grad_norm": 1.794382095336914,
"image_reward": 0.2638519287109375,
"kl": 75.26498790383339,
"learning_rate": 5e-06,
"loss": -0.0105,
"reward": 0.0749910295009613,
"reward_std": 0.9545040905475617,
"rewards/reward_func": 0.0749910295009613,
"step": 5920,
"toxic_reward": 4.043283843994141
},
{
"clip_ratio": 0.0,
"completion_length": 43.6,
"epoch": 1.4012287334593574,
"format_reward": 0.0,
"grad_norm": 10.70461654663086,
"image_reward": 0.26204325407743456,
"kl": 6.424372181296349,
"learning_rate": 5e-06,
"loss": -0.0192,
"reward": 0.3424019992351532,
"reward_std": 0.8532586313784123,
"rewards/reward_func": 0.3424019992351532,
"step": 5930,
"toxic_reward": 3.697358027100563
},
{
"clip_ratio": 0.0,
"completion_length": 47.7,
"epoch": 1.4035916824196597,
"format_reward": 0.0,
"grad_norm": 15.443364143371582,
"image_reward": 0.25388997346162795,
"kl": 2.7157889783382414,
"learning_rate": 5e-06,
"loss": 0.0194,
"reward": 0.8944644808769227,
"reward_std": 0.907353313267231,
"rewards/reward_func": 0.8944644808769227,
"step": 5940,
"toxic_reward": 4.2960577487945555
},
{
"clip_ratio": 0.0,
"completion_length": 46.625,
"epoch": 1.4059546313799622,
"format_reward": 0.0,
"grad_norm": 9.80057144165039,
"image_reward": 0.2643798828125,
"kl": 3.2323968172073365,
"learning_rate": 5e-06,
"loss": -0.0714,
"reward": 0.7592375218868256,
"reward_std": 1.0486908692866563,
"rewards/reward_func": 0.7592375218868256,
"step": 5950,
"toxic_reward": 4.236204934120178
},
{
"clip_ratio": 0.0,
"completion_length": 47.45,
"epoch": 1.4083175803402646,
"format_reward": 0.0,
"grad_norm": 32.608253479003906,
"image_reward": 0.2897979736328125,
"kl": 0.868326199054718,
"learning_rate": 5e-06,
"loss": 0.0072,
"reward": 0.13819260597229005,
"reward_std": 0.9927060969173909,
"rewards/reward_func": 0.13819260597229005,
"step": 5960,
"toxic_reward": 4.137164163589477
},
{
"clip_ratio": 0.0,
"completion_length": 49.875,
"epoch": 1.410680529300567,
"format_reward": -0.5,
"grad_norm": 3.4970862865448,
"image_reward": 0.254620361328125,
"kl": 1.8707860291004181,
"learning_rate": 5e-06,
"loss": -0.0251,
"reward": -0.12371634542942048,
"reward_std": 1.6602010667324065,
"rewards/reward_func": -0.12371634542942048,
"step": 5970,
"toxic_reward": 4.560637950897217
},
{
"clip_ratio": 0.0,
"completion_length": 47.675,
"epoch": 1.4130434782608696,
"format_reward": 0.0,
"grad_norm": 18.951919555664062,
"image_reward": 0.258807373046875,
"kl": 1.7996377795934677,
"learning_rate": 5e-06,
"loss": 0.012,
"reward": 0.7075730919837951,
"reward_std": 0.9400279764086008,
"rewards/reward_func": 0.7075730919837951,
"step": 5980,
"toxic_reward": 3.758779287338257
},
{
"clip_ratio": 0.0,
"completion_length": 44.025,
"epoch": 1.4154064272211722,
"format_reward": 0.0,
"grad_norm": 5.1872663497924805,
"image_reward": 0.2968048095703125,
"kl": 4.290337887406349,
"learning_rate": 5e-06,
"loss": -0.0076,
"reward": 0.3689495801925659,
"reward_std": 0.5776140118017793,
"rewards/reward_func": 0.3689495801925659,
"step": 5990,
"toxic_reward": 4.157499670982361
},
{
"clip_ratio": 0.0,
"completion_length": 50.225,
"epoch": 1.4177693761814745,
"format_reward": 0.0,
"grad_norm": 4.382224082946777,
"image_reward": 0.24061279296875,
"kl": 3.217728292942047,
"learning_rate": 5e-06,
"loss": -0.0161,
"reward": 0.37828874588012695,
"reward_std": 0.3327252045273781,
"rewards/reward_func": 0.37828874588012695,
"step": 6000,
"toxic_reward": 4.625493478775025
},
{
"clip_ratio": 0.0,
"completion_length": 40.625,
"epoch": 1.420132325141777,
"format_reward": 0.0,
"grad_norm": 17.742074966430664,
"image_reward": 0.260565185546875,
"kl": 3.087987443804741,
"learning_rate": 5e-06,
"loss": -0.0098,
"reward": 0.22000501453876495,
"reward_std": 0.6759357416536659,
"rewards/reward_func": 0.22000501453876495,
"step": 6010,
"toxic_reward": 4.1144504189491276
},
{
"clip_ratio": 0.0,
"completion_length": 51.725,
"epoch": 1.4224952741020793,
"format_reward": -0.25,
"grad_norm": 23.140647888183594,
"image_reward": 0.2770843505859375,
"kl": 1.3970532178878785,
"learning_rate": 5e-06,
"loss": -0.0249,
"reward": -0.045973950624465944,
"reward_std": 1.367066621594131,
"rewards/reward_func": -0.045973950624465944,
"step": 6020,
"toxic_reward": 4.268449664115906
},
{
"clip_ratio": 0.0,
"completion_length": 52.125,
"epoch": 1.4248582230623819,
"format_reward": -0.5,
"grad_norm": 63.63026428222656,
"image_reward": 0.2500905364751816,
"kl": 1.5568452209234238,
"learning_rate": 5e-06,
"loss": 0.032,
"reward": 0.36329651772975924,
"reward_std": 2.2665354389697314,
"rewards/reward_func": 0.36329651772975924,
"step": 6030,
"toxic_reward": 3.9466104745864867
},
{
"clip_ratio": 0.0,
"completion_length": 52.15,
"epoch": 1.4272211720226844,
"format_reward": -0.25,
"grad_norm": 3.4662349224090576,
"image_reward": 0.2569427490234375,
"kl": 2.127922511100769,
"learning_rate": 5e-06,
"loss": 0.0478,
"reward": 0.4287997782230377,
"reward_std": 1.335706689953804,
"rewards/reward_func": 0.4287997782230377,
"step": 6040,
"toxic_reward": 4.417151093482971
},
{
"clip_ratio": 0.0,
"completion_length": 51.775,
"epoch": 1.4295841209829867,
"format_reward": 0.0,
"grad_norm": 6.905588626861572,
"image_reward": 0.2496734619140625,
"kl": 2.1467004269361496,
"learning_rate": 5e-06,
"loss": -0.1361,
"reward": 0.6416638314723968,
"reward_std": 0.6212250446900726,
"rewards/reward_func": 0.6416638314723968,
"step": 6050,
"toxic_reward": 4.481846666336059
},
{
"clip_ratio": 0.0,
"completion_length": 44.45,
"epoch": 1.4319470699432892,
"format_reward": 0.0,
"grad_norm": 7.034083843231201,
"image_reward": 0.2545166015625,
"kl": 1.5230970159173012,
"learning_rate": 5e-06,
"loss": -0.0407,
"reward": 0.29611208438873293,
"reward_std": 0.7949410590808839,
"rewards/reward_func": 0.29611208438873293,
"step": 6060,
"toxic_reward": 4.335282778739929
},
{
"clip_ratio": 0.0,
"completion_length": 45.75,
"epoch": 1.4343100189035916,
"format_reward": 0.0,
"grad_norm": 3.7928450107574463,
"image_reward": 0.24527740478515625,
"kl": 0.8901469498872757,
"learning_rate": 5e-06,
"loss": -0.0367,
"reward": 0.26578280329704285,
"reward_std": 1.3428313750773668,
"rewards/reward_func": 0.26578280329704285,
"step": 6070,
"toxic_reward": 3.388633108139038
},
{
"clip_ratio": 0.0,
"completion_length": 69.625,
"epoch": 1.436672967863894,
"format_reward": -0.25,
"grad_norm": 19.84122085571289,
"image_reward": 0.2548517853021622,
"kl": 1.0234291791915893,
"learning_rate": 5e-06,
"loss": 0.1725,
"reward": 0.31002968549728394,
"reward_std": 1.8640546321868896,
"rewards/reward_func": 0.31002968549728394,
"step": 6080,
"toxic_reward": 3.8593465805053713
},
{
"clip_ratio": 0.0,
"completion_length": 50.6,
"epoch": 1.4390359168241966,
"format_reward": -0.25,
"grad_norm": 10.626410484313965,
"image_reward": 0.2676523834466934,
"kl": 5.208069609105587,
"learning_rate": 5e-06,
"loss": 0.0794,
"reward": 0.2428498387336731,
"reward_std": 1.3197494292631746,
"rewards/reward_func": 0.2428498387336731,
"step": 6090,
"toxic_reward": 4.562512469291687
},
{
"clip_ratio": 0.0,
"completion_length": 43.8,
"epoch": 1.4413988657844992,
"format_reward": 0.0,
"grad_norm": 11.22333812713623,
"image_reward": 0.2790537506341934,
"kl": 4.089116859436035,
"learning_rate": 5e-06,
"loss": 0.0053,
"reward": 1.0973919004201889,
"reward_std": 0.9867459360510111,
"rewards/reward_func": 1.0973919004201889,
"step": 6100,
"toxic_reward": 4.121850895881653
},
{
"clip_ratio": 0.0,
"completion_length": 40.675,
"epoch": 1.4437618147448015,
"format_reward": 0.0,
"grad_norm": 2.594348907470703,
"image_reward": 0.24527740478515625,
"kl": 2.355099043250084,
"learning_rate": 5e-06,
"loss": 0.0302,
"reward": 0.2879053592681885,
"reward_std": 1.3369514867663383,
"rewards/reward_func": 0.2879053592681885,
"step": 6110,
"toxic_reward": 3.482189404964447
},
{
"clip_ratio": 0.0,
"completion_length": 43.625,
"epoch": 1.446124763705104,
"format_reward": 0.0,
"grad_norm": 2.740832805633545,
"image_reward": 0.27071533203125,
"kl": 1.8061291784048081,
"learning_rate": 5e-06,
"loss": 0.098,
"reward": 0.3818982481956482,
"reward_std": 0.8427915960550308,
"rewards/reward_func": 0.3818982481956482,
"step": 6120,
"toxic_reward": 4.095608282089233
},
{
"clip_ratio": 0.0,
"completion_length": 45.05,
"epoch": 1.4484877126654063,
"format_reward": 0.0,
"grad_norm": 16.700410842895508,
"image_reward": 0.2543426513671875,
"kl": 1.4419916868209839,
"learning_rate": 5e-06,
"loss": -0.056,
"reward": 0.8265678405761718,
"reward_std": 0.835081409662962,
"rewards/reward_func": 0.8265678405761718,
"step": 6130,
"toxic_reward": 4.317971038818359
},
{
"clip_ratio": 0.0,
"completion_length": 39.75,
"epoch": 1.4508506616257089,
"format_reward": 0.0,
"grad_norm": 5.467940330505371,
"image_reward": 0.253302001953125,
"kl": 1.128901758790016,
"learning_rate": 5e-06,
"loss": -0.0416,
"reward": 0.22405808568000793,
"reward_std": 0.430261270259507,
"rewards/reward_func": 0.22405808568000793,
"step": 6140,
"toxic_reward": 4.605859112739563
},
{
"clip_ratio": 0.0,
"completion_length": 43.7,
"epoch": 1.4532136105860114,
"format_reward": 0.0,
"grad_norm": 15.90230941772461,
"image_reward": 0.261627197265625,
"kl": 0.6474134013056755,
"learning_rate": 5e-06,
"loss": -0.0453,
"reward": 0.23209627866744995,
"reward_std": 0.9918515108525753,
"rewards/reward_func": 0.23209627866744995,
"step": 6150,
"toxic_reward": 4.06419689655304
},
{
"clip_ratio": 0.0,
"completion_length": 52.85,
"epoch": 1.455576559546314,
"format_reward": 0.0,
"grad_norm": 14.443000793457031,
"image_reward": 0.2465087890625,
"kl": 0.6866413161158562,
"learning_rate": 5e-06,
"loss": -0.0889,
"reward": 0.37001847475767136,
"reward_std": 0.7742633601650596,
"rewards/reward_func": 0.37001847475767136,
"step": 6160,
"toxic_reward": 4.076632690429688
},
{
"clip_ratio": 0.0,
"completion_length": 46.775,
"epoch": 1.4579395085066162,
"format_reward": -0.25,
"grad_norm": 16.315828323364258,
"image_reward": 0.27055562287569046,
"kl": 1.8430037647485733,
"learning_rate": 5e-06,
"loss": -0.005,
"reward": 0.37416398525238037,
"reward_std": 1.238390678167343,
"rewards/reward_func": 0.37416398525238037,
"step": 6170,
"toxic_reward": 3.7685179471969605
},
{
"clip_ratio": 0.0,
"completion_length": 51.725,
"epoch": 1.4603024574669188,
"format_reward": -0.25,
"grad_norm": 9.808785438537598,
"image_reward": 0.26315511018037796,
"kl": 3.501179130375385,
"learning_rate": 5e-06,
"loss": -0.0757,
"reward": 0.4839250385761261,
"reward_std": 1.0200102254748344,
"rewards/reward_func": 0.4839250385761261,
"step": 6180,
"toxic_reward": 4.458306789398193
},
{
"clip_ratio": 0.0,
"completion_length": 45.1,
"epoch": 1.462665406427221,
"format_reward": 0.0,
"grad_norm": 6.254196643829346,
"image_reward": 0.2431121826171875,
"kl": 0.6629411533474923,
"learning_rate": 5e-06,
"loss": -0.0243,
"reward": 1.0291070997714997,
"reward_std": 0.6561918726190925,
"rewards/reward_func": 1.0291070997714997,
"step": 6190,
"toxic_reward": 4.277180218696595
},
{
"clip_ratio": 0.0,
"completion_length": 41.075,
"epoch": 1.4650283553875236,
"format_reward": 0.0,
"grad_norm": 4.544678211212158,
"image_reward": 0.288299560546875,
"kl": 0.5533515185117721,
"learning_rate": 5e-06,
"loss": 0.0114,
"reward": 0.09159567654132843,
"reward_std": 0.6166084105148911,
"rewards/reward_func": 0.09159567654132843,
"step": 6200,
"toxic_reward": 4.60030083656311
},
{
"clip_ratio": 0.0,
"completion_length": 49.75,
"epoch": 1.4673913043478262,
"format_reward": 0.0,
"grad_norm": 2.3730123043060303,
"image_reward": 0.2605743408203125,
"kl": 0.9568765789270401,
"learning_rate": 5e-06,
"loss": 0.0904,
"reward": 0.7965957373380661,
"reward_std": 0.7220977865159511,
"rewards/reward_func": 0.7965957373380661,
"step": 6210,
"toxic_reward": 3.7931410372257233
},
{
"clip_ratio": 0.0,
"completion_length": 43.05,
"epoch": 1.4697542533081285,
"format_reward": 0.0,
"grad_norm": 9.046610832214355,
"image_reward": 0.25125885009765625,
"kl": 0.966689832508564,
"learning_rate": 5e-06,
"loss": 0.006,
"reward": 0.42708381414413454,
"reward_std": 0.8918632004410029,
"rewards/reward_func": 0.42708381414413454,
"step": 6220,
"toxic_reward": 3.811506199836731
},
{
"clip_ratio": 0.0,
"completion_length": 41.225,
"epoch": 1.472117202268431,
"format_reward": -0.25,
"grad_norm": 11.783949851989746,
"image_reward": 0.2802093505859375,
"kl": 1.5058857083320618,
"learning_rate": 5e-06,
"loss": 0.1229,
"reward": -0.2064610540866852,
"reward_std": 1.2439106579869985,
"rewards/reward_func": -0.2064610540866852,
"step": 6230,
"toxic_reward": 3.776739251613617
},
{
"clip_ratio": 0.0,
"completion_length": 51.55,
"epoch": 1.4744801512287333,
"format_reward": -0.25,
"grad_norm": 19.82095718383789,
"image_reward": 0.2695220947265625,
"kl": 4.01448056101799,
"learning_rate": 5e-06,
"loss": 0.0224,
"reward": 0.12153833210468293,
"reward_std": 1.4096519321203231,
"rewards/reward_func": 0.12153833210468293,
"step": 6240,
"toxic_reward": 3.5601022720336912
},
{
"clip_ratio": 0.0,
"completion_length": 53.575,
"epoch": 1.4768431001890359,
"format_reward": 0.0,
"grad_norm": 3.002202272415161,
"image_reward": 0.2589111328125,
"kl": 6.391136825084686,
"learning_rate": 5e-06,
"loss": 0.0246,
"reward": 0.8984602272510529,
"reward_std": 0.8823632273823023,
"rewards/reward_func": 0.8984602272510529,
"step": 6250,
"toxic_reward": 4.186536240577698
},
{
"clip_ratio": 0.0,
"completion_length": 41.95,
"epoch": 1.4792060491493384,
"format_reward": -0.25,
"grad_norm": 6.6309990882873535,
"image_reward": 0.25638376772403715,
"kl": 5.093664228916168,
"learning_rate": 5e-06,
"loss": -0.1168,
"reward": 0.7955404102802277,
"reward_std": 1.139578907750547,
"rewards/reward_func": 0.7955404102802277,
"step": 6260,
"toxic_reward": 4.489496183395386
},
{
"clip_ratio": 0.0,
"completion_length": 36.475,
"epoch": 1.481568998109641,
"format_reward": -0.25,
"grad_norm": 6.48809289932251,
"image_reward": 0.24294535368680953,
"kl": 2.03928547501564,
"learning_rate": 5e-06,
"loss": -0.0415,
"reward": 0.3108412384986877,
"reward_std": 1.2936087466776371,
"rewards/reward_func": 0.3108412384986877,
"step": 6270,
"toxic_reward": 3.8573724269866942
},
{
"clip_ratio": 0.0,
"completion_length": 44.025,
"epoch": 1.4839319470699432,
"format_reward": 0.0,
"grad_norm": 12.869200706481934,
"image_reward": 0.269140625,
"kl": 2.209321880340576,
"learning_rate": 5e-06,
"loss": 0.0065,
"reward": 1.0746480822563171,
"reward_std": 0.9140975341200829,
"rewards/reward_func": 1.0746480822563171,
"step": 6280,
"toxic_reward": 4.498701477050782
},
{
"clip_ratio": 0.0,
"completion_length": 52.6,
"epoch": 1.4862948960302458,
"format_reward": -0.25,
"grad_norm": 14.994149208068848,
"image_reward": 0.2699289947748184,
"kl": 1.982865560054779,
"learning_rate": 5e-06,
"loss": -0.0155,
"reward": -0.04395916759967804,
"reward_std": 1.351198247075081,
"rewards/reward_func": -0.04395916759967804,
"step": 6290,
"toxic_reward": 4.100183129310608
},
{
"clip_ratio": 0.0,
"completion_length": 48.525,
"epoch": 1.488657844990548,
"format_reward": 0.0,
"grad_norm": 13.48897647857666,
"image_reward": 0.2738677978515625,
"kl": 73.90320363342762,
"learning_rate": 5e-06,
"loss": 0.0515,
"reward": 0.2645439743995667,
"reward_std": 0.8113596703857183,
"rewards/reward_func": 0.2645439743995667,
"step": 6300,
"toxic_reward": 4.189095830917358
},
{
"clip_ratio": 0.0,
"completion_length": 39.85,
"epoch": 1.4910207939508506,
"format_reward": -0.25,
"grad_norm": 3.184455394744873,
"image_reward": 0.2667582184076309,
"kl": 1.9667763262987137,
"learning_rate": 5e-06,
"loss": -0.0049,
"reward": 0.582335239648819,
"reward_std": 1.003530977293849,
"rewards/reward_func": 0.582335239648819,
"step": 6310,
"toxic_reward": 4.428107571601868
},
{
"clip_ratio": 0.0,
"completion_length": 43.475,
"epoch": 1.4933837429111532,
"format_reward": 0.0,
"grad_norm": 16.498891830444336,
"image_reward": 0.26070556640625,
"kl": 2.5642897844314576,
"learning_rate": 5e-06,
"loss": -0.0706,
"reward": 0.8335122138261795,
"reward_std": 0.8066307563334704,
"rewards/reward_func": 0.8335122138261795,
"step": 6320,
"toxic_reward": 3.9580556988716125
},
{
"clip_ratio": 0.0,
"completion_length": 49.875,
"epoch": 1.4957466918714557,
"format_reward": -0.25,
"grad_norm": 10.699289321899414,
"image_reward": 0.2347137451171875,
"kl": 3.54896736741066,
"learning_rate": 5e-06,
"loss": 0.004,
"reward": -0.006625699996948242,
"reward_std": 1.1276695830747485,
"rewards/reward_func": -0.006625699996948242,
"step": 6330,
"toxic_reward": 4.5597028732299805
},
{
"clip_ratio": 0.0,
"completion_length": 44.2,
"epoch": 1.498109640831758,
"format_reward": 0.0,
"grad_norm": 25.74764633178711,
"image_reward": 0.2565399169921875,
"kl": 3.312129205465317,
"learning_rate": 5e-06,
"loss": -0.0109,
"reward": 0.974502682685852,
"reward_std": 0.8662806877866387,
"rewards/reward_func": 0.974502682685852,
"step": 6340,
"toxic_reward": 4.394499397277832
},
{
"clip_ratio": 0.0,
"completion_length": 40.75,
"epoch": 1.5004725897920603,
"format_reward": -0.25,
"grad_norm": 7.920969009399414,
"image_reward": 0.25830586850643156,
"kl": 4.049802941083908,
"learning_rate": 5e-06,
"loss": 0.1152,
"reward": 0.04905744194984436,
"reward_std": 1.192653514072299,
"rewards/reward_func": 0.04905744194984436,
"step": 6350,
"toxic_reward": 4.2746446371078495
},
{
"clip_ratio": 0.0,
"completion_length": 54.45,
"epoch": 1.5028355387523629,
"format_reward": 0.0,
"grad_norm": 10.33079719543457,
"image_reward": 0.26002349853515627,
"kl": 5.124299117922783,
"learning_rate": 5e-06,
"loss": 0.0575,
"reward": 0.20074379444122314,
"reward_std": 0.825444309413433,
"rewards/reward_func": 0.20074379444122314,
"step": 6360,
"toxic_reward": 3.7550126791000364
},
{
"clip_ratio": 0.0,
"completion_length": 45.8,
"epoch": 1.5051984877126654,
"format_reward": 0.0,
"grad_norm": 8.266423225402832,
"image_reward": 0.262652587890625,
"kl": 13.647933864593506,
"learning_rate": 5e-06,
"loss": -0.0262,
"reward": 0.6701415419578552,
"reward_std": 0.6827380709350109,
"rewards/reward_func": 0.6701415419578552,
"step": 6370,
"toxic_reward": 4.507234740257263
},
{
"clip_ratio": 0.0,
"completion_length": 52.55,
"epoch": 1.507561436672968,
"format_reward": -0.25,
"grad_norm": 16.820531845092773,
"image_reward": 0.2648305267095566,
"kl": 2943.323862874508,
"learning_rate": 5e-06,
"loss": 0.3168,
"reward": -0.18676466941833497,
"reward_std": 1.4769982114434241,
"rewards/reward_func": -0.18676466941833497,
"step": 6380,
"toxic_reward": 3.6400262832641603
},
{
"clip_ratio": 0.0,
"completion_length": 45.85,
"epoch": 1.5099243856332705,
"format_reward": 0.0,
"grad_norm": 22.587871551513672,
"image_reward": 0.2433380126953125,
"kl": 1.810417714715004,
"learning_rate": 5e-06,
"loss": 0.0137,
"reward": 0.35521286725997925,
"reward_std": 0.5707202635705471,
"rewards/reward_func": 0.35521286725997925,
"step": 6390,
"toxic_reward": 4.4419690608978275
},
{
"clip_ratio": 0.0,
"completion_length": 45.85,
"epoch": 1.5122873345935728,
"format_reward": 0.0,
"grad_norm": 7.745183944702148,
"image_reward": 0.253375244140625,
"kl": 1.7831827580928803,
"learning_rate": 5e-06,
"loss": 0.0178,
"reward": -0.09840984344482422,
"reward_std": 0.7997165352106095,
"rewards/reward_func": -0.09840984344482422,
"step": 6400,
"toxic_reward": 3.674038052558899
},
{
"clip_ratio": 0.0,
"completion_length": 46.025,
"epoch": 1.514650283553875,
"format_reward": 0.0,
"grad_norm": 6.026752948760986,
"image_reward": 0.249884033203125,
"kl": 4.073290675878525,
"learning_rate": 5e-06,
"loss": 0.0349,
"reward": 0.7357653975486755,
"reward_std": 1.102572639286518,
"rewards/reward_func": 0.7357653975486755,
"step": 6410,
"toxic_reward": 4.154096102714538
},
{
"clip_ratio": 0.0,
"completion_length": 42.5,
"epoch": 1.5170132325141776,
"format_reward": 0.0,
"grad_norm": 2.0699822902679443,
"image_reward": 0.2804595947265625,
"kl": 1.0494691252708435,
"learning_rate": 5e-06,
"loss": -0.0105,
"reward": 0.3559255480766296,
"reward_std": 0.9502544086426497,
"rewards/reward_func": 0.3559255480766296,
"step": 6420,
"toxic_reward": 3.829944038391113
},
{
"clip_ratio": 0.0,
"completion_length": 50.725,
"epoch": 1.5193761814744802,
"format_reward": -0.25,
"grad_norm": 2.6939737796783447,
"image_reward": 0.24329833984375,
"kl": 2.4543985188007356,
"learning_rate": 5e-06,
"loss": 0.0488,
"reward": -0.22206905484199524,
"reward_std": 1.1714405838400126,
"rewards/reward_func": -0.22206905484199524,
"step": 6430,
"toxic_reward": 4.02121376991272
},
{
"clip_ratio": 0.0,
"completion_length": 41.8,
"epoch": 1.5217391304347827,
"format_reward": 0.0,
"grad_norm": 7.930452823638916,
"image_reward": 0.270928955078125,
"kl": 2.3063239082694054,
"learning_rate": 5e-06,
"loss": 0.0354,
"reward": -0.090572190284729,
"reward_std": 0.8380892558023334,
"rewards/reward_func": -0.090572190284729,
"step": 6440,
"toxic_reward": 4.0341674268245695
},
{
"clip_ratio": 0.0,
"completion_length": 45.675,
"epoch": 1.524102079395085,
"format_reward": -0.25,
"grad_norm": 11.180920600891113,
"image_reward": 0.26758829653263094,
"kl": 1.2753668040037156,
"learning_rate": 5e-06,
"loss": -0.0044,
"reward": 0.8226781934499741,
"reward_std": 1.3871233612298965,
"rewards/reward_func": 0.8226781934499741,
"step": 6450,
"toxic_reward": 3.5244659066200255
},
{
"clip_ratio": 0.0,
"completion_length": 49.75,
"epoch": 1.5264650283553876,
"format_reward": -0.25,
"grad_norm": 21.079256057739258,
"image_reward": 0.24825642853975297,
"kl": 0.9987513780593872,
"learning_rate": 5e-06,
"loss": -0.0776,
"reward": 0.38041144609451294,
"reward_std": 1.5831992760300637,
"rewards/reward_func": 0.38041144609451294,
"step": 6460,
"toxic_reward": 4.319032979011536
},
{
"clip_ratio": 0.0,
"completion_length": 38.5,
"epoch": 1.5288279773156899,
"format_reward": -0.25,
"grad_norm": 6.294722557067871,
"image_reward": 0.23506622314453124,
"kl": 1.3047454893589019,
"learning_rate": 5e-06,
"loss": -0.037,
"reward": 0.483357185125351,
"reward_std": 1.4818070188164711,
"rewards/reward_func": 0.483357185125351,
"step": 6470,
"toxic_reward": 4.28910231590271
},
{
"clip_ratio": 0.0,
"completion_length": 44.35,
"epoch": 1.5311909262759924,
"format_reward": 0.0,
"grad_norm": 25.183557510375977,
"image_reward": 0.2621653228998184,
"kl": 5.334516155719757,
"learning_rate": 5e-06,
"loss": 0.184,
"reward": 0.5027847826480866,
"reward_std": 0.6526452742516995,
"rewards/reward_func": 0.5027847826480866,
"step": 6480,
"toxic_reward": 4.249637746810913
},
{
"clip_ratio": 0.0,
"completion_length": 37.825,
"epoch": 1.533553875236295,
"format_reward": 0.0,
"grad_norm": 10.60496997833252,
"image_reward": 0.26144917905330656,
"kl": 3.185924381017685,
"learning_rate": 5e-06,
"loss": -0.0162,
"reward": 0.6137366682291031,
"reward_std": 0.5949534647166729,
"rewards/reward_func": 0.6137366682291031,
"step": 6490,
"toxic_reward": 4.377641320228577
},
{
"clip_ratio": 0.0,
"completion_length": 36.975,
"epoch": 1.5359168241965975,
"format_reward": 0.0,
"grad_norm": 5.117782115936279,
"image_reward": 0.2869781494140625,
"kl": 0.5223678901791573,
"learning_rate": 5e-06,
"loss": 0.0356,
"reward": 0.20925453603267669,
"reward_std": 0.5696444906294346,
"rewards/reward_func": 0.20925453603267669,
"step": 6500,
"toxic_reward": 4.292421555519104
},
{
"clip_ratio": 0.0,
"completion_length": 47.7,
"epoch": 1.5382797731568998,
"format_reward": 0.0,
"grad_norm": 2.3820912837982178,
"image_reward": 0.250634765625,
"kl": 1.4595504850149155,
"learning_rate": 5e-06,
"loss": 0.02,
"reward": 0.719105675816536,
"reward_std": 0.9932258397340774,
"rewards/reward_func": 0.719105675816536,
"step": 6510,
"toxic_reward": 4.147652292251587
},
{
"clip_ratio": 0.0,
"completion_length": 49.65,
"epoch": 1.5406427221172023,
"format_reward": 0.0,
"grad_norm": 10.115738868713379,
"image_reward": 0.2735626220703125,
"kl": 2.502386949956417,
"learning_rate": 5e-06,
"loss": 0.0075,
"reward": 0.229107666015625,
"reward_std": 0.8675182597711683,
"rewards/reward_func": 0.229107666015625,
"step": 6520,
"toxic_reward": 3.437857782840729
},
{
"clip_ratio": 0.0,
"completion_length": 39.975,
"epoch": 1.5430056710775046,
"format_reward": 0.0,
"grad_norm": 12.080828666687012,
"image_reward": 0.26392364501953125,
"kl": 1.0831295281648636,
"learning_rate": 5e-06,
"loss": 0.0655,
"reward": 0.5672924667596817,
"reward_std": 1.0283904120326042,
"rewards/reward_func": 0.5672924667596817,
"step": 6530,
"toxic_reward": 4.013876247406006
},
{
"clip_ratio": 0.0,
"completion_length": 52.825,
"epoch": 1.5453686200378072,
"format_reward": -0.25,
"grad_norm": 4.358407497406006,
"image_reward": 0.2578287750482559,
"kl": 0.8826712548732758,
"learning_rate": 5e-06,
"loss": 0.0534,
"reward": 0.1970734715461731,
"reward_std": 1.158833772689104,
"rewards/reward_func": 0.1970734715461731,
"step": 6540,
"toxic_reward": 4.346637082099915
},
{
"clip_ratio": 0.0,
"completion_length": 39.85,
"epoch": 1.5477315689981097,
"format_reward": 0.0,
"grad_norm": 1.7384082078933716,
"image_reward": 0.245843505859375,
"kl": 1.1669968128204347,
"learning_rate": 5e-06,
"loss": 0.1047,
"reward": 0.4790124922990799,
"reward_std": 0.8679840985685587,
"rewards/reward_func": 0.4790124922990799,
"step": 6550,
"toxic_reward": 4.215170729160309
},
{
"clip_ratio": 0.0,
"completion_length": 57.025,
"epoch": 1.5500945179584122,
"format_reward": -0.25,
"grad_norm": 19.237064361572266,
"image_reward": 0.2605519607663155,
"kl": 22.883435368537903,
"learning_rate": 5e-06,
"loss": 0.0861,
"reward": 0.3663973331451416,
"reward_std": 1.3235621018335224,
"rewards/reward_func": 0.3663973331451416,
"step": 6560,
"toxic_reward": 4.399778747558594
},
{
"clip_ratio": 0.0,
"completion_length": 44.5,
"epoch": 1.5524574669187146,
"format_reward": 0.0,
"grad_norm": 12.169953346252441,
"image_reward": 0.240032958984375,
"kl": 2.908788651227951,
"learning_rate": 5e-06,
"loss": -0.0407,
"reward": 0.6662415623664856,
"reward_std": 0.9049512568861247,
"rewards/reward_func": 0.6662415623664856,
"step": 6570,
"toxic_reward": 4.262545752525329
},
{
"clip_ratio": 0.0,
"completion_length": 47.225,
"epoch": 1.5548204158790169,
"format_reward": 0.0,
"grad_norm": 14.397954940795898,
"image_reward": 0.2709014892578125,
"kl": 1.8459113836288452,
"learning_rate": 5e-06,
"loss": 0.0948,
"reward": 0.4346597075462341,
"reward_std": 1.024691704288125,
"rewards/reward_func": 0.4346597075462341,
"step": 6580,
"toxic_reward": 3.837217903137207
},
{
"clip_ratio": 0.0,
"completion_length": 50.25,
"epoch": 1.5571833648393194,
"format_reward": 0.0,
"grad_norm": 5.823775768280029,
"image_reward": 0.2421417236328125,
"kl": 361.87849075496194,
"learning_rate": 5e-06,
"loss": 0.1249,
"reward": 0.7188747763633728,
"reward_std": 0.7972227469086647,
"rewards/reward_func": 0.7188747763633728,
"step": 6590,
"toxic_reward": 4.341918230056763
},
{
"clip_ratio": 0.0,
"completion_length": 47.025,
"epoch": 1.559546313799622,
"format_reward": 0.0,
"grad_norm": 10.157111167907715,
"image_reward": 0.2904388427734375,
"kl": 2.8333058834075926,
"learning_rate": 5e-06,
"loss": -0.0125,
"reward": 0.1938968062400818,
"reward_std": 0.5627395014278591,
"rewards/reward_func": 0.1938968062400818,
"step": 6600,
"toxic_reward": 4.045072281360627
},
{
"clip_ratio": 0.0,
"completion_length": 39.875,
"epoch": 1.5619092627599245,
"format_reward": -0.5,
"grad_norm": 7.558686256408691,
"image_reward": 0.28143310397863386,
"kl": 5.3599341928958895,
"learning_rate": 5e-06,
"loss": -0.0333,
"reward": 0.3363319247961044,
"reward_std": 1.8769858199171723,
"rewards/reward_func": 0.3363319247961044,
"step": 6610,
"toxic_reward": 3.4191954016685484
},
{
"clip_ratio": 0.0,
"completion_length": 36.3,
"epoch": 1.5642722117202268,
"format_reward": 0.0,
"grad_norm": 6.938777446746826,
"image_reward": 0.2576054885983467,
"kl": 3.9582558915019037,
"learning_rate": 5e-06,
"loss": -0.0771,
"reward": 0.785160881280899,
"reward_std": 1.3131701787933707,
"rewards/reward_func": 0.785160881280899,
"step": 6620,
"toxic_reward": 3.997890996932983
},
{
"clip_ratio": 0.0,
"completion_length": 52.075,
"epoch": 1.5666351606805293,
"format_reward": 0.0,
"grad_norm": 7.617331027984619,
"image_reward": 0.2779083251953125,
"kl": 2.7307909965515136,
"learning_rate": 5e-06,
"loss": 0.052,
"reward": 0.6236707329750061,
"reward_std": 0.8986939422786235,
"rewards/reward_func": 0.6236707329750061,
"step": 6630,
"toxic_reward": 4.061939382553101
},
{
"clip_ratio": 0.0,
"completion_length": 51.1,
"epoch": 1.5689981096408316,
"format_reward": -0.5,
"grad_norm": 11.394710540771484,
"image_reward": 0.27598063051700594,
"kl": 4.903402748703956,
"learning_rate": 5e-06,
"loss": 0.0366,
"reward": 0.095058873295784,
"reward_std": 1.6800902128219604,
"rewards/reward_func": 0.095058873295784,
"step": 6640,
"toxic_reward": 3.813139808177948
},
{
"clip_ratio": 0.0,
"completion_length": 59.8,
"epoch": 1.5713610586011342,
"format_reward": 0.0,
"grad_norm": 3.759659767150879,
"image_reward": 0.23873443603515626,
"kl": 7.589515461027622,
"learning_rate": 5e-06,
"loss": -0.0834,
"reward": 0.44450428485870364,
"reward_std": 0.5481153151020408,
"rewards/reward_func": 0.44450428485870364,
"step": 6650,
"toxic_reward": 3.8413244128227233
},
{
"clip_ratio": 0.0,
"completion_length": 51.3,
"epoch": 1.5737240075614367,
"format_reward": 0.0,
"grad_norm": 13.018331527709961,
"image_reward": 0.2600331619381905,
"kl": 3.1137637734413146,
"learning_rate": 5e-06,
"loss": 0.0214,
"reward": 0.7498049587011337,
"reward_std": 0.7316715233027935,
"rewards/reward_func": 0.7498049587011337,
"step": 6660,
"toxic_reward": 4.240022134780884
},
{
"clip_ratio": 0.0,
"completion_length": 49.575,
"epoch": 1.5760869565217392,
"format_reward": 0.0,
"grad_norm": 7.951615333557129,
"image_reward": 0.2636627197265625,
"kl": 0.875077161192894,
"learning_rate": 5e-06,
"loss": -0.1023,
"reward": 0.6656131267547607,
"reward_std": 0.6067664973437786,
"rewards/reward_func": 0.6656131267547607,
"step": 6670,
"toxic_reward": 4.546977305412293
},
{
"clip_ratio": 0.0,
"completion_length": 40.725,
"epoch": 1.5784499054820416,
"format_reward": 0.0,
"grad_norm": 4.168910026550293,
"image_reward": 0.24169108122587205,
"kl": 4.073548844456672,
"learning_rate": 5e-06,
"loss": -0.0498,
"reward": 0.5430697202682495,
"reward_std": 1.2346604462713002,
"rewards/reward_func": 0.5430697202682495,
"step": 6680,
"toxic_reward": 3.895166778564453
},
{
"clip_ratio": 0.0,
"completion_length": 53.275,
"epoch": 1.580812854442344,
"format_reward": 0.0,
"grad_norm": 2.9899535179138184,
"image_reward": 0.23625640869140624,
"kl": 6.596899893879891,
"learning_rate": 5e-06,
"loss": 0.0148,
"reward": 0.3857423186302185,
"reward_std": 0.7419607482850552,
"rewards/reward_func": 0.3857423186302185,
"step": 6690,
"toxic_reward": 4.416726422309876
},
{
"clip_ratio": 0.0,
"completion_length": 40.475,
"epoch": 1.5831758034026464,
"format_reward": 0.0,
"grad_norm": 17.505062103271484,
"image_reward": 0.24977264404296876,
"kl": 130.44692096710205,
"learning_rate": 5e-06,
"loss": 0.0063,
"reward": 0.8495797365903854,
"reward_std": 1.0700383991003037,
"rewards/reward_func": 0.8495797365903854,
"step": 6700,
"toxic_reward": 3.7804057955741883
},
{
"clip_ratio": 0.0,
"completion_length": 46.25,
"epoch": 1.585538752362949,
"format_reward": -0.25,
"grad_norm": 9.865876197814941,
"image_reward": 0.2743357330560684,
"kl": 1.342175406217575,
"learning_rate": 5e-06,
"loss": 0.0144,
"reward": -0.01569686532020569,
"reward_std": 1.0583332434296608,
"rewards/reward_func": -0.01569686532020569,
"step": 6710,
"toxic_reward": 4.605420160293579
},
{
"clip_ratio": 0.0,
"completion_length": 46.65,
"epoch": 1.5879017013232515,
"format_reward": -0.25,
"grad_norm": 1.4694126844406128,
"image_reward": 0.26331074982881547,
"kl": 1.179875871539116,
"learning_rate": 5e-06,
"loss": 0.0439,
"reward": 0.19556427299976348,
"reward_std": 1.060202201642096,
"rewards/reward_func": 0.19556427299976348,
"step": 6720,
"toxic_reward": 4.195696997642517
},
{
"clip_ratio": 0.0,
"completion_length": 41.75,
"epoch": 1.590264650283554,
"format_reward": 0.0,
"grad_norm": 14.831318855285645,
"image_reward": 0.26193084716796877,
"kl": 3.8945027977228164,
"learning_rate": 5e-06,
"loss": -0.0405,
"reward": 0.8553763270378113,
"reward_std": 0.7129356294870377,
"rewards/reward_func": 0.8553763270378113,
"step": 6730,
"toxic_reward": 4.33803927898407
},
{
"clip_ratio": 0.0,
"completion_length": 56.475,
"epoch": 1.5926275992438563,
"format_reward": -0.25,
"grad_norm": 19.41834831237793,
"image_reward": 0.25118509978055953,
"kl": 3.402638703584671,
"learning_rate": 5e-06,
"loss": 0.0046,
"reward": 0.5174610838294029,
"reward_std": 1.2720857471227647,
"rewards/reward_func": 0.5174610838294029,
"step": 6740,
"toxic_reward": 3.870224565267563
},
{
"clip_ratio": 0.0,
"completion_length": 48.025,
"epoch": 1.5949905482041586,
"format_reward": 0.0,
"grad_norm": 6.40997838973999,
"image_reward": 0.258154296875,
"kl": 1.6460766345262527,
"learning_rate": 5e-06,
"loss": 0.0986,
"reward": 1.0767779767513275,
"reward_std": 1.5294719189405441,
"rewards/reward_func": 1.0767779767513275,
"step": 6750,
"toxic_reward": 3.531558334827423
},
{
"clip_ratio": 0.0,
"completion_length": 41.7,
"epoch": 1.5973534971644612,
"format_reward": -0.5,
"grad_norm": 30.405113220214844,
"image_reward": 0.25986429750919343,
"kl": 3.0854232251644134,
"learning_rate": 5e-06,
"loss": -0.0433,
"reward": 0.2282954216003418,
"reward_std": 2.2074968218803406,
"rewards/reward_func": 0.2282954216003418,
"step": 6760,
"toxic_reward": 4.195013093948364
},
{
"clip_ratio": 0.0,
"completion_length": 36.3,
"epoch": 1.5997164461247637,
"format_reward": -0.25,
"grad_norm": 7.383143424987793,
"image_reward": 0.26789347380399703,
"kl": 3.230149340629578,
"learning_rate": 5e-06,
"loss": -0.0221,
"reward": 0.44291332364082336,
"reward_std": 1.4428741056472063,
"rewards/reward_func": 0.44291332364082336,
"step": 6770,
"toxic_reward": 3.8504308581352236
},
{
"clip_ratio": 0.0,
"completion_length": 49.925,
"epoch": 1.6020793950850662,
"format_reward": 0.0,
"grad_norm": 6.7697906494140625,
"image_reward": 0.2436859130859375,
"kl": 3.0731608659029006,
"learning_rate": 5e-06,
"loss": -0.0282,
"reward": 0.9663064420223236,
"reward_std": 0.8580235980451107,
"rewards/reward_func": 0.9663064420223236,
"step": 6780,
"toxic_reward": 3.754929578304291
},
{
"clip_ratio": 0.0,
"completion_length": 43.725,
"epoch": 1.6044423440453688,
"format_reward": 0.0,
"grad_norm": 3.5446696281433105,
"image_reward": 0.260052490234375,
"kl": 763.1712962627411,
"learning_rate": 5e-06,
"loss": 0.0939,
"reward": 0.7344351947307587,
"reward_std": 0.6753151521086693,
"rewards/reward_func": 0.7344351947307587,
"step": 6790,
"toxic_reward": 4.139233088493347
},
{
"clip_ratio": 0.0,
"completion_length": 44.6,
"epoch": 1.606805293005671,
"format_reward": 0.0,
"grad_norm": 21.1129093170166,
"image_reward": 0.27585601806640625,
"kl": 2.870541882514954,
"learning_rate": 5e-06,
"loss": -0.0466,
"reward": 0.5618703544139863,
"reward_std": 0.8203244937583805,
"rewards/reward_func": 0.5618703544139863,
"step": 6800,
"toxic_reward": 4.3937297582626345
},
{
"clip_ratio": 0.0,
"completion_length": 57.125,
"epoch": 1.6091682419659734,
"format_reward": 0.0,
"grad_norm": 5.104684352874756,
"image_reward": 0.280615234375,
"kl": 3.3239043831825255,
"learning_rate": 5e-06,
"loss": 0.045,
"reward": 0.7852797448635102,
"reward_std": 0.610455094370991,
"rewards/reward_func": 0.7852797448635102,
"step": 6810,
"toxic_reward": 4.593313884735108
},
{
"clip_ratio": 0.0,
"completion_length": 45.725,
"epoch": 1.611531190926276,
"format_reward": 0.0,
"grad_norm": 2.496898889541626,
"image_reward": 0.2752532958984375,
"kl": 2.6775636196136476,
"learning_rate": 5e-06,
"loss": -0.012,
"reward": 0.7639135122299194,
"reward_std": 0.9162261974066496,
"rewards/reward_func": 0.7639135122299194,
"step": 6820,
"toxic_reward": 4.146224117279052
},
{
"clip_ratio": 0.0,
"completion_length": 42.075,
"epoch": 1.6138941398865785,
"format_reward": 0.0,
"grad_norm": 6.608152866363525,
"image_reward": 0.267572021484375,
"kl": 3.533026337623596,
"learning_rate": 5e-06,
"loss": 0.0572,
"reward": 0.44893051087856295,
"reward_std": 0.9419144628569484,
"rewards/reward_func": 0.44893051087856295,
"step": 6830,
"toxic_reward": 4.1230400681495665
},
{
"clip_ratio": 0.0,
"completion_length": 54.925,
"epoch": 1.616257088846881,
"format_reward": 0.0,
"grad_norm": 11.28681755065918,
"image_reward": 0.25560302734375,
"kl": 7.369207835197448,
"learning_rate": 5e-06,
"loss": 0.0152,
"reward": 0.5668485701084137,
"reward_std": 0.749977857619524,
"rewards/reward_func": 0.5668485701084137,
"step": 6840,
"toxic_reward": 4.343744564056396
},
{
"clip_ratio": 0.0,
"completion_length": 45.9,
"epoch": 1.6186200378071833,
"format_reward": -0.5,
"grad_norm": 28.541820526123047,
"image_reward": 0.2723876953125,
"kl": 5.971449375152588,
"learning_rate": 5e-06,
"loss": -0.0574,
"reward": -0.5758611798286438,
"reward_std": 1.3836607769131661,
"rewards/reward_func": -0.5758611798286438,
"step": 6850,
"toxic_reward": 4.467629170417785
},
{
"clip_ratio": 0.0,
"completion_length": 37.1,
"epoch": 1.6209829867674859,
"format_reward": 0.0,
"grad_norm": 12.005922317504883,
"image_reward": 0.2653228759765625,
"kl": 6.38155357837677,
"learning_rate": 5e-06,
"loss": 0.0537,
"reward": 0.6645367026329041,
"reward_std": 0.6022280365228653,
"rewards/reward_func": 0.6645367026329041,
"step": 6860,
"toxic_reward": 4.12990357875824
},
{
"clip_ratio": 0.0,
"completion_length": 50.175,
"epoch": 1.6233459357277882,
"format_reward": 0.0,
"grad_norm": 4.00104284286499,
"image_reward": 0.26532745361328125,
"kl": 6.140709114074707,
"learning_rate": 5e-06,
"loss": 0.0866,
"reward": 0.5954837799072266,
"reward_std": 0.9484383892267942,
"rewards/reward_func": 0.5954837799072266,
"step": 6870,
"toxic_reward": 4.030814599990845
},
{
"clip_ratio": 0.0,
"completion_length": 53.2,
"epoch": 1.6257088846880907,
"format_reward": 0.0,
"grad_norm": 7.69809627532959,
"image_reward": 0.2781280517578125,
"kl": 8.641590279340743,
"learning_rate": 5e-06,
"loss": 0.0299,
"reward": 0.35074634552001954,
"reward_std": 0.5954089154489338,
"rewards/reward_func": 0.35074634552001954,
"step": 6880,
"toxic_reward": 4.168118977546692
},
{
"clip_ratio": 0.0,
"completion_length": 47.6,
"epoch": 1.6280718336483933,
"format_reward": -0.25,
"grad_norm": 7.882171154022217,
"image_reward": 0.25705363005399706,
"kl": 5.1897116780281065,
"learning_rate": 5e-06,
"loss": 0.0134,
"reward": -0.19078816771507262,
"reward_std": 1.1844907969236373,
"rewards/reward_func": -0.19078816771507262,
"step": 6890,
"toxic_reward": 4.343024659156799
},
{
"clip_ratio": 0.0,
"completion_length": 45.925,
"epoch": 1.6304347826086958,
"format_reward": -0.25,
"grad_norm": 13.869507789611816,
"image_reward": 0.2716888427734375,
"kl": 6.5070923328399655,
"learning_rate": 5e-06,
"loss": -0.0103,
"reward": 0.15193371772766112,
"reward_std": 1.270319462940097,
"rewards/reward_func": 0.15193371772766112,
"step": 6900,
"toxic_reward": 4.229231309890747
},
{
"clip_ratio": 0.0,
"completion_length": 61.3,
"epoch": 1.632797731568998,
"format_reward": -0.25,
"grad_norm": 6.519335746765137,
"image_reward": 0.268505859375,
"kl": 83.58075475692749,
"learning_rate": 5e-06,
"loss": 0.0011,
"reward": 0.7866749823093414,
"reward_std": 1.1198090038727968,
"rewards/reward_func": 0.7866749823093414,
"step": 6910,
"toxic_reward": 4.473654842376709
},
{
"clip_ratio": 0.0,
"completion_length": 47.875,
"epoch": 1.6351606805293004,
"format_reward": 0.0,
"grad_norm": 5.124833583831787,
"image_reward": 0.2610076904296875,
"kl": 2.196775460243225,
"learning_rate": 5e-06,
"loss": 0.015,
"reward": 1.0072305798530579,
"reward_std": 1.1389783814549446,
"rewards/reward_func": 1.0072305798530579,
"step": 6920,
"toxic_reward": 4.3077033996582035
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 1.637523629489603,
"format_reward": -0.5,
"grad_norm": 3.5923500061035156,
"image_reward": 0.2612147033214569,
"kl": 4.726305472850799,
"learning_rate": 5e-06,
"loss": 0.0617,
"reward": 0.24457889199256896,
"reward_std": 1.512747337669134,
"rewards/reward_func": 0.24457889199256896,
"step": 6930,
"toxic_reward": 4.234147024154663
},
{
"clip_ratio": 0.0,
"completion_length": 50.425,
"epoch": 1.6398865784499055,
"format_reward": 0.0,
"grad_norm": 7.177937030792236,
"image_reward": 0.2646942138671875,
"kl": 5.225604176521301,
"learning_rate": 5e-06,
"loss": -0.0462,
"reward": 0.3636160969734192,
"reward_std": 0.6955469690263272,
"rewards/reward_func": 0.3636160969734192,
"step": 6940,
"toxic_reward": 4.153347599506378
},
{
"clip_ratio": 0.0,
"completion_length": 47.25,
"epoch": 1.642249527410208,
"format_reward": 0.0,
"grad_norm": 10.053350448608398,
"image_reward": 0.253045654296875,
"kl": 5.69408215880394,
"learning_rate": 5e-06,
"loss": 0.0027,
"reward": 0.612578509747982,
"reward_std": 0.6395491607487201,
"rewards/reward_func": 0.612578509747982,
"step": 6950,
"toxic_reward": 3.9997507095336915
},
{
"clip_ratio": 0.0,
"completion_length": 47.025,
"epoch": 1.6446124763705106,
"format_reward": 0.0,
"grad_norm": 43.84629440307617,
"image_reward": 0.2462066650390625,
"kl": 391.4977917432785,
"learning_rate": 5e-06,
"loss": 0.0408,
"reward": 0.610540634393692,
"reward_std": 1.4011766005307436,
"rewards/reward_func": 0.610540634393692,
"step": 6960,
"toxic_reward": 3.513826107978821
},
{
"clip_ratio": 0.0,
"completion_length": 45.7,
"epoch": 1.6469754253308129,
"format_reward": 0.0,
"grad_norm": 2.5391600131988525,
"image_reward": 0.263494873046875,
"kl": 26.704900431632996,
"learning_rate": 5e-06,
"loss": -0.0209,
"reward": 0.6357394754886627,
"reward_std": 0.9666919514536858,
"rewards/reward_func": 0.6357394754886627,
"step": 6970,
"toxic_reward": 4.3967194080352785
},
{
"clip_ratio": 0.0,
"completion_length": 49.275,
"epoch": 1.6493383742911152,
"format_reward": 0.0,
"grad_norm": 1.4004714488983154,
"image_reward": 0.252850341796875,
"kl": 5.729834485054016,
"learning_rate": 5e-06,
"loss": 0.0379,
"reward": 0.47990578413009644,
"reward_std": 0.5631790950894355,
"rewards/reward_func": 0.47990578413009644,
"step": 6980,
"toxic_reward": 4.208314228057861
},
{
"clip_ratio": 0.0,
"completion_length": 57.5,
"epoch": 1.6517013232514177,
"format_reward": 0.0,
"grad_norm": 5.806096076965332,
"image_reward": 0.256463623046875,
"kl": 6.840502554178238,
"learning_rate": 5e-06,
"loss": -0.0496,
"reward": 0.16656889617443085,
"reward_std": 0.9250041805207729,
"rewards/reward_func": 0.16656889617443085,
"step": 6990,
"toxic_reward": 3.7445754587650297
},
{
"clip_ratio": 0.0,
"completion_length": 45.475,
"epoch": 1.6540642722117203,
"format_reward": -0.25,
"grad_norm": 1.5651546716690063,
"image_reward": 0.2714019775390625,
"kl": 4.0309244930744175,
"learning_rate": 5e-06,
"loss": -0.0266,
"reward": 0.3252350568771362,
"reward_std": 1.123583555780351,
"rewards/reward_func": 0.3252350568771362,
"step": 7000,
"toxic_reward": 4.437454390525818
},
{
"clip_ratio": 0.0,
"completion_length": 45.225,
"epoch": 1.6564272211720228,
"format_reward": -0.25,
"grad_norm": 8.255953788757324,
"image_reward": 0.2734893798828125,
"kl": 13.629169458150864,
"learning_rate": 5e-06,
"loss": 0.0185,
"reward": 0.5219172418117524,
"reward_std": 1.4155076075345279,
"rewards/reward_func": 0.5219172418117524,
"step": 7010,
"toxic_reward": 3.9341206908226014
},
{
"clip_ratio": 0.0,
"completion_length": 44.525,
"epoch": 1.658790170132325,
"format_reward": 0.0,
"grad_norm": 7.765683650970459,
"image_reward": 0.2684661865234375,
"kl": 4.484488549828529,
"learning_rate": 5e-06,
"loss": 0.0198,
"reward": 0.41308672428131105,
"reward_std": 0.6728175904601812,
"rewards/reward_func": 0.41308672428131105,
"step": 7020,
"toxic_reward": 4.680417871475219
},
{
"clip_ratio": 0.0,
"completion_length": 43.425,
"epoch": 1.6611531190926276,
"format_reward": 0.0,
"grad_norm": 6.050631523132324,
"image_reward": 0.2446624755859375,
"kl": 5.303134024143219,
"learning_rate": 5e-06,
"loss": -0.0023,
"reward": 0.7305093944072724,
"reward_std": 0.8056725425645709,
"rewards/reward_func": 0.7305093944072724,
"step": 7030,
"toxic_reward": 4.371766519546509
},
{
"clip_ratio": 0.0,
"completion_length": 50.725,
"epoch": 1.66351606805293,
"format_reward": -0.25,
"grad_norm": 36.67766189575195,
"image_reward": 0.2738332122564316,
"kl": 3.6509076714515687,
"learning_rate": 5e-06,
"loss": 0.0104,
"reward": 0.1336117923259735,
"reward_std": 1.408228962123394,
"rewards/reward_func": 0.1336117923259735,
"step": 7040,
"toxic_reward": 3.868592691421509
},
{
"clip_ratio": 0.0,
"completion_length": 45.95,
"epoch": 1.6658790170132325,
"format_reward": -0.25,
"grad_norm": 3.2863216400146484,
"image_reward": 0.2496002197265625,
"kl": 47.7468825340271,
"learning_rate": 5e-06,
"loss": -0.0431,
"reward": -0.1912323772907257,
"reward_std": 1.0762672819197179,
"rewards/reward_func": -0.1912323772907257,
"step": 7050,
"toxic_reward": 4.286359405517578
},
{
"clip_ratio": 0.0,
"completion_length": 50.8,
"epoch": 1.668241965973535,
"format_reward": -0.75,
"grad_norm": 15.117477416992188,
"image_reward": 0.2343353286385536,
"kl": 12.957087469100951,
"learning_rate": 5e-06,
"loss": -0.0128,
"reward": -0.5250297307968139,
"reward_std": 2.128708484955132,
"rewards/reward_func": -0.5250297307968139,
"step": 7060,
"toxic_reward": 4.045247128605842
},
{
"clip_ratio": 0.0,
"completion_length": 49.15,
"epoch": 1.6706049149338376,
"format_reward": -0.25,
"grad_norm": 6.236794471740723,
"image_reward": 0.2446756988763809,
"kl": 3.7372434973716735,
"learning_rate": 5e-06,
"loss": -0.0289,
"reward": 0.1568456247448921,
"reward_std": 1.3125899083912373,
"rewards/reward_func": 0.1568456247448921,
"step": 7070,
"toxic_reward": 4.527845191955566
},
{
"clip_ratio": 0.0,
"completion_length": 47.0,
"epoch": 1.6729678638941399,
"format_reward": 0.0,
"grad_norm": 1.8276275396347046,
"image_reward": 0.253338623046875,
"kl": 18.982901883125304,
"learning_rate": 5e-06,
"loss": -0.0591,
"reward": 0.5796426713466645,
"reward_std": 0.8607377586886287,
"rewards/reward_func": 0.5796426713466645,
"step": 7080,
"toxic_reward": 4.365747809410095
},
{
"clip_ratio": 0.0,
"completion_length": 52.0,
"epoch": 1.6753308128544422,
"format_reward": 0.0,
"grad_norm": 5.115592956542969,
"image_reward": 0.25368804931640626,
"kl": 8.384132671356202,
"learning_rate": 5e-06,
"loss": -0.0358,
"reward": 0.4894866108894348,
"reward_std": 0.82001001983881,
"rewards/reward_func": 0.4894866108894348,
"step": 7090,
"toxic_reward": 4.161544275283814
},
{
"clip_ratio": 0.0,
"completion_length": 55.45,
"epoch": 1.6776937618147447,
"format_reward": -0.25,
"grad_norm": 4.421766757965088,
"image_reward": 0.257010905444622,
"kl": 1.5507995724678039,
"learning_rate": 5e-06,
"loss": 0.0479,
"reward": -0.053127193450927736,
"reward_std": 0.8562082014977932,
"rewards/reward_func": -0.053127193450927736,
"step": 7100,
"toxic_reward": 4.494407868385315
},
{
"clip_ratio": 0.0,
"completion_length": 39.125,
"epoch": 1.6800567107750473,
"format_reward": 0.0,
"grad_norm": 15.849198341369629,
"image_reward": 0.254315185546875,
"kl": 4.097546017169952,
"learning_rate": 5e-06,
"loss": -0.0412,
"reward": 0.42282047867774963,
"reward_std": 0.9609952576458454,
"rewards/reward_func": 0.42282047867774963,
"step": 7110,
"toxic_reward": 4.278821682929992
},
{
"clip_ratio": 0.0,
"completion_length": 47.15,
"epoch": 1.6824196597353498,
"format_reward": -0.25,
"grad_norm": 9.184070587158203,
"image_reward": 0.2508982330560684,
"kl": 1.9086317151784897,
"learning_rate": 5e-06,
"loss": -0.0676,
"reward": -0.08927419185638427,
"reward_std": 1.1106989961117506,
"rewards/reward_func": -0.08927419185638427,
"step": 7120,
"toxic_reward": 4.535511326789856
},
{
"clip_ratio": 0.0,
"completion_length": 48.45,
"epoch": 1.6847826086956523,
"format_reward": 0.0,
"grad_norm": 10.833540916442871,
"image_reward": 0.2317718505859375,
"kl": 1.5903507679700852,
"learning_rate": 5e-06,
"loss": -0.0026,
"reward": 0.5086119592189788,
"reward_std": 0.610715470276773,
"rewards/reward_func": 0.5086119592189788,
"step": 7130,
"toxic_reward": 4.333575582504272
},
{
"clip_ratio": 0.0,
"completion_length": 49.75,
"epoch": 1.6871455576559546,
"format_reward": 0.0,
"grad_norm": 5.887187480926514,
"image_reward": 0.2499237060546875,
"kl": 49.958091259002686,
"learning_rate": 5e-06,
"loss": -0.0703,
"reward": 0.7590021967887879,
"reward_std": 0.8256058894097805,
"rewards/reward_func": 0.7590021967887879,
"step": 7140,
"toxic_reward": 4.225302958488465
},
{
"clip_ratio": 0.0,
"completion_length": 55.325,
"epoch": 1.689508506616257,
"format_reward": 0.0,
"grad_norm": 12.265044212341309,
"image_reward": 0.23530120849609376,
"kl": 0.9182627111673355,
"learning_rate": 5e-06,
"loss": -0.0673,
"reward": 0.3405183613300323,
"reward_std": 0.7431152425706387,
"rewards/reward_func": 0.3405183613300323,
"step": 7150,
"toxic_reward": 4.184520816802978
},
{
"clip_ratio": 0.0,
"completion_length": 52.175,
"epoch": 1.6918714555765595,
"format_reward": -0.25,
"grad_norm": 3.966953754425049,
"image_reward": 0.267620849609375,
"kl": 1.64820496737957,
"learning_rate": 5e-06,
"loss": 0.1687,
"reward": 0.01701483130455017,
"reward_std": 1.4536124819889664,
"rewards/reward_func": 0.01701483130455017,
"step": 7160,
"toxic_reward": 3.7502978086471557
},
{
"clip_ratio": 0.0,
"completion_length": 45.15,
"epoch": 1.694234404536862,
"format_reward": 0.0,
"grad_norm": 5.9182000160217285,
"image_reward": 0.25394287109375,
"kl": 1.6780494809150697,
"learning_rate": 5e-06,
"loss": 0.0719,
"reward": 0.8605155050754547,
"reward_std": 0.9149322494864464,
"rewards/reward_func": 0.8605155050754547,
"step": 7170,
"toxic_reward": 3.9694084405899046
},
{
"clip_ratio": 0.0,
"completion_length": 44.525,
"epoch": 1.6965973534971646,
"format_reward": 0.0,
"grad_norm": 2.476659059524536,
"image_reward": 0.272882080078125,
"kl": 3.32854140996933,
"learning_rate": 5e-06,
"loss": -0.0742,
"reward": 1.0633208215236665,
"reward_std": 0.9789414823055267,
"rewards/reward_func": 1.0633208215236665,
"step": 7180,
"toxic_reward": 4.391367101669312
},
{
"clip_ratio": 0.0,
"completion_length": 45.825,
"epoch": 1.6989603024574669,
"format_reward": 0.0,
"grad_norm": 3.7560040950775146,
"image_reward": 0.269110107421875,
"kl": 1.4420736670494079,
"learning_rate": 5e-06,
"loss": 0.0345,
"reward": 1.0360184490680695,
"reward_std": 0.8136029925197363,
"rewards/reward_func": 1.0360184490680695,
"step": 7190,
"toxic_reward": 3.99700380563736
},
{
"clip_ratio": 0.0,
"completion_length": 33.925,
"epoch": 1.7013232514177694,
"format_reward": 0.0,
"grad_norm": 11.579362869262695,
"image_reward": 0.2462371826171875,
"kl": 0.9962957471609115,
"learning_rate": 5e-06,
"loss": -0.0255,
"reward": 0.2942840725183487,
"reward_std": 0.3486198179423809,
"rewards/reward_func": 0.2942840725183487,
"step": 7200,
"toxic_reward": 3.8329622387886046
},
{
"clip_ratio": 0.0,
"completion_length": 48.825,
"epoch": 1.7036862003780717,
"format_reward": -0.5,
"grad_norm": 15.74374008178711,
"image_reward": 0.26438903957605364,
"kl": 1.2382088035345078,
"learning_rate": 5e-06,
"loss": -0.0735,
"reward": 0.2413632392883301,
"reward_std": 1.651388045027852,
"rewards/reward_func": 0.2413632392883301,
"step": 7210,
"toxic_reward": 4.516292905807495
},
{
"clip_ratio": 0.0,
"completion_length": 46.625,
"epoch": 1.7060491493383743,
"format_reward": 0.0,
"grad_norm": 9.237770080566406,
"image_reward": 0.2369354248046875,
"kl": 1.5744222581386567,
"learning_rate": 5e-06,
"loss": 0.0256,
"reward": 0.6944510787725449,
"reward_std": 1.11760393679142,
"rewards/reward_func": 0.6944510787725449,
"step": 7220,
"toxic_reward": 3.7596142530441283
},
{
"clip_ratio": 0.0,
"completion_length": 39.125,
"epoch": 1.7084120982986768,
"format_reward": 0.0,
"grad_norm": 3.7665228843688965,
"image_reward": 0.2577423095703125,
"kl": 0.7259436190128327,
"learning_rate": 5e-06,
"loss": 0.0117,
"reward": 0.5142745256423951,
"reward_std": 0.6884998820722104,
"rewards/reward_func": 0.5142745256423951,
"step": 7230,
"toxic_reward": 4.332102084159851
},
{
"clip_ratio": 0.0,
"completion_length": 48.475,
"epoch": 1.7107750472589793,
"format_reward": 0.0,
"grad_norm": 4.795387268066406,
"image_reward": 0.28794708251953127,
"kl": 1.6049385368824005,
"learning_rate": 5e-06,
"loss": 0.0341,
"reward": 0.308843332529068,
"reward_std": 0.4225019045174122,
"rewards/reward_func": 0.308843332529068,
"step": 7240,
"toxic_reward": 4.501336789131164
},
{
"clip_ratio": 0.0,
"completion_length": 55.0,
"epoch": 1.7131379962192816,
"format_reward": -0.25,
"grad_norm": 11.164639472961426,
"image_reward": 0.25756022036075593,
"kl": 0.43412337452173233,
"learning_rate": 5e-06,
"loss": -0.026,
"reward": 0.46165032386779786,
"reward_std": 0.9854918915778399,
"rewards/reward_func": 0.46165032386779786,
"step": 7250,
"toxic_reward": 4.23072258234024
},
{
"clip_ratio": 0.0,
"completion_length": 48.925,
"epoch": 1.715500945179584,
"format_reward": 0.0,
"grad_norm": 26.601303100585938,
"image_reward": 0.24893798828125,
"kl": 3.482639339566231,
"learning_rate": 5e-06,
"loss": -0.0419,
"reward": 0.5657954633235931,
"reward_std": 1.2434701435267925,
"rewards/reward_func": 0.5657954633235931,
"step": 7260,
"toxic_reward": 4.052207565307617
},
{
"clip_ratio": 0.0,
"completion_length": 46.85,
"epoch": 1.7178638941398865,
"format_reward": -0.25,
"grad_norm": 19.468366622924805,
"image_reward": 0.24361775815486908,
"kl": 0.6207199424505234,
"learning_rate": 5e-06,
"loss": -0.1011,
"reward": 0.2289634108543396,
"reward_std": 1.1521323285996914,
"rewards/reward_func": 0.2289634108543396,
"step": 7270,
"toxic_reward": 4.243139553070068
},
{
"clip_ratio": 0.0,
"completion_length": 44.475,
"epoch": 1.720226843100189,
"format_reward": 0.0,
"grad_norm": 11.427348136901855,
"image_reward": 0.24371236115694045,
"kl": 1.2423572808504104,
"learning_rate": 5e-06,
"loss": -0.0055,
"reward": 0.44162888526916505,
"reward_std": 1.226283924281597,
"rewards/reward_func": 0.44162888526916505,
"step": 7280,
"toxic_reward": 3.9729990482330324
},
{
"clip_ratio": 0.0,
"completion_length": 49.725,
"epoch": 1.7225897920604916,
"format_reward": 0.0,
"grad_norm": 8.307239532470703,
"image_reward": 0.2623565673828125,
"kl": 0.795675303786993,
"learning_rate": 5e-06,
"loss": 0.005,
"reward": 0.47723318338394166,
"reward_std": 0.5881602220237255,
"rewards/reward_func": 0.47723318338394166,
"step": 7290,
"toxic_reward": 4.611540603637695
},
{
"clip_ratio": 0.0,
"completion_length": 49.125,
"epoch": 1.724952741020794,
"format_reward": 0.0,
"grad_norm": 7.304860591888428,
"image_reward": 0.2538177490234375,
"kl": 1.0193208366632462,
"learning_rate": 5e-06,
"loss": -0.023,
"reward": 0.17551978230476378,
"reward_std": 0.5646818313747645,
"rewards/reward_func": 0.17551978230476378,
"step": 7300,
"toxic_reward": 4.499063897132873
},
{
"clip_ratio": 0.0,
"completion_length": 50.275,
"epoch": 1.7273156899810964,
"format_reward": 0.0,
"grad_norm": 1.351771354675293,
"image_reward": 0.2463592529296875,
"kl": 1.9171950757503509,
"learning_rate": 5e-06,
"loss": 0.0213,
"reward": 0.466388076543808,
"reward_std": 0.8451812721788883,
"rewards/reward_func": 0.466388076543808,
"step": 7310,
"toxic_reward": 4.565359354019165
},
{
"clip_ratio": 0.0,
"completion_length": 46.925,
"epoch": 1.7296786389413987,
"format_reward": -0.25,
"grad_norm": 2.364166021347046,
"image_reward": 0.25976969450712206,
"kl": 0.5259292095899581,
"learning_rate": 5e-06,
"loss": -0.084,
"reward": 0.45669102370738984,
"reward_std": 1.098591622710228,
"rewards/reward_func": 0.45669102370738984,
"step": 7320,
"toxic_reward": 4.627902317047119
},
{
"clip_ratio": 0.0,
"completion_length": 50.825,
"epoch": 1.7320415879017013,
"format_reward": 0.0,
"grad_norm": 23.96133804321289,
"image_reward": 0.2389556884765625,
"kl": 1.1734901428222657,
"learning_rate": 5e-06,
"loss": 0.038,
"reward": 0.7277517914772034,
"reward_std": 0.8356013357639313,
"rewards/reward_func": 0.7277517914772034,
"step": 7330,
"toxic_reward": 4.407384157180786
},
{
"clip_ratio": 0.0,
"completion_length": 44.0,
"epoch": 1.7344045368620038,
"format_reward": 0.0,
"grad_norm": 17.774612426757812,
"image_reward": 0.2680206298828125,
"kl": 4.040656617283821,
"learning_rate": 5e-06,
"loss": 0.0436,
"reward": 0.2283779501914978,
"reward_std": 0.34994165217503903,
"rewards/reward_func": 0.2283779501914978,
"step": 7340,
"toxic_reward": 4.637366437911988
},
{
"clip_ratio": 0.0,
"completion_length": 42.25,
"epoch": 1.7367674858223063,
"format_reward": -0.25,
"grad_norm": 12.662446022033691,
"image_reward": 0.24230550229549408,
"kl": 0.5235348105430603,
"learning_rate": 5e-06,
"loss": -0.042,
"reward": 0.8077804684638977,
"reward_std": 1.315062115341425,
"rewards/reward_func": 0.8077804684638977,
"step": 7350,
"toxic_reward": 4.6036452293396
},
{
"clip_ratio": 0.0,
"completion_length": 49.75,
"epoch": 1.7391304347826086,
"format_reward": 0.0,
"grad_norm": 2.7947723865509033,
"image_reward": 0.259271240234375,
"kl": 0.5023418068885803,
"learning_rate": 5e-06,
"loss": -0.0101,
"reward": 0.47644210457801817,
"reward_std": 0.6371240261942148,
"rewards/reward_func": 0.47644210457801817,
"step": 7360,
"toxic_reward": 4.352305841445923
},
{
"clip_ratio": 0.0,
"completion_length": 53.975,
"epoch": 1.7414933837429112,
"format_reward": 0.0,
"grad_norm": 6.967306137084961,
"image_reward": 0.2540252685546875,
"kl": 0.5360975474119186,
"learning_rate": 5e-06,
"loss": 0.0708,
"reward": 0.5753240287303925,
"reward_std": 0.8622719066217541,
"rewards/reward_func": 0.5753240287303925,
"step": 7370,
"toxic_reward": 4.0306689739227295
},
{
"clip_ratio": 0.0,
"completion_length": 44.75,
"epoch": 1.7438563327032135,
"format_reward": -0.5,
"grad_norm": 31.72753143310547,
"image_reward": 0.22639973908662797,
"kl": 0.5255977511405945,
"learning_rate": 5e-06,
"loss": 0.0039,
"reward": -0.24822215884923934,
"reward_std": 1.6855425260961057,
"rewards/reward_func": -0.24822215884923934,
"step": 7380,
"toxic_reward": 3.752596640586853
},
{
"clip_ratio": 0.0,
"completion_length": 52.375,
"epoch": 1.746219281663516,
"format_reward": 0.0,
"grad_norm": 7.6846818923950195,
"image_reward": 0.2564056396484375,
"kl": 3.591386225819588,
"learning_rate": 5e-06,
"loss": 0.0212,
"reward": 0.12304354310035706,
"reward_std": 0.8115306086838245,
"rewards/reward_func": 0.12304354310035706,
"step": 7390,
"toxic_reward": 3.613353615999222
},
{
"clip_ratio": 0.0,
"completion_length": 49.65,
"epoch": 1.7485822306238186,
"format_reward": 0.0,
"grad_norm": 2.726175308227539,
"image_reward": 0.283404541015625,
"kl": 0.9659576997160911,
"learning_rate": 5e-06,
"loss": 0.0252,
"reward": 0.3961315780878067,
"reward_std": 1.0492550559341907,
"rewards/reward_func": 0.3961315780878067,
"step": 7400,
"toxic_reward": 3.501691198348999
},
{
"clip_ratio": 0.0,
"completion_length": 41.95,
"epoch": 1.750945179584121,
"format_reward": 0.0,
"grad_norm": 3.0125391483306885,
"image_reward": 0.2607086181640625,
"kl": 0.6532519310712814,
"learning_rate": 5e-06,
"loss": 0.0276,
"reward": 0.4769218623638153,
"reward_std": 0.6247519843280316,
"rewards/reward_func": 0.4769218623638153,
"step": 7410,
"toxic_reward": 4.560657954216003
},
{
"clip_ratio": 0.0,
"completion_length": 56.425,
"epoch": 1.7533081285444234,
"format_reward": 0.0,
"grad_norm": 18.774812698364258,
"image_reward": 0.24655609130859374,
"kl": 1.901971572637558,
"learning_rate": 5e-06,
"loss": 0.0145,
"reward": 0.6345466494560241,
"reward_std": 1.1331901341676711,
"rewards/reward_func": 0.6345466494560241,
"step": 7420,
"toxic_reward": 4.449591946601868
},
{
"clip_ratio": 0.0,
"completion_length": 52.375,
"epoch": 1.755671077504726,
"format_reward": 0.0,
"grad_norm": 4.1103057861328125,
"image_reward": 0.265411376953125,
"kl": 1.7676091372966767,
"learning_rate": 5e-06,
"loss": 0.0034,
"reward": 0.6921305894851685,
"reward_std": 0.6238477535545826,
"rewards/reward_func": 0.6921305894851685,
"step": 7430,
"toxic_reward": 3.859569197893143
},
{
"clip_ratio": 0.0,
"completion_length": 46.65,
"epoch": 1.7580340264650283,
"format_reward": 0.0,
"grad_norm": 2.0048232078552246,
"image_reward": 0.24298477172851562,
"kl": 4.202221667766571,
"learning_rate": 5e-06,
"loss": 0.1367,
"reward": 0.9155125916004181,
"reward_std": 0.7328770853579044,
"rewards/reward_func": 0.9155125916004181,
"step": 7440,
"toxic_reward": 4.531428098678589
},
{
"clip_ratio": 0.0,
"completion_length": 38.25,
"epoch": 1.7603969754253308,
"format_reward": 0.0,
"grad_norm": 20.737003326416016,
"image_reward": 0.25689697265625,
"kl": 16.54909121990204,
"learning_rate": 5e-06,
"loss": 0.037,
"reward": 0.8588055372238159,
"reward_std": 0.9005012600682676,
"rewards/reward_func": 0.8588055372238159,
"step": 7450,
"toxic_reward": 4.513736462593078
},
{
"clip_ratio": 0.0,
"completion_length": 50.7,
"epoch": 1.7627599243856333,
"format_reward": 0.0,
"grad_norm": 1.5940968990325928,
"image_reward": 0.26309814453125,
"kl": 2.317558985948563,
"learning_rate": 5e-06,
"loss": 0.0018,
"reward": 0.20084644556045533,
"reward_std": 0.7237232834100723,
"rewards/reward_func": 0.20084644556045533,
"step": 7460,
"toxic_reward": 4.334891009330749
},
{
"clip_ratio": 0.0,
"completion_length": 44.325,
"epoch": 1.7651228733459359,
"format_reward": 0.0,
"grad_norm": 10.07941722869873,
"image_reward": 0.2528594970703125,
"kl": 1.3212820410728454,
"learning_rate": 5e-06,
"loss": -0.0444,
"reward": 1.2387877494096755,
"reward_std": 0.8179315060377121,
"rewards/reward_func": 1.2387877494096755,
"step": 7470,
"toxic_reward": 4.363593196868896
},
{
"clip_ratio": 0.0,
"completion_length": 49.825,
"epoch": 1.7674858223062382,
"format_reward": 0.0,
"grad_norm": 28.392396926879883,
"image_reward": 0.25749053955078127,
"kl": 2.198029878735542,
"learning_rate": 5e-06,
"loss": 0.0322,
"reward": 0.1901194632053375,
"reward_std": 0.5339192871004343,
"rewards/reward_func": 0.1901194632053375,
"step": 7480,
"toxic_reward": 4.514597225189209
},
{
"clip_ratio": 0.0,
"completion_length": 44.7,
"epoch": 1.7698487712665405,
"format_reward": 0.0,
"grad_norm": 26.77941131591797,
"image_reward": 0.241033935546875,
"kl": 6.588536351919174,
"learning_rate": 5e-06,
"loss": 0.0218,
"reward": 0.2174743801355362,
"reward_std": 0.8413432762026787,
"rewards/reward_func": 0.2174743801355362,
"step": 7490,
"toxic_reward": 4.284235906600952
},
{
"clip_ratio": 0.0,
"completion_length": 42.55,
"epoch": 1.772211720226843,
"format_reward": -0.25,
"grad_norm": 18.408794403076172,
"image_reward": 0.2581207275390625,
"kl": 3.106099420785904,
"learning_rate": 5e-06,
"loss": 0.0257,
"reward": 0.31152122020721434,
"reward_std": 1.2936958684585989,
"rewards/reward_func": 0.31152122020721434,
"step": 7500,
"toxic_reward": 4.310132288932801
},
{
"clip_ratio": 0.0,
"completion_length": 40.475,
"epoch": 1.7745746691871456,
"format_reward": 0.0,
"grad_norm": 32.10823440551758,
"image_reward": 0.232781982421875,
"kl": 11.768871355056763,
"learning_rate": 5e-06,
"loss": -0.0201,
"reward": 1.5193881750106812,
"reward_std": 0.8748866233974695,
"rewards/reward_func": 1.5193881750106812,
"step": 7510,
"toxic_reward": 4.612711477279663
},
{
"clip_ratio": 0.0,
"completion_length": 45.575,
"epoch": 1.776937618147448,
"format_reward": 0.0,
"grad_norm": 10.912269592285156,
"image_reward": 0.235693359375,
"kl": 2.0526355147361754,
"learning_rate": 5e-06,
"loss": 0.1284,
"reward": 1.3539286196231841,
"reward_std": 0.9052736334502697,
"rewards/reward_func": 1.3539286196231841,
"step": 7520,
"toxic_reward": 4.487947154045105
},
{
"clip_ratio": 0.0,
"completion_length": 46.575,
"epoch": 1.7793005671077504,
"format_reward": 0.0,
"grad_norm": 13.928491592407227,
"image_reward": 0.2556488037109375,
"kl": 21.833010697364806,
"learning_rate": 5e-06,
"loss": -0.1174,
"reward": 0.5344179272651672,
"reward_std": 0.7245766028761864,
"rewards/reward_func": 0.5344179272651672,
"step": 7530,
"toxic_reward": 4.373207831382752
},
{
"clip_ratio": 0.0,
"completion_length": 46.975,
"epoch": 1.781663516068053,
"format_reward": 0.0,
"grad_norm": 8.675307273864746,
"image_reward": 0.24088897705078124,
"kl": 1.3107800006866455,
"learning_rate": 5e-06,
"loss": 0.0403,
"reward": 0.04000200629234314,
"reward_std": 1.0572677969932556,
"rewards/reward_func": 0.04000200629234314,
"step": 7540,
"toxic_reward": 4.048869323730469
},
{
"clip_ratio": 0.0,
"completion_length": 40.975,
"epoch": 1.7840264650283553,
"format_reward": -0.25,
"grad_norm": 3.656561851501465,
"image_reward": 0.24349263608455657,
"kl": 3.6083962321281433,
"learning_rate": 5e-06,
"loss": -0.0326,
"reward": 0.10396124720573426,
"reward_std": 1.0819443106651305,
"rewards/reward_func": 0.10396124720573426,
"step": 7550,
"toxic_reward": 4.45411868095398
},
{
"clip_ratio": 0.0,
"completion_length": 40.6,
"epoch": 1.7863894139886578,
"format_reward": 0.0,
"grad_norm": 2.0053718090057373,
"image_reward": 0.280938720703125,
"kl": 1.8616322338581086,
"learning_rate": 5e-06,
"loss": -0.0035,
"reward": 0.602351513504982,
"reward_std": 0.8774395015090704,
"rewards/reward_func": 0.602351513504982,
"step": 7560,
"toxic_reward": 3.8221758723258974
},
{
"clip_ratio": 0.0,
"completion_length": 49.15,
"epoch": 1.7887523629489603,
"format_reward": 0.0,
"grad_norm": 6.999305248260498,
"image_reward": 0.24803619384765624,
"kl": 1.7729626595973969,
"learning_rate": 5e-06,
"loss": -0.004,
"reward": 0.33846797943115237,
"reward_std": 0.587756198644638,
"rewards/reward_func": 0.33846797943115237,
"step": 7570,
"toxic_reward": 4.2568159103393555
},
{
"clip_ratio": 0.0,
"completion_length": 38.375,
"epoch": 1.7911153119092629,
"format_reward": 0.0,
"grad_norm": 12.467576026916504,
"image_reward": 0.2431640625,
"kl": 1.3180940926074982,
"learning_rate": 5e-06,
"loss": 0.0225,
"reward": 0.6568324744701386,
"reward_std": 0.5710492163896561,
"rewards/reward_func": 0.6568324744701386,
"step": 7580,
"toxic_reward": 4.575870084762573
},
{
"clip_ratio": 0.0,
"completion_length": 48.675,
"epoch": 1.7934782608695652,
"format_reward": 0.0,
"grad_norm": 41.636165618896484,
"image_reward": 0.2553070068359375,
"kl": 1.2196908950805665,
"learning_rate": 5e-06,
"loss": 0.0276,
"reward": 0.9933471500873565,
"reward_std": 0.8478576868772507,
"rewards/reward_func": 0.9933471500873565,
"step": 7590,
"toxic_reward": 4.177789008617401
},
{
"clip_ratio": 0.0,
"completion_length": 36.3,
"epoch": 1.7958412098298677,
"format_reward": 0.0,
"grad_norm": 8.115588188171387,
"image_reward": 0.27226715087890624,
"kl": 5.791901814937591,
"learning_rate": 5e-06,
"loss": 0.0022,
"reward": 0.3163196682929993,
"reward_std": 0.8629786409437656,
"rewards/reward_func": 0.3163196682929993,
"step": 7600,
"toxic_reward": 3.73489425778389
},
{
"clip_ratio": 0.0,
"completion_length": 49.95,
"epoch": 1.79820415879017,
"format_reward": 0.0,
"grad_norm": 2.3679516315460205,
"image_reward": 0.2376190185546875,
"kl": 4.311083900928497,
"learning_rate": 5e-06,
"loss": -0.0175,
"reward": 0.6290358543395996,
"reward_std": 1.0244077319279312,
"rewards/reward_func": 0.6290358543395996,
"step": 7610,
"toxic_reward": 4.054656505584717
},
{
"clip_ratio": 0.0,
"completion_length": 49.35,
"epoch": 1.8005671077504726,
"format_reward": 0.0,
"grad_norm": 2.1850380897521973,
"image_reward": 0.247137451171875,
"kl": 3.278796100616455,
"learning_rate": 5e-06,
"loss": 0.056,
"reward": 1.2004601210355759,
"reward_std": 0.7055684822611511,
"rewards/reward_func": 1.2004601210355759,
"step": 7620,
"toxic_reward": 3.5256235122680666
},
{
"clip_ratio": 0.0,
"completion_length": 39.75,
"epoch": 1.802930056710775,
"format_reward": -0.25,
"grad_norm": 3.8605425357818604,
"image_reward": 0.24492238312959672,
"kl": 7.6126263558864595,
"learning_rate": 5e-06,
"loss": -0.0959,
"reward": 0.37777516841888426,
"reward_std": 1.1775035494938493,
"rewards/reward_func": 0.37777516841888426,
"step": 7630,
"toxic_reward": 4.522587513923645
},
{
"clip_ratio": 0.0,
"completion_length": 45.375,
"epoch": 1.8052930056710776,
"format_reward": -0.25,
"grad_norm": 6.144404411315918,
"image_reward": 0.241168212890625,
"kl": 1.456436914205551,
"learning_rate": 5e-06,
"loss": -0.0231,
"reward": 0.36865578293800355,
"reward_std": 1.7164668783545494,
"rewards/reward_func": 0.36865578293800355,
"step": 7640,
"toxic_reward": 3.9745461702346803
},
{
"clip_ratio": 0.0,
"completion_length": 47.775,
"epoch": 1.80765595463138,
"format_reward": 0.0,
"grad_norm": 33.95363998413086,
"image_reward": 0.2472625732421875,
"kl": 2.2694355845451355,
"learning_rate": 5e-06,
"loss": -0.1037,
"reward": 0.8588967323303223,
"reward_std": 1.019287913478911,
"rewards/reward_func": 0.8588967323303223,
"step": 7650,
"toxic_reward": 4.213714742660523
},
{
"clip_ratio": 0.0,
"completion_length": 50.35,
"epoch": 1.8100189035916823,
"format_reward": -0.25,
"grad_norm": 6.865695953369141,
"image_reward": 0.25559844970703127,
"kl": 6.3857537567615505,
"learning_rate": 5e-06,
"loss": -0.0924,
"reward": -0.07846069931983948,
"reward_std": 1.1336833463981748,
"rewards/reward_func": -0.07846069931983948,
"step": 7660,
"toxic_reward": 4.2933889627456665
},
{
"clip_ratio": 0.0,
"completion_length": 53.825,
"epoch": 1.8123818525519848,
"format_reward": 0.0,
"grad_norm": 2.160090923309326,
"image_reward": 0.2841064453125,
"kl": 5.202520692348481,
"learning_rate": 5e-06,
"loss": 0.006,
"reward": 1.153634887933731,
"reward_std": 1.2888424217700958,
"rewards/reward_func": 1.153634887933731,
"step": 7670,
"toxic_reward": 3.994613242149353
},
{
"clip_ratio": 0.0,
"completion_length": 50.975,
"epoch": 1.8147448015122873,
"format_reward": 0.0,
"grad_norm": 3.903553009033203,
"image_reward": 0.26880950927734376,
"kl": 63.095464119315146,
"learning_rate": 5e-06,
"loss": 0.001,
"reward": 1.0250155806541443,
"reward_std": 0.7393251709640026,
"rewards/reward_func": 1.0250155806541443,
"step": 7680,
"toxic_reward": 3.694820535182953
},
{
"clip_ratio": 0.0,
"completion_length": 43.15,
"epoch": 1.8171077504725899,
"format_reward": 0.0,
"grad_norm": 15.986948013305664,
"image_reward": 0.2573333740234375,
"kl": 3.8690971970558166,
"learning_rate": 5e-06,
"loss": 0.0365,
"reward": 0.9120604813098907,
"reward_std": 0.8725108332931996,
"rewards/reward_func": 0.9120604813098907,
"step": 7690,
"toxic_reward": 4.068342316150665
},
{
"clip_ratio": 0.0,
"completion_length": 46.65,
"epoch": 1.8194706994328924,
"format_reward": 0.0,
"grad_norm": 2.521322727203369,
"image_reward": 0.232586669921875,
"kl": 1.3404993683099746,
"learning_rate": 5e-06,
"loss": -0.0105,
"reward": 0.08191419243812562,
"reward_std": 0.6063120868057013,
"rewards/reward_func": 0.08191419243812562,
"step": 7700,
"toxic_reward": 4.285334658622742
},
{
"clip_ratio": 0.0,
"completion_length": 54.65,
"epoch": 1.8218336483931947,
"format_reward": 0.0,
"grad_norm": 10.563508033752441,
"image_reward": 0.251318359375,
"kl": 4.375722473859787,
"learning_rate": 5e-06,
"loss": 0.0574,
"reward": 0.7043181240558625,
"reward_std": 0.5366579249501229,
"rewards/reward_func": 0.7043181240558625,
"step": 7710,
"toxic_reward": 4.468820595741272
},
{
"clip_ratio": 0.0,
"completion_length": 47.175,
"epoch": 1.824196597353497,
"format_reward": -0.25,
"grad_norm": 5.306228160858154,
"image_reward": 0.25533854216337204,
"kl": 1.2862511157989502,
"learning_rate": 5e-06,
"loss": 0.0064,
"reward": 0.2715910017490387,
"reward_std": 1.3802445553243161,
"rewards/reward_func": 0.2715910017490387,
"step": 7720,
"toxic_reward": 4.128815948963165
},
{
"clip_ratio": 0.0,
"completion_length": 53.525,
"epoch": 1.8265595463137996,
"format_reward": 0.0,
"grad_norm": 4.18682336807251,
"image_reward": 0.22406005859375,
"kl": 13.262214809656143,
"learning_rate": 5e-06,
"loss": -0.0483,
"reward": 0.43178263306617737,
"reward_std": 0.5340902636758983,
"rewards/reward_func": 0.43178263306617737,
"step": 7730,
"toxic_reward": 4.167550274729729
},
{
"clip_ratio": 0.0,
"completion_length": 51.15,
"epoch": 1.8289224952741021,
"format_reward": 0.0,
"grad_norm": 16.534120559692383,
"image_reward": 0.2627288818359375,
"kl": 4.888235807418823,
"learning_rate": 5e-06,
"loss": 0.0231,
"reward": 0.46792620718479155,
"reward_std": 0.6471607919782401,
"rewards/reward_func": 0.46792620718479155,
"step": 7740,
"toxic_reward": 4.068465518951416
},
{
"clip_ratio": 0.0,
"completion_length": 45.225,
"epoch": 1.8312854442344046,
"format_reward": 0.0,
"grad_norm": 10.179228782653809,
"image_reward": 0.249444580078125,
"kl": 3.951664477586746,
"learning_rate": 5e-06,
"loss": -0.0106,
"reward": 1.0039419054985046,
"reward_std": 0.8490265306085348,
"rewards/reward_func": 1.0039419054985046,
"step": 7750,
"toxic_reward": 4.28922358751297
},
{
"clip_ratio": 0.0,
"completion_length": 43.0,
"epoch": 1.833648393194707,
"format_reward": 0.0,
"grad_norm": 1.3173015117645264,
"image_reward": 0.2619903564453125,
"kl": 3.221765196323395,
"learning_rate": 5e-06,
"loss": -0.0057,
"reward": 0.5499142289161683,
"reward_std": 0.8114865634590387,
"rewards/reward_func": 0.5499142289161683,
"step": 7760,
"toxic_reward": 4.202396821975708
},
{
"clip_ratio": 0.0,
"completion_length": 45.95,
"epoch": 1.8360113421550095,
"format_reward": -0.25,
"grad_norm": 9.477835655212402,
"image_reward": 0.275567626953125,
"kl": 7.138307851552963,
"learning_rate": 5e-06,
"loss": -0.0182,
"reward": 0.5576439201831818,
"reward_std": 1.718572654016316,
"rewards/reward_func": 0.5576439201831818,
"step": 7770,
"toxic_reward": 3.892837381362915
},
{
"clip_ratio": 0.0,
"completion_length": 55.325,
"epoch": 1.8383742911153118,
"format_reward": -0.25,
"grad_norm": 15.364556312561035,
"image_reward": 0.22845306396484374,
"kl": 5.752876976132393,
"learning_rate": 5e-06,
"loss": 0.0724,
"reward": 0.4619426131248474,
"reward_std": 0.994839246571064,
"rewards/reward_func": 0.4619426131248474,
"step": 7780,
"toxic_reward": 4.777106142044067
},
{
"clip_ratio": 0.0,
"completion_length": 62.75,
"epoch": 1.8407372400756143,
"format_reward": -0.25,
"grad_norm": 13.07127571105957,
"image_reward": 0.24772542268037795,
"kl": 2.2956355273723603,
"learning_rate": 5e-06,
"loss": -0.0286,
"reward": 0.23627470731735228,
"reward_std": 1.3513500357046724,
"rewards/reward_func": 0.23627470731735228,
"step": 7790,
"toxic_reward": 3.998799777030945
},
{
"clip_ratio": 0.0,
"completion_length": 54.675,
"epoch": 1.8431001890359169,
"format_reward": -0.25,
"grad_norm": 1.4357541799545288,
"image_reward": 0.2586761474609375,
"kl": 2.018620651960373,
"learning_rate": 5e-06,
"loss": 0.0136,
"reward": 0.5957891523838044,
"reward_std": 1.3981972932815552,
"rewards/reward_func": 0.5957891523838044,
"step": 7800,
"toxic_reward": 3.74977787733078
},
{
"clip_ratio": 0.0,
"completion_length": 50.9,
"epoch": 1.8454631379962194,
"format_reward": 0.0,
"grad_norm": 12.382879257202148,
"image_reward": 0.251861572265625,
"kl": 2.0946659803390504,
"learning_rate": 5e-06,
"loss": -0.0109,
"reward": 0.2852811634540558,
"reward_std": 0.7155913963913918,
"rewards/reward_func": 0.2852811634540558,
"step": 7810,
"toxic_reward": 4.4501420021057125
},
{
"clip_ratio": 0.0,
"completion_length": 48.125,
"epoch": 1.8478260869565217,
"format_reward": 0.0,
"grad_norm": 4.438508987426758,
"image_reward": 0.270166015625,
"kl": 1.658120059967041,
"learning_rate": 5e-06,
"loss": 0.0217,
"reward": 0.8978000760078431,
"reward_std": 1.2586904138326644,
"rewards/reward_func": 0.8978000760078431,
"step": 7820,
"toxic_reward": 4.126551675796509
},
{
"clip_ratio": 0.0,
"completion_length": 37.9,
"epoch": 1.850189035916824,
"format_reward": 0.0,
"grad_norm": 1.4302005767822266,
"image_reward": 0.2738861083984375,
"kl": 1.6736175537109375,
"learning_rate": 5e-06,
"loss": -0.1503,
"reward": 0.2234538435935974,
"reward_std": 0.7356585245579481,
"rewards/reward_func": 0.2234538435935974,
"step": 7830,
"toxic_reward": 3.9116656303405763
},
{
"clip_ratio": 0.0,
"completion_length": 46.125,
"epoch": 1.8525519848771266,
"format_reward": 0.0,
"grad_norm": 5.846213340759277,
"image_reward": 0.2658111572265625,
"kl": 1.36658373773098,
"learning_rate": 5e-06,
"loss": -0.0164,
"reward": -0.04418985247611999,
"reward_std": 0.827529611485079,
"rewards/reward_func": -0.04418985247611999,
"step": 7840,
"toxic_reward": 3.892432355880737
},
{
"clip_ratio": 0.0,
"completion_length": 51.475,
"epoch": 1.8549149338374291,
"format_reward": 0.0,
"grad_norm": 5.060561656951904,
"image_reward": 0.23461151123046875,
"kl": 0.5276786342263222,
"learning_rate": 5e-06,
"loss": -0.003,
"reward": 0.7852385342121124,
"reward_std": 0.9399228170514107,
"rewards/reward_func": 0.7852385342121124,
"step": 7850,
"toxic_reward": 3.641209203004837
},
{
"clip_ratio": 0.0,
"completion_length": 46.025,
"epoch": 1.8572778827977316,
"format_reward": 0.0,
"grad_norm": 2.505263566970825,
"image_reward": 0.259368896484375,
"kl": 1.0133032470941543,
"learning_rate": 5e-06,
"loss": 0.0581,
"reward": 0.8989585757255554,
"reward_std": 0.917613423243165,
"rewards/reward_func": 0.8989585757255554,
"step": 7860,
"toxic_reward": 4.13404905796051
},
{
"clip_ratio": 0.0,
"completion_length": 43.775,
"epoch": 1.8596408317580342,
"format_reward": 0.0,
"grad_norm": 37.166500091552734,
"image_reward": 0.25547637939453127,
"kl": 2.1411855638027193,
"learning_rate": 5e-06,
"loss": 0.1396,
"reward": 0.21031073927879335,
"reward_std": 0.6978237416595221,
"rewards/reward_func": 0.21031073927879335,
"step": 7870,
"toxic_reward": 4.086808681488037
},
{
"clip_ratio": 0.0,
"completion_length": 46.05,
"epoch": 1.8620037807183365,
"format_reward": 0.0,
"grad_norm": 1.7183008193969727,
"image_reward": 0.2409576416015625,
"kl": 0.6873624622821808,
"learning_rate": 5e-06,
"loss": 0.033,
"reward": 0.7567365884780883,
"reward_std": 0.95932078063488,
"rewards/reward_func": 0.7567365884780883,
"step": 7880,
"toxic_reward": 4.077835154533386
},
{
"clip_ratio": 0.0,
"completion_length": 47.95,
"epoch": 1.8643667296786388,
"format_reward": 0.0,
"grad_norm": 1.375571846961975,
"image_reward": 0.251434326171875,
"kl": 1.8803806602954865,
"learning_rate": 5e-06,
"loss": -0.0674,
"reward": 0.1468454658985138,
"reward_std": 0.7339655995368958,
"rewards/reward_func": 0.1468454658985138,
"step": 7890,
"toxic_reward": 4.2400289416313175
},
{
"clip_ratio": 0.0,
"completion_length": 38.625,
"epoch": 1.8667296786389413,
"format_reward": 0.0,
"grad_norm": 2.778831720352173,
"image_reward": 0.273419189453125,
"kl": 12.759307652711868,
"learning_rate": 5e-06,
"loss": 0.091,
"reward": 0.2764736473560333,
"reward_std": 0.6703889116644859,
"rewards/reward_func": 0.2764736473560333,
"step": 7900,
"toxic_reward": 4.633634448051453
},
{
"clip_ratio": 0.0,
"completion_length": 53.525,
"epoch": 1.8690926275992439,
"format_reward": 0.0,
"grad_norm": 14.088724136352539,
"image_reward": 0.23843803405761718,
"kl": 2.752323019504547,
"learning_rate": 5e-06,
"loss": -0.0523,
"reward": 0.44507230520248414,
"reward_std": 0.8451843298971653,
"rewards/reward_func": 0.44507230520248414,
"step": 7910,
"toxic_reward": 3.7819975137710573
},
{
"clip_ratio": 0.0,
"completion_length": 46.275,
"epoch": 1.8714555765595464,
"format_reward": 0.0,
"grad_norm": 12.696130752563477,
"image_reward": 0.25413665771484373,
"kl": 2.022602713108063,
"learning_rate": 5e-06,
"loss": 0.0137,
"reward": 0.6168730854988098,
"reward_std": 1.198334063589573,
"rewards/reward_func": 0.6168730854988098,
"step": 7920,
"toxic_reward": 3.9178677558898927
},
{
"clip_ratio": 0.0,
"completion_length": 51.75,
"epoch": 1.8738185255198487,
"format_reward": 0.0,
"grad_norm": 6.441836357116699,
"image_reward": 0.23479461669921875,
"kl": 2.0815513670444488,
"learning_rate": 5e-06,
"loss": 0.099,
"reward": 0.49921011328697207,
"reward_std": 0.8878588248044252,
"rewards/reward_func": 0.49921011328697207,
"step": 7930,
"toxic_reward": 4.21897873878479
},
{
"clip_ratio": 0.0,
"completion_length": 44.575,
"epoch": 1.8761814744801513,
"format_reward": -0.25,
"grad_norm": 17.865110397338867,
"image_reward": 0.2600982666015625,
"kl": 2.150428944826126,
"learning_rate": 5e-06,
"loss": -0.0837,
"reward": 0.48849809169769287,
"reward_std": 1.4677658422850073,
"rewards/reward_func": 0.48849809169769287,
"step": 7940,
"toxic_reward": 4.1003117799758915
},
{
"clip_ratio": 0.0,
"completion_length": 49.925,
"epoch": 1.8785444234404536,
"format_reward": 0.0,
"grad_norm": 6.697957992553711,
"image_reward": 0.25945892333984377,
"kl": 2.223458543419838,
"learning_rate": 5e-06,
"loss": 0.0562,
"reward": 0.20426468104124068,
"reward_std": 0.5012361383065581,
"rewards/reward_func": 0.20426468104124068,
"step": 7950,
"toxic_reward": 4.173866260051727
},
{
"clip_ratio": 0.0,
"completion_length": 54.075,
"epoch": 1.8809073724007561,
"format_reward": 0.0,
"grad_norm": 5.58077335357666,
"image_reward": 0.25069351196289064,
"kl": 8.21664493083954,
"learning_rate": 5e-06,
"loss": -0.0792,
"reward": 0.7380830064415932,
"reward_std": 1.2196707382798195,
"rewards/reward_func": 0.7380830064415932,
"step": 7960,
"toxic_reward": 3.5894944429397584
},
{
"clip_ratio": 0.0,
"completion_length": 43.475,
"epoch": 1.8832703213610587,
"format_reward": 0.0,
"grad_norm": 8.384510040283203,
"image_reward": 0.23923797607421876,
"kl": 21.914335840940474,
"learning_rate": 5e-06,
"loss": -0.0204,
"reward": 0.8768561869859696,
"reward_std": 0.7653445459902286,
"rewards/reward_func": 0.8768561869859696,
"step": 7970,
"toxic_reward": 3.7090541243553163
},
{
"clip_ratio": 0.0,
"completion_length": 54.075,
"epoch": 1.8856332703213612,
"format_reward": -0.25,
"grad_norm": 3.9271442890167236,
"image_reward": 0.2266026809811592,
"kl": 1.7329542875289916,
"learning_rate": 5e-06,
"loss": 0.0735,
"reward": 0.006925755739212036,
"reward_std": 1.2168598100543022,
"rewards/reward_func": 0.006925755739212036,
"step": 7980,
"toxic_reward": 4.598031067848206
},
{
"clip_ratio": 0.0,
"completion_length": 46.375,
"epoch": 1.8879962192816635,
"format_reward": 0.0,
"grad_norm": 17.941791534423828,
"image_reward": 0.24617818146944045,
"kl": 3.56347342133522,
"learning_rate": 5e-06,
"loss": -0.1121,
"reward": 0.15787817239761354,
"reward_std": 0.5696724381297826,
"rewards/reward_func": 0.15787817239761354,
"step": 7990,
"toxic_reward": 4.376770114898681
},
{
"clip_ratio": 0.0,
"completion_length": 40.675,
"epoch": 1.8903591682419658,
"format_reward": 0.0,
"grad_norm": 26.69174575805664,
"image_reward": 0.2819636031985283,
"kl": 7.143774968385697,
"learning_rate": 5e-06,
"loss": -0.0619,
"reward": 0.6548231065273284,
"reward_std": 0.8737724728882312,
"rewards/reward_func": 0.6548231065273284,
"step": 8000,
"toxic_reward": 4.582368350028991
},
{
"clip_ratio": 0.0,
"completion_length": 51.625,
"epoch": 1.8927221172022684,
"format_reward": 0.0,
"grad_norm": 7.003530025482178,
"image_reward": 0.261529541015625,
"kl": 9.618525552749634,
"learning_rate": 5e-06,
"loss": 0.051,
"reward": -0.017962449789047243,
"reward_std": 0.5481395080685616,
"rewards/reward_func": -0.017962449789047243,
"step": 8010,
"toxic_reward": 4.220958662033081
},
{
"clip_ratio": 0.0,
"completion_length": 42.3,
"epoch": 1.8950850661625709,
"format_reward": 0.0,
"grad_norm": 7.718620777130127,
"image_reward": 0.2414947509765625,
"kl": 4.341373115777969,
"learning_rate": 5e-06,
"loss": 0.0835,
"reward": 0.38290356993675234,
"reward_std": 0.9348091699182988,
"rewards/reward_func": 0.38290356993675234,
"step": 8020,
"toxic_reward": 4.002734637260437
},
{
"clip_ratio": 0.0,
"completion_length": 46.5,
"epoch": 1.8974480151228734,
"format_reward": 0.0,
"grad_norm": 10.61853313446045,
"image_reward": 0.2405120849609375,
"kl": 18.692966318130495,
"learning_rate": 5e-06,
"loss": 0.019,
"reward": 0.6024147510528565,
"reward_std": 0.8250786025077105,
"rewards/reward_func": 0.6024147510528565,
"step": 8030,
"toxic_reward": 4.069397926330566
},
{
"clip_ratio": 0.0,
"completion_length": 53.95,
"epoch": 1.899810964083176,
"format_reward": -0.5,
"grad_norm": 4.87439489364624,
"image_reward": 0.26594645231962205,
"kl": 4.881083369255066,
"learning_rate": 5e-06,
"loss": 0.0141,
"reward": 0.1986172914505005,
"reward_std": 1.8204052031040192,
"rewards/reward_func": 0.1986172914505005,
"step": 8040,
"toxic_reward": 4.146627187728882
},
{
"clip_ratio": 0.0,
"completion_length": 46.075,
"epoch": 1.9021739130434783,
"format_reward": 0.0,
"grad_norm": 19.0607852935791,
"image_reward": 0.2584747314453125,
"kl": 13.449040079116822,
"learning_rate": 5e-06,
"loss": -0.0336,
"reward": 0.09852480292320251,
"reward_std": 0.37513242168352007,
"rewards/reward_func": 0.09852480292320251,
"step": 8050,
"toxic_reward": 4.5937717914581295
},
{
"clip_ratio": 0.0,
"completion_length": 45.125,
"epoch": 1.9045368620037806,
"format_reward": 0.0,
"grad_norm": 4.807636260986328,
"image_reward": 0.238970947265625,
"kl": 9.84277012348175,
"learning_rate": 5e-06,
"loss": 0.0099,
"reward": 0.7841103792190551,
"reward_std": 0.931809046678245,
"rewards/reward_func": 0.7841103792190551,
"step": 8060,
"toxic_reward": 4.410308980941773
},
{
"clip_ratio": 0.0,
"completion_length": 47.575,
"epoch": 1.9068998109640831,
"format_reward": 0.0,
"grad_norm": 30.570436477661133,
"image_reward": 0.2677642822265625,
"kl": 11.538963747024535,
"learning_rate": 5e-06,
"loss": 0.0571,
"reward": 0.7513397336006165,
"reward_std": 0.6926180317997932,
"rewards/reward_func": 0.7513397336006165,
"step": 8070,
"toxic_reward": 4.325729882717132
},
{
"clip_ratio": 0.0,
"completion_length": 49.925,
"epoch": 1.9092627599243857,
"format_reward": 0.0,
"grad_norm": 3.387159824371338,
"image_reward": 0.2416534423828125,
"kl": 36.42685050964356,
"learning_rate": 5e-06,
"loss": -0.0984,
"reward": 1.0627863883972168,
"reward_std": 0.9809991672635079,
"rewards/reward_func": 1.0627863883972168,
"step": 8080,
"toxic_reward": 4.355536723136902
},
{
"clip_ratio": 0.0,
"completion_length": 51.875,
"epoch": 1.9116257088846882,
"format_reward": 0.0,
"grad_norm": 8.823395729064941,
"image_reward": 0.2377349853515625,
"kl": 14.548263192176819,
"learning_rate": 5e-06,
"loss": -0.0671,
"reward": 0.2453417807817459,
"reward_std": 0.8620891466736793,
"rewards/reward_func": 0.2453417807817459,
"step": 8090,
"toxic_reward": 4.175139570236206
},
{
"clip_ratio": 0.0,
"completion_length": 53.7,
"epoch": 1.9139886578449905,
"format_reward": -0.25,
"grad_norm": 3.934446334838867,
"image_reward": 0.2485321044921875,
"kl": 476.8398398399353,
"learning_rate": 5e-06,
"loss": 0.1697,
"reward": -0.03726454377174378,
"reward_std": 1.2227270498871803,
"rewards/reward_func": -0.03726454377174378,
"step": 8100,
"toxic_reward": 4.573867344856263
},
{
"clip_ratio": 0.0,
"completion_length": 47.8,
"epoch": 1.916351606805293,
"format_reward": 0.0,
"grad_norm": 11.159741401672363,
"image_reward": 0.2699676513671875,
"kl": 5.77522222995758,
"learning_rate": 5e-06,
"loss": -0.1138,
"reward": 0.4612575590610504,
"reward_std": 0.5476422467269003,
"rewards/reward_func": 0.4612575590610504,
"step": 8110,
"toxic_reward": 4.567614626884461
},
{
"clip_ratio": 0.0,
"completion_length": 48.575,
"epoch": 1.9187145557655954,
"format_reward": 0.0,
"grad_norm": 11.536759376525879,
"image_reward": 0.24012298583984376,
"kl": 6.318757677078247,
"learning_rate": 5e-06,
"loss": 0.0103,
"reward": 0.9109591245651245,
"reward_std": 1.29407604560256,
"rewards/reward_func": 0.9109591245651245,
"step": 8120,
"toxic_reward": 4.05263090133667
},
{
"clip_ratio": 0.0,
"completion_length": 58.575,
"epoch": 1.9210775047258979,
"format_reward": 0.0,
"grad_norm": 6.833136558532715,
"image_reward": 0.2688323974609375,
"kl": 6.9808355331420895,
"learning_rate": 5e-06,
"loss": 0.0913,
"reward": 0.9232870817184449,
"reward_std": 0.8357461627572775,
"rewards/reward_func": 0.9232870817184449,
"step": 8130,
"toxic_reward": 4.430827951431274
},
{
"clip_ratio": 0.0,
"completion_length": 50.9,
"epoch": 1.9234404536862004,
"format_reward": 0.0,
"grad_norm": 14.8239164352417,
"image_reward": 0.2474365234375,
"kl": 137.28185538053512,
"learning_rate": 5e-06,
"loss": 0.0189,
"reward": 0.401202654838562,
"reward_std": 0.4000473257154226,
"rewards/reward_func": 0.401202654838562,
"step": 8140,
"toxic_reward": 4.723700523376465
},
{
"clip_ratio": 0.0,
"completion_length": 45.175,
"epoch": 1.925803402646503,
"format_reward": 0.0,
"grad_norm": 1.823515772819519,
"image_reward": 0.22316131591796876,
"kl": 14.130688643455505,
"learning_rate": 5e-06,
"loss": 0.0239,
"reward": 1.057025855779648,
"reward_std": 0.9014536026865244,
"rewards/reward_func": 1.057025855779648,
"step": 8150,
"toxic_reward": 4.387946319580078
},
{
"clip_ratio": 0.0,
"completion_length": 54.875,
"epoch": 1.9281663516068053,
"format_reward": -0.25,
"grad_norm": 46.756038665771484,
"image_reward": 0.26631062775850295,
"kl": 6.435283923149109,
"learning_rate": 5e-06,
"loss": -0.0328,
"reward": 0.22599496245384215,
"reward_std": 1.4984263110905887,
"rewards/reward_func": 0.22599496245384215,
"step": 8160,
"toxic_reward": 4.138309001922607
},
{
"clip_ratio": 0.0,
"completion_length": 53.45,
"epoch": 1.9305293005671076,
"format_reward": 0.0,
"grad_norm": 34.66867446899414,
"image_reward": 0.25106658935546877,
"kl": 1020.1139773368835,
"learning_rate": 5e-06,
"loss": 0.0999,
"reward": 0.7446302771568298,
"reward_std": 0.906285472586751,
"rewards/reward_func": 0.7446302771568298,
"step": 8170,
"toxic_reward": 4.375624704360962
},
{
"clip_ratio": 0.0,
"completion_length": 49.475,
"epoch": 1.9328922495274101,
"format_reward": 0.0,
"grad_norm": 2.081218957901001,
"image_reward": 0.242840576171875,
"kl": 3.102721667289734,
"learning_rate": 5e-06,
"loss": 0.0571,
"reward": 0.5706271648406982,
"reward_std": 0.9108416954986751,
"rewards/reward_func": 0.5706271648406982,
"step": 8180,
"toxic_reward": 3.2474088430404664
},
{
"clip_ratio": 0.0,
"completion_length": 40.475,
"epoch": 1.9352551984877127,
"format_reward": -0.25,
"grad_norm": 13.313660621643066,
"image_reward": 0.27943929135799406,
"kl": 9.811500716209412,
"learning_rate": 5e-06,
"loss": 0.0091,
"reward": -0.0842776358127594,
"reward_std": 1.1166115825995804,
"rewards/reward_func": -0.0842776358127594,
"step": 8190,
"toxic_reward": 4.493260765075684
},
{
"clip_ratio": 0.0,
"completion_length": 49.225,
"epoch": 1.9376181474480152,
"format_reward": 0.0,
"grad_norm": 11.93384838104248,
"image_reward": 0.2549346923828125,
"kl": 13.695673048496246,
"learning_rate": 5e-06,
"loss": -0.0083,
"reward": 0.5832914412021637,
"reward_std": 0.7408401468303054,
"rewards/reward_func": 0.5832914412021637,
"step": 8200,
"toxic_reward": 4.143073153495789
},
{
"clip_ratio": 0.0,
"completion_length": 44.125,
"epoch": 1.9399810964083177,
"format_reward": 0.0,
"grad_norm": 6.53907585144043,
"image_reward": 0.2403350830078125,
"kl": 6.522427618503571,
"learning_rate": 5e-06,
"loss": 0.1325,
"reward": 0.1342033863067627,
"reward_std": 0.7933921405114234,
"rewards/reward_func": 0.1342033863067627,
"step": 8210,
"toxic_reward": 4.601714444160462
},
{
"clip_ratio": 0.0,
"completion_length": 44.95,
"epoch": 1.94234404536862,
"format_reward": 0.0,
"grad_norm": 23.774093627929688,
"image_reward": 0.25664520263671875,
"kl": 5.8061746001243595,
"learning_rate": 5e-06,
"loss": -0.1029,
"reward": 0.6099749207496643,
"reward_std": 1.0578389540314674,
"rewards/reward_func": 0.6099749207496643,
"step": 8220,
"toxic_reward": 3.542074370384216
},
{
"clip_ratio": 0.0,
"completion_length": 44.525,
"epoch": 1.9447069943289224,
"format_reward": 0.0,
"grad_norm": 19.021333694458008,
"image_reward": 0.25049285888671874,
"kl": 4.400176310539246,
"learning_rate": 5e-06,
"loss": -0.0805,
"reward": 0.271647572517395,
"reward_std": 0.8572761943563819,
"rewards/reward_func": 0.271647572517395,
"step": 8230,
"toxic_reward": 4.576322746276856
},
{
"clip_ratio": 0.0,
"completion_length": 48.4,
"epoch": 1.947069943289225,
"format_reward": -0.25,
"grad_norm": 12.740744590759277,
"image_reward": 0.2639495849609375,
"kl": 53.72892454862595,
"learning_rate": 5e-06,
"loss": 0.0517,
"reward": 0.4684752345085144,
"reward_std": 1.5598361855372787,
"rewards/reward_func": 0.4684752345085144,
"step": 8240,
"toxic_reward": 4.280627131462097
},
{
"clip_ratio": 0.0,
"completion_length": 42.05,
"epoch": 1.9494328922495274,
"format_reward": 0.0,
"grad_norm": 8.727499961853027,
"image_reward": 0.273443603515625,
"kl": 9.401781392097472,
"learning_rate": 5e-06,
"loss": -0.0578,
"reward": 0.6043965280056,
"reward_std": 0.7762668525800109,
"rewards/reward_func": 0.6043965280056,
"step": 8250,
"toxic_reward": 4.007175719738006
},
{
"clip_ratio": 0.0,
"completion_length": 39.175,
"epoch": 1.95179584120983,
"format_reward": -0.25,
"grad_norm": 21.95665740966797,
"image_reward": 0.2783833831548691,
"kl": 6.502747631072998,
"learning_rate": 5e-06,
"loss": -0.1757,
"reward": 0.7646288216114044,
"reward_std": 1.2125793328508734,
"rewards/reward_func": 0.7646288216114044,
"step": 8260,
"toxic_reward": 4.438870096206665
},
{
"clip_ratio": 0.0,
"completion_length": 47.375,
"epoch": 1.9541587901701323,
"format_reward": 0.0,
"grad_norm": 19.78591537475586,
"image_reward": 0.2660888671875,
"kl": 101.96959731578826,
"learning_rate": 5e-06,
"loss": 0.0584,
"reward": 0.8457072794437408,
"reward_std": 0.8602423138916493,
"rewards/reward_func": 0.8457072794437408,
"step": 8270,
"toxic_reward": 4.274328458309173
},
{
"clip_ratio": 0.0,
"completion_length": 49.325,
"epoch": 1.9565217391304348,
"format_reward": -0.25,
"grad_norm": 7.575157642364502,
"image_reward": 0.26166178435087206,
"kl": 7.8605184674263,
"learning_rate": 5e-06,
"loss": -0.0592,
"reward": 0.7780414521694183,
"reward_std": 1.34521058909595,
"rewards/reward_func": 0.7780414521694183,
"step": 8280,
"toxic_reward": 4.395621502399445
},
{
"clip_ratio": 0.0,
"completion_length": 45.0,
"epoch": 1.9588846880907371,
"format_reward": 0.0,
"grad_norm": 13.91838550567627,
"image_reward": 0.2494293212890625,
"kl": 3.4681380152702332,
"learning_rate": 5e-06,
"loss": -0.0285,
"reward": 1.0126874148845673,
"reward_std": 0.884580178745091,
"rewards/reward_func": 1.0126874148845673,
"step": 8290,
"toxic_reward": 4.259213161468506
},
{
"clip_ratio": 0.0,
"completion_length": 39.275,
"epoch": 1.9612476370510397,
"format_reward": -0.5,
"grad_norm": 11.346104621887207,
"image_reward": 0.24504598081111909,
"kl": 17.73236060142517,
"learning_rate": 5e-06,
"loss": -0.0501,
"reward": -0.41139370799064634,
"reward_std": 1.535068777576089,
"rewards/reward_func": -0.41139370799064634,
"step": 8300,
"toxic_reward": 4.184125363826752
},
{
"clip_ratio": 0.0,
"completion_length": 47.1,
"epoch": 1.9636105860113422,
"format_reward": 0.0,
"grad_norm": 7.8980631828308105,
"image_reward": 0.2494049072265625,
"kl": 1.3982277452945708,
"learning_rate": 5e-06,
"loss": -0.0632,
"reward": 0.7493218898773193,
"reward_std": 0.7001253291964531,
"rewards/reward_func": 0.7493218898773193,
"step": 8310,
"toxic_reward": 4.593434143066406
},
{
"clip_ratio": 0.0,
"completion_length": 46.025,
"epoch": 1.9659735349716447,
"format_reward": 0.0,
"grad_norm": 1.629384994506836,
"image_reward": 0.2574554443359375,
"kl": 9.406988048553467,
"learning_rate": 5e-06,
"loss": 0.0561,
"reward": 0.6752925157546997,
"reward_std": 1.2529858350753784,
"rewards/reward_func": 0.6752925157546997,
"step": 8320,
"toxic_reward": 3.6643527030944822
},
{
"clip_ratio": 0.0,
"completion_length": 42.725,
"epoch": 1.968336483931947,
"format_reward": 0.0,
"grad_norm": 6.693783283233643,
"image_reward": 0.24935455322265626,
"kl": 4.6708708822727205,
"learning_rate": 5e-06,
"loss": -0.0369,
"reward": 1.2317909479141236,
"reward_std": 1.4201693460345268,
"rewards/reward_func": 1.2317909479141236,
"step": 8330,
"toxic_reward": 3.705500102043152
},
{
"clip_ratio": 0.0,
"completion_length": 43.675,
"epoch": 1.9706994328922496,
"format_reward": 0.0,
"grad_norm": 13.678855895996094,
"image_reward": 0.26453857421875,
"kl": 1.7596666514873505,
"learning_rate": 5e-06,
"loss": 0.0084,
"reward": 0.6438661813735962,
"reward_std": 0.5453263748437166,
"rewards/reward_func": 0.6438661813735962,
"step": 8340,
"toxic_reward": 4.577846193313599
},
{
"clip_ratio": 0.0,
"completion_length": 44.55,
"epoch": 1.973062381852552,
"format_reward": 0.0,
"grad_norm": 5.530174255371094,
"image_reward": 0.2434234619140625,
"kl": 16.00339319705963,
"learning_rate": 5e-06,
"loss": 0.0858,
"reward": 0.7399854481220245,
"reward_std": 0.5954274158924818,
"rewards/reward_func": 0.7399854481220245,
"step": 8350,
"toxic_reward": 4.567293620109558
},
{
"clip_ratio": 0.0,
"completion_length": 44.8,
"epoch": 1.9754253308128544,
"format_reward": -0.25,
"grad_norm": 23.65260124206543,
"image_reward": 0.2397003173828125,
"kl": 413.27391294240954,
"learning_rate": 5e-06,
"loss": -0.0452,
"reward": 0.21110110878944396,
"reward_std": 1.2717279449105263,
"rewards/reward_func": 0.21110110878944396,
"step": 8360,
"toxic_reward": 4.29474036693573
},
{
"clip_ratio": 0.0,
"completion_length": 58.15,
"epoch": 1.977788279773157,
"format_reward": 0.0,
"grad_norm": 8.489328384399414,
"image_reward": 0.2703460693359375,
"kl": 11.292006134986877,
"learning_rate": 5e-06,
"loss": -0.0396,
"reward": 0.522923594713211,
"reward_std": 0.6722989223897458,
"rewards/reward_func": 0.522923594713211,
"step": 8370,
"toxic_reward": 4.362267994880677
},
{
"clip_ratio": 0.0,
"completion_length": 53.7,
"epoch": 1.9801512287334595,
"format_reward": 0.0,
"grad_norm": 14.112800598144531,
"image_reward": 0.2404998779296875,
"kl": 5.943003642559051,
"learning_rate": 5e-06,
"loss": 0.0066,
"reward": 1.046756339073181,
"reward_std": 1.401267148554325,
"rewards/reward_func": 1.046756339073181,
"step": 8380,
"toxic_reward": 4.379712152481079
},
{
"clip_ratio": 0.0,
"completion_length": 45.075,
"epoch": 1.9825141776937618,
"format_reward": 0.0,
"grad_norm": 15.351452827453613,
"image_reward": 0.2679229736328125,
"kl": 2.1231451511383055,
"learning_rate": 5e-06,
"loss": -0.0874,
"reward": 0.044296592473983765,
"reward_std": 0.7907688375562429,
"rewards/reward_func": 0.044296592473983765,
"step": 8390,
"toxic_reward": 4.44194188117981
},
{
"clip_ratio": 0.0,
"completion_length": 52.8,
"epoch": 1.9848771266540641,
"format_reward": 0.0,
"grad_norm": 14.493269920349121,
"image_reward": 0.23359222412109376,
"kl": 15.598973235487938,
"learning_rate": 5e-06,
"loss": 0.0523,
"reward": 0.6035852313041687,
"reward_std": 0.7898097388446331,
"rewards/reward_func": 0.6035852313041687,
"step": 8400,
"toxic_reward": 4.0595218420028685
},
{
"clip_ratio": 0.0,
"completion_length": 49.025,
"epoch": 1.9872400756143667,
"format_reward": 0.0,
"grad_norm": 2.004755735397339,
"image_reward": 0.23458099365234375,
"kl": 13.407473123073578,
"learning_rate": 5e-06,
"loss": 0.0054,
"reward": 0.5494411200284958,
"reward_std": 0.5586541540920734,
"rewards/reward_func": 0.5494411200284958,
"step": 8410,
"toxic_reward": 4.175926774740219
},
{
"clip_ratio": 0.0,
"completion_length": 50.325,
"epoch": 1.9896030245746692,
"format_reward": 0.0,
"grad_norm": 9.598527908325195,
"image_reward": 0.301202392578125,
"kl": 4.9204403221607205,
"learning_rate": 5e-06,
"loss": -0.001,
"reward": 0.4649462789297104,
"reward_std": 0.7171205889433623,
"rewards/reward_func": 0.4649462789297104,
"step": 8420,
"toxic_reward": 3.910860872268677
},
{
"clip_ratio": 0.0,
"completion_length": 57.825,
"epoch": 1.9919659735349717,
"format_reward": 0.0,
"grad_norm": 2.607607841491699,
"image_reward": 0.2745025634765625,
"kl": 9.545298218727112,
"learning_rate": 5e-06,
"loss": 0.0779,
"reward": 0.43806184232234957,
"reward_std": 0.8561135273426771,
"rewards/reward_func": 0.43806184232234957,
"step": 8430,
"toxic_reward": 4.119659066200256
},
{
"clip_ratio": 0.0,
"completion_length": 42.8,
"epoch": 1.994328922495274,
"format_reward": -0.25,
"grad_norm": 1.9090756177902222,
"image_reward": 0.264605712890625,
"kl": 1.142916288971901,
"learning_rate": 5e-06,
"loss": -0.0035,
"reward": -0.06725225448608399,
"reward_std": 1.1679431475698947,
"rewards/reward_func": -0.06725225448608399,
"step": 8440,
"toxic_reward": 4.506508493423462
},
{
"clip_ratio": 0.0,
"completion_length": 45.625,
"epoch": 1.9966918714555766,
"format_reward": -0.25,
"grad_norm": 1.924688458442688,
"image_reward": 0.247418212890625,
"kl": 1.739441803097725,
"learning_rate": 5e-06,
"loss": -0.0847,
"reward": 0.2795759916305542,
"reward_std": 1.532812624052167,
"rewards/reward_func": 0.2795759916305542,
"step": 8450,
"toxic_reward": 3.7154327273368835
},
{
"clip_ratio": 0.0,
"completion_length": 50.075,
"epoch": 1.999054820415879,
"format_reward": 0.0,
"grad_norm": 3.183807373046875,
"image_reward": 0.259228515625,
"kl": 1.071340024471283,
"learning_rate": 5e-06,
"loss": 0.0299,
"reward": 1.3993828475475312,
"reward_std": 1.1979968290776015,
"rewards/reward_func": 1.3993828475475312,
"step": 8460,
"toxic_reward": 4.236328482627869
},
{
"clip_ratio": 0.0,
"completion_length": 47.325,
"epoch": 2.0014177693761814,
"format_reward": 0.0,
"grad_norm": 7.500320911407471,
"image_reward": 0.2599090576171875,
"kl": 1.2782041728496552,
"learning_rate": 5e-06,
"loss": 0.046,
"reward": 1.2368434906005858,
"reward_std": 1.188733378984034,
"rewards/reward_func": 1.2368434906005858,
"step": 8470,
"toxic_reward": 3.8694416284561157
},
{
"clip_ratio": 0.0,
"completion_length": 42.575,
"epoch": 2.003780718336484,
"format_reward": 0.0,
"grad_norm": 3.4954817295074463,
"image_reward": 0.25406494140625,
"kl": 2.6761809453368186,
"learning_rate": 5e-06,
"loss": -0.0561,
"reward": 0.3607616722583771,
"reward_std": 0.599818766117096,
"rewards/reward_func": 0.3607616722583771,
"step": 8480,
"toxic_reward": 4.048572421073914
},
{
"clip_ratio": 0.0,
"completion_length": 54.05,
"epoch": 2.0061436672967865,
"format_reward": 0.0,
"grad_norm": 5.18286657333374,
"image_reward": 0.22822036743164062,
"kl": 2.461097413301468,
"learning_rate": 5e-06,
"loss": -0.0261,
"reward": 0.2195432722568512,
"reward_std": 0.7936036609113216,
"rewards/reward_func": 0.2195432722568512,
"step": 8490,
"toxic_reward": 4.110178589820862
},
{
"clip_ratio": 0.0,
"completion_length": 41.9,
"epoch": 2.008506616257089,
"format_reward": -0.75,
"grad_norm": 2.6953821182250977,
"image_reward": 0.238427734375,
"kl": 1.0251432090997696,
"learning_rate": 5e-06,
"loss": -0.0027,
"reward": -0.4569409370422363,
"reward_std": 1.0821652268990873,
"rewards/reward_func": -0.4569409370422363,
"step": 8500,
"toxic_reward": 4.185848736763001
},
{
"clip_ratio": 0.0,
"completion_length": 52.4,
"epoch": 2.010869565217391,
"format_reward": 0.0,
"grad_norm": 6.174482822418213,
"image_reward": 0.245025634765625,
"kl": 570.9768789380789,
"learning_rate": 5e-06,
"loss": 0.0155,
"reward": 0.634968101978302,
"reward_std": 0.5698891028761863,
"rewards/reward_func": 0.634968101978302,
"step": 8510,
"toxic_reward": 4.557809638977051
},
{
"clip_ratio": 0.0,
"completion_length": 37.525,
"epoch": 2.0132325141776937,
"format_reward": 0.0,
"grad_norm": 12.716261863708496,
"image_reward": 0.2716217041015625,
"kl": 1.0744814962148665,
"learning_rate": 5e-06,
"loss": 0.0371,
"reward": 0.8971363306045532,
"reward_std": 1.0540940549224616,
"rewards/reward_func": 0.8971363306045532,
"step": 8520,
"toxic_reward": 4.03425624370575
},
{
"clip_ratio": 0.0,
"completion_length": 47.725,
"epoch": 2.015595463137996,
"format_reward": 0.0,
"grad_norm": 11.573805809020996,
"image_reward": 0.25701904296875,
"kl": 1.1612621247768402,
"learning_rate": 5e-06,
"loss": -0.0256,
"reward": 0.4974235534667969,
"reward_std": 0.7099893309175969,
"rewards/reward_func": 0.4974235534667969,
"step": 8530,
"toxic_reward": 4.757012367248535
},
{
"clip_ratio": 0.0,
"completion_length": 50.325,
"epoch": 2.0179584120982987,
"format_reward": 0.0,
"grad_norm": 2.5829172134399414,
"image_reward": 0.23183441162109375,
"kl": 1.0122918039560318,
"learning_rate": 5e-06,
"loss": -0.0224,
"reward": 0.5489160656929016,
"reward_std": 0.4481811560690403,
"rewards/reward_func": 0.5489160656929016,
"step": 8540,
"toxic_reward": 4.330148541927338
},
{
"clip_ratio": 0.0,
"completion_length": 45.6,
"epoch": 2.0203213610586013,
"format_reward": -0.25,
"grad_norm": 8.287252426147461,
"image_reward": 0.2674835205078125,
"kl": 1.138858178257942,
"learning_rate": 5e-06,
"loss": 0.0238,
"reward": -0.09747375845909119,
"reward_std": 0.8301142632961274,
"rewards/reward_func": -0.09747375845909119,
"step": 8550,
"toxic_reward": 4.7045900344848635
},
{
"clip_ratio": 0.0,
"completion_length": 49.65,
"epoch": 2.022684310018904,
"format_reward": 0.0,
"grad_norm": 9.928176879882812,
"image_reward": 0.2459930419921875,
"kl": 1.701068675518036,
"learning_rate": 5e-06,
"loss": -0.0446,
"reward": 0.5473175823688508,
"reward_std": 0.7223521884530782,
"rewards/reward_func": 0.5473175823688508,
"step": 8560,
"toxic_reward": 4.571657824516296
},
{
"clip_ratio": 0.0,
"completion_length": 41.725,
"epoch": 2.025047258979206,
"format_reward": -0.25,
"grad_norm": 5.9600677490234375,
"image_reward": 0.26257222443819045,
"kl": 2.5904053121805193,
"learning_rate": 5e-06,
"loss": 0.0163,
"reward": -0.20251348614692688,
"reward_std": 0.9303808398544788,
"rewards/reward_func": -0.20251348614692688,
"step": 8570,
"toxic_reward": 4.581225419044495
},
{
"clip_ratio": 0.0,
"completion_length": 45.55,
"epoch": 2.0274102079395084,
"format_reward": 0.0,
"grad_norm": 3.309791088104248,
"image_reward": 0.23095855712890626,
"kl": 1.5135916233062745,
"learning_rate": 5e-06,
"loss": -0.0068,
"reward": 0.21151033639907837,
"reward_std": 0.7603108703624457,
"rewards/reward_func": 0.21151033639907837,
"step": 8580,
"toxic_reward": 4.328943312168121
},
{
"clip_ratio": 0.0,
"completion_length": 47.5,
"epoch": 2.029773156899811,
"format_reward": 0.0,
"grad_norm": 8.408251762390137,
"image_reward": 0.2651763916015625,
"kl": 0.6560351371765136,
"learning_rate": 5e-06,
"loss": -0.0997,
"reward": 0.012960964441299438,
"reward_std": 0.35295800119638443,
"rewards/reward_func": 0.012960964441299438,
"step": 8590,
"toxic_reward": 4.5852957487106325
},
{
"clip_ratio": 0.0,
"completion_length": 45.425,
"epoch": 2.0321361058601135,
"format_reward": 0.0,
"grad_norm": 10.234503746032715,
"image_reward": 0.242572021484375,
"kl": 1.6390519708395004,
"learning_rate": 5e-06,
"loss": -0.0652,
"reward": 0.49767774939537046,
"reward_std": 0.8766103692352771,
"rewards/reward_func": 0.49767774939537046,
"step": 8600,
"toxic_reward": 4.273276591300965
},
{
"clip_ratio": 0.0,
"completion_length": 55.525,
"epoch": 2.034499054820416,
"format_reward": 0.0,
"grad_norm": 31.749767303466797,
"image_reward": 0.24173736572265625,
"kl": 1.4096274197101593,
"learning_rate": 5e-06,
"loss": -0.0081,
"reward": 0.4087996184825897,
"reward_std": 0.9022964790463448,
"rewards/reward_func": 0.4087996184825897,
"step": 8610,
"toxic_reward": 4.324217915534973
},
{
"clip_ratio": 0.0,
"completion_length": 54.825,
"epoch": 2.036862003780718,
"format_reward": -0.25,
"grad_norm": 12.874961853027344,
"image_reward": 0.2652323395013809,
"kl": 1.0484755635261536,
"learning_rate": 5e-06,
"loss": 0.027,
"reward": -0.2117618590593338,
"reward_std": 1.249949687719345,
"rewards/reward_func": -0.2117618590593338,
"step": 8620,
"toxic_reward": 3.5184057116508485
},
{
"clip_ratio": 0.0,
"completion_length": 46.375,
"epoch": 2.0392249527410207,
"format_reward": 0.0,
"grad_norm": 1.5234144926071167,
"image_reward": 0.2357269287109375,
"kl": 1.2848842471837998,
"learning_rate": 5e-06,
"loss": 0.0752,
"reward": 0.716532975435257,
"reward_std": 0.89201683960855,
"rewards/reward_func": 0.716532975435257,
"step": 8630,
"toxic_reward": 4.4441750049591064
},
{
"clip_ratio": 0.0,
"completion_length": 56.125,
"epoch": 2.041587901701323,
"format_reward": 0.0,
"grad_norm": 1.1807531118392944,
"image_reward": 0.25293731689453125,
"kl": 2.8808428183197976,
"learning_rate": 5e-06,
"loss": 0.023,
"reward": 0.9091297924518585,
"reward_std": 0.7464996237307787,
"rewards/reward_func": 0.9091297924518585,
"step": 8640,
"toxic_reward": 4.224450874328613
},
{
"clip_ratio": 0.0,
"completion_length": 40.15,
"epoch": 2.0439508506616257,
"format_reward": 0.0,
"grad_norm": 10.838650703430176,
"image_reward": 0.23937225341796875,
"kl": 2.5997998148202894,
"learning_rate": 5e-06,
"loss": -0.0419,
"reward": 0.3074700653553009,
"reward_std": 0.8474891871213913,
"rewards/reward_func": 0.3074700653553009,
"step": 8650,
"toxic_reward": 4.419002604484558
},
{
"clip_ratio": 0.0,
"completion_length": 40.3,
"epoch": 2.0463137996219283,
"format_reward": 0.0,
"grad_norm": 2.063800811767578,
"image_reward": 0.2351318359375,
"kl": 2.429117688536644,
"learning_rate": 5e-06,
"loss": 0.0507,
"reward": 0.464035177230835,
"reward_std": 0.8178490117192269,
"rewards/reward_func": 0.464035177230835,
"step": 8660,
"toxic_reward": 4.122921991348266
},
{
"clip_ratio": 0.0,
"completion_length": 50.725,
"epoch": 2.048676748582231,
"format_reward": -0.25,
"grad_norm": 5.179421424865723,
"image_reward": 0.2340398147702217,
"kl": 1.4141836494207383,
"learning_rate": 5e-06,
"loss": -0.0321,
"reward": 0.2651833713054657,
"reward_std": 1.3055690463632346,
"rewards/reward_func": 0.2651833713054657,
"step": 8670,
"toxic_reward": 4.099702596664429
},
{
"clip_ratio": 0.0,
"completion_length": 49.0,
"epoch": 2.051039697542533,
"format_reward": 0.0,
"grad_norm": 5.729818344116211,
"image_reward": 0.22962646484375,
"kl": 1.0476927325129508,
"learning_rate": 5e-06,
"loss": 0.0396,
"reward": 0.5858624681830407,
"reward_std": 0.8647454358637333,
"rewards/reward_func": 0.5858624681830407,
"step": 8680,
"toxic_reward": 3.923676002025604
},
{
"clip_ratio": 0.0,
"completion_length": 43.575,
"epoch": 2.0534026465028354,
"format_reward": -0.25,
"grad_norm": 33.29255294799805,
"image_reward": 0.23504893034696578,
"kl": 0.5507122159004212,
"learning_rate": 5e-06,
"loss": 0.0042,
"reward": 0.5358553946018219,
"reward_std": 1.371009534597397,
"rewards/reward_func": 0.5358553946018219,
"step": 8690,
"toxic_reward": 4.1810872793197635
},
{
"clip_ratio": 0.0,
"completion_length": 44.425,
"epoch": 2.055765595463138,
"format_reward": 0.0,
"grad_norm": 9.838844299316406,
"image_reward": 0.2368377685546875,
"kl": 0.8958913296461105,
"learning_rate": 5e-06,
"loss": -0.0191,
"reward": 0.8707101225852967,
"reward_std": 1.2157209530472755,
"rewards/reward_func": 0.8707101225852967,
"step": 8700,
"toxic_reward": 3.7892824172973634
},
{
"clip_ratio": 0.0,
"completion_length": 50.275,
"epoch": 2.0581285444234405,
"format_reward": 0.0,
"grad_norm": 18.979665756225586,
"image_reward": 0.259954833984375,
"kl": 1.8286799043416977,
"learning_rate": 5e-06,
"loss": -0.0646,
"reward": 0.3829235196113586,
"reward_std": 0.9690108880400657,
"rewards/reward_func": 0.3829235196113586,
"step": 8710,
"toxic_reward": 4.264838469028473
},
{
"clip_ratio": 0.0,
"completion_length": 50.35,
"epoch": 2.060491493383743,
"format_reward": 0.0,
"grad_norm": 6.838248252868652,
"image_reward": 0.22672042846679688,
"kl": 0.7338828861713409,
"learning_rate": 5e-06,
"loss": 0.0185,
"reward": 0.827715927362442,
"reward_std": 0.9109129812568426,
"rewards/reward_func": 0.827715927362442,
"step": 8720,
"toxic_reward": 4.2410869836807255
},
{
"clip_ratio": 0.0,
"completion_length": 48.725,
"epoch": 2.0628544423440456,
"format_reward": 0.0,
"grad_norm": 4.239810943603516,
"image_reward": 0.25128173828125,
"kl": 5.776644492149353,
"learning_rate": 5e-06,
"loss": -0.0179,
"reward": 0.5484015077352524,
"reward_std": 1.311685237288475,
"rewards/reward_func": 0.5484015077352524,
"step": 8730,
"toxic_reward": 4.056830906867981
},
{
"clip_ratio": 0.0,
"completion_length": 43.75,
"epoch": 2.0652173913043477,
"format_reward": 0.0,
"grad_norm": 11.002195358276367,
"image_reward": 0.229833984375,
"kl": 1.1322214603424072,
"learning_rate": 5e-06,
"loss": 0.1107,
"reward": -0.03462121486663818,
"reward_std": 0.43875638470053674,
"rewards/reward_func": -0.03462121486663818,
"step": 8740,
"toxic_reward": 4.620968174934387
},
{
"clip_ratio": 0.0,
"completion_length": 54.975,
"epoch": 2.06758034026465,
"format_reward": -0.25,
"grad_norm": 11.761068344116211,
"image_reward": 0.227862548828125,
"kl": 7.682415267825126,
"learning_rate": 5e-06,
"loss": -0.09,
"reward": 0.26341341733932494,
"reward_std": 1.4870022028684615,
"rewards/reward_func": 0.26341341733932494,
"step": 8750,
"toxic_reward": 4.433785676956177
},
{
"clip_ratio": 0.0,
"completion_length": 49.425,
"epoch": 2.0699432892249527,
"format_reward": 0.0,
"grad_norm": 5.119037628173828,
"image_reward": 0.2395660400390625,
"kl": 1.2481903672218322,
"learning_rate": 5e-06,
"loss": -0.0746,
"reward": 0.3276951313018799,
"reward_std": 0.46017137840390204,
"rewards/reward_func": 0.3276951313018799,
"step": 8760,
"toxic_reward": 4.593200016021728
},
{
"clip_ratio": 0.0,
"completion_length": 44.775,
"epoch": 2.0723062381852553,
"format_reward": 0.0,
"grad_norm": 1.704590916633606,
"image_reward": 0.2639801025390625,
"kl": 2.778309851884842,
"learning_rate": 5e-06,
"loss": -0.0417,
"reward": 0.30720534920692444,
"reward_std": 0.6144355796277523,
"rewards/reward_func": 0.30720534920692444,
"step": 8770,
"toxic_reward": 4.481031203269959
},
{
"clip_ratio": 0.0,
"completion_length": 47.75,
"epoch": 2.074669187145558,
"format_reward": 0.0,
"grad_norm": 11.392171859741211,
"image_reward": 0.2711700439453125,
"kl": 2.4740293115377425,
"learning_rate": 5e-06,
"loss": 0.0428,
"reward": 0.318191659450531,
"reward_std": 0.6928373419679701,
"rewards/reward_func": 0.318191659450531,
"step": 8780,
"toxic_reward": 4.502946138381958
},
{
"clip_ratio": 0.0,
"completion_length": 45.85,
"epoch": 2.07703213610586,
"format_reward": 0.0,
"grad_norm": 11.103386878967285,
"image_reward": 0.2586761474609375,
"kl": 0.8142087966203689,
"learning_rate": 5e-06,
"loss": -0.0231,
"reward": 0.5202795565128326,
"reward_std": 0.9188865400850773,
"rewards/reward_func": 0.5202795565128326,
"step": 8790,
"toxic_reward": 4.268853735923767
},
{
"clip_ratio": 0.0,
"completion_length": 53.825,
"epoch": 2.0793950850661624,
"format_reward": -0.5,
"grad_norm": 3.831815719604492,
"image_reward": 0.23363494873046875,
"kl": 3.2880129516124725,
"learning_rate": 5e-06,
"loss": 0.0507,
"reward": -0.17698687314987183,
"reward_std": 1.4326126247644424,
"rewards/reward_func": -0.17698687314987183,
"step": 8800,
"toxic_reward": 4.581766486167908
},
{
"clip_ratio": 0.0,
"completion_length": 42.95,
"epoch": 2.081758034026465,
"format_reward": -0.5,
"grad_norm": 13.328118324279785,
"image_reward": 0.283209228515625,
"kl": 2.583532452583313,
"learning_rate": 5e-06,
"loss": 0.1117,
"reward": 0.2989026606082916,
"reward_std": 2.0480130195617674,
"rewards/reward_func": 0.2989026606082916,
"step": 8810,
"toxic_reward": 3.5884770512580872
},
{
"clip_ratio": 0.0,
"completion_length": 47.125,
"epoch": 2.0841209829867675,
"format_reward": 0.0,
"grad_norm": 1.780765414237976,
"image_reward": 0.242584228515625,
"kl": 1.3252637952566146,
"learning_rate": 5e-06,
"loss": -0.0244,
"reward": 0.29446207284927367,
"reward_std": 1.0302367629483342,
"rewards/reward_func": 0.29446207284927367,
"step": 8820,
"toxic_reward": 4.3270234823226925
},
{
"clip_ratio": 0.0,
"completion_length": 50.55,
"epoch": 2.08648393194707,
"format_reward": 0.0,
"grad_norm": 0.9498596787452698,
"image_reward": 0.25778045654296877,
"kl": 1.5434826999902724,
"learning_rate": 5e-06,
"loss": -0.1175,
"reward": 0.414847657084465,
"reward_std": 0.7169051881879568,
"rewards/reward_func": 0.414847657084465,
"step": 8830,
"toxic_reward": 4.253011137247086
},
{
"clip_ratio": 0.0,
"completion_length": 42.925,
"epoch": 2.0888468809073726,
"format_reward": 0.0,
"grad_norm": 8.519845008850098,
"image_reward": 0.23509521484375,
"kl": 0.5451234139502048,
"learning_rate": 5e-06,
"loss": 0.1295,
"reward": 0.510816776752472,
"reward_std": 0.6249840931501239,
"rewards/reward_func": 0.510816776752472,
"step": 8840,
"toxic_reward": 4.746308994293213
},
{
"clip_ratio": 0.0,
"completion_length": 46.225,
"epoch": 2.0912098298676747,
"format_reward": 0.0,
"grad_norm": 3.2223246097564697,
"image_reward": 0.23397216796875,
"kl": 1.196854567527771,
"learning_rate": 5e-06,
"loss": 0.0121,
"reward": 0.430766886472702,
"reward_std": 1.1830935038626194,
"rewards/reward_func": 0.430766886472702,
"step": 8850,
"toxic_reward": 4.010055112838745
},
{
"clip_ratio": 0.0,
"completion_length": 50.1,
"epoch": 2.093572778827977,
"format_reward": 0.0,
"grad_norm": 4.516735553741455,
"image_reward": 0.248101806640625,
"kl": 1.3512360364198686,
"learning_rate": 5e-06,
"loss": -0.0064,
"reward": 0.4724120795726776,
"reward_std": 1.2229724466800689,
"rewards/reward_func": 0.4724120795726776,
"step": 8860,
"toxic_reward": 4.219368410110474
},
{
"clip_ratio": 0.0,
"completion_length": 51.45,
"epoch": 2.0959357277882797,
"format_reward": 0.0,
"grad_norm": 10.528098106384277,
"image_reward": 0.2487030029296875,
"kl": 0.8361154735088349,
"learning_rate": 5e-06,
"loss": 0.0096,
"reward": 0.481815043091774,
"reward_std": 0.8121814839541912,
"rewards/reward_func": 0.481815043091774,
"step": 8870,
"toxic_reward": 4.385925316810608
},
{
"clip_ratio": 0.0,
"completion_length": 42.625,
"epoch": 2.0982986767485823,
"format_reward": 0.0,
"grad_norm": 14.012845993041992,
"image_reward": 0.26658477783203127,
"kl": 5.801792293787003,
"learning_rate": 5e-06,
"loss": -0.0543,
"reward": 0.8176810801029205,
"reward_std": 0.9641741991043091,
"rewards/reward_func": 0.8176810801029205,
"step": 8880,
"toxic_reward": 4.4868937015533445
},
{
"clip_ratio": 0.0,
"completion_length": 58.625,
"epoch": 2.100661625708885,
"format_reward": 0.0,
"grad_norm": 1.7836425304412842,
"image_reward": 0.219366455078125,
"kl": 2.3325648605823517,
"learning_rate": 5e-06,
"loss": -0.0747,
"reward": 0.07698584794998169,
"reward_std": 0.7939232878386975,
"rewards/reward_func": 0.07698584794998169,
"step": 8890,
"toxic_reward": 3.8968687295913695
},
{
"clip_ratio": 0.0,
"completion_length": 35.1,
"epoch": 2.1030245746691874,
"format_reward": 0.0,
"grad_norm": 4.322965145111084,
"image_reward": 0.2381072998046875,
"kl": 1.5042289346456528,
"learning_rate": 5e-06,
"loss": -0.0396,
"reward": 0.6448795169591903,
"reward_std": 1.071408730885014,
"rewards/reward_func": 0.6448795169591903,
"step": 8900,
"toxic_reward": 4.1689093708992
},
{
"clip_ratio": 0.0,
"completion_length": 46.6,
"epoch": 2.1053875236294894,
"format_reward": -0.25,
"grad_norm": 3.794384002685547,
"image_reward": 0.2495122268795967,
"kl": 1.5198310285806655,
"learning_rate": 5e-06,
"loss": 0.0187,
"reward": -0.018249320983886718,
"reward_std": 0.9345291556790472,
"rewards/reward_func": -0.018249320983886718,
"step": 8910,
"toxic_reward": 4.52795147895813
},
{
"clip_ratio": 0.0,
"completion_length": 53.425,
"epoch": 2.107750472589792,
"format_reward": 0.0,
"grad_norm": 5.023624420166016,
"image_reward": 0.2479766845703125,
"kl": 0.8687845975160599,
"learning_rate": 5e-06,
"loss": -0.0294,
"reward": 0.188352632522583,
"reward_std": 0.6677849385887384,
"rewards/reward_func": 0.188352632522583,
"step": 8920,
"toxic_reward": 4.531054210662842
},
{
"clip_ratio": 0.0,
"completion_length": 50.625,
"epoch": 2.1101134215500945,
"format_reward": 0.0,
"grad_norm": 2.3595223426818848,
"image_reward": 0.2230987548828125,
"kl": 8.144508588314057,
"learning_rate": 5e-06,
"loss": -0.0584,
"reward": 0.1267090529203415,
"reward_std": 0.4783048752695322,
"rewards/reward_func": 0.1267090529203415,
"step": 8930,
"toxic_reward": 4.553759598731995
},
{
"clip_ratio": 0.0,
"completion_length": 48.075,
"epoch": 2.112476370510397,
"format_reward": 0.0,
"grad_norm": 27.809850692749023,
"image_reward": 0.2523773193359375,
"kl": 0.7018902823328972,
"learning_rate": 5e-06,
"loss": -0.0582,
"reward": 0.3417531728744507,
"reward_std": 0.6842773109674454,
"rewards/reward_func": 0.3417531728744507,
"step": 8940,
"toxic_reward": 4.307323157787323
},
{
"clip_ratio": 0.0,
"completion_length": 48.875,
"epoch": 2.1148393194706996,
"format_reward": -0.25,
"grad_norm": 2.785470962524414,
"image_reward": 0.2328338623046875,
"kl": 0.866449561715126,
"learning_rate": 5e-06,
"loss": -0.0235,
"reward": 0.9232547760009766,
"reward_std": 1.3794653311371803,
"rewards/reward_func": 0.9232547760009766,
"step": 8950,
"toxic_reward": 4.619945740699768
},
{
"clip_ratio": 0.0,
"completion_length": 41.5,
"epoch": 2.1172022684310017,
"format_reward": 0.0,
"grad_norm": 7.097494602203369,
"image_reward": 0.2396942138671875,
"kl": 2.0439702540636064,
"learning_rate": 5e-06,
"loss": 0.072,
"reward": 0.26967796087265017,
"reward_std": 0.541577224060893,
"rewards/reward_func": 0.26967796087265017,
"step": 8960,
"toxic_reward": 4.525745010375976
},
{
"clip_ratio": 0.0,
"completion_length": 49.5,
"epoch": 2.119565217391304,
"format_reward": 0.0,
"grad_norm": 9.064950942993164,
"image_reward": 0.256915283203125,
"kl": 1.2320655643939973,
"learning_rate": 5e-06,
"loss": -0.0406,
"reward": 0.2311327040195465,
"reward_std": 0.5596455704420805,
"rewards/reward_func": 0.2311327040195465,
"step": 8970,
"toxic_reward": 4.595108699798584
},
{
"clip_ratio": 0.0,
"completion_length": 46.65,
"epoch": 2.1219281663516067,
"format_reward": 0.0,
"grad_norm": 2.589348793029785,
"image_reward": 0.2637451171875,
"kl": 3.0987811207771303,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.11190776824951172,
"reward_std": 0.6281056736595929,
"rewards/reward_func": 0.11190776824951172,
"step": 8980,
"toxic_reward": 3.8689257740974425
},
{
"clip_ratio": 0.0,
"completion_length": 46.575,
"epoch": 2.1242911153119093,
"format_reward": 0.0,
"grad_norm": 8.932865142822266,
"image_reward": 0.246466064453125,
"kl": 0.9722367227077484,
"learning_rate": 5e-06,
"loss": -0.0272,
"reward": 0.507748281955719,
"reward_std": 0.49981794953346254,
"rewards/reward_func": 0.507748281955719,
"step": 8990,
"toxic_reward": 4.460081267356872
},
{
"clip_ratio": 0.0,
"completion_length": 52.45,
"epoch": 2.126654064272212,
"format_reward": 0.0,
"grad_norm": 6.9373064041137695,
"image_reward": 0.247686767578125,
"kl": 1.1410144418478012,
"learning_rate": 5e-06,
"loss": -0.0068,
"reward": 0.7415260970592499,
"reward_std": 0.7849105328321457,
"rewards/reward_func": 0.7415260970592499,
"step": 9000,
"toxic_reward": 4.300376343727112
},
{
"clip_ratio": 0.0,
"completion_length": 46.45,
"epoch": 2.1290170132325144,
"format_reward": 0.0,
"grad_norm": 2.5460894107818604,
"image_reward": 0.253936767578125,
"kl": 0.8742299884557724,
"learning_rate": 5e-06,
"loss": 0.009,
"reward": 0.2949145630002022,
"reward_std": 0.8127535484731198,
"rewards/reward_func": 0.2949145630002022,
"step": 9010,
"toxic_reward": 4.208229756355285
},
{
"clip_ratio": 0.0,
"completion_length": 44.875,
"epoch": 2.1313799621928164,
"format_reward": 0.0,
"grad_norm": 8.273791313171387,
"image_reward": 0.23465576171875,
"kl": 1.841402593255043,
"learning_rate": 5e-06,
"loss": 0.0698,
"reward": 0.2962026834487915,
"reward_std": 0.5367021195590496,
"rewards/reward_func": 0.2962026834487915,
"step": 9020,
"toxic_reward": 4.80813364982605
},
{
"clip_ratio": 0.0,
"completion_length": 45.275,
"epoch": 2.133742911153119,
"format_reward": 0.0,
"grad_norm": 1.396345853805542,
"image_reward": 0.25015411376953123,
"kl": 2.9800774693489074,
"learning_rate": 5e-06,
"loss": 0.0132,
"reward": 0.2012641340494156,
"reward_std": 0.9129719872027635,
"rewards/reward_func": 0.2012641340494156,
"step": 9030,
"toxic_reward": 4.130698096752167
},
{
"clip_ratio": 0.0,
"completion_length": 51.85,
"epoch": 2.1361058601134215,
"format_reward": 0.0,
"grad_norm": 6.688182830810547,
"image_reward": 0.27446441650390624,
"kl": 3.037346550822258,
"learning_rate": 5e-06,
"loss": -0.0086,
"reward": 0.7237348094582557,
"reward_std": 0.9079257231205702,
"rewards/reward_func": 0.7237348094582557,
"step": 9040,
"toxic_reward": 4.051010203361511
},
{
"clip_ratio": 0.0,
"completion_length": 42.55,
"epoch": 2.138468809073724,
"format_reward": -0.25,
"grad_norm": 2.8426876068115234,
"image_reward": 0.26009623110294344,
"kl": 234.93744373321533,
"learning_rate": 5e-06,
"loss": 0.0741,
"reward": 0.14779042601585388,
"reward_std": 1.6020304949954152,
"rewards/reward_func": 0.14779042601585388,
"step": 9050,
"toxic_reward": 4.297617936134339
},
{
"clip_ratio": 0.0,
"completion_length": 46.55,
"epoch": 2.1408317580340266,
"format_reward": 0.0,
"grad_norm": 14.305941581726074,
"image_reward": 0.23194732666015624,
"kl": 1.907991024851799,
"learning_rate": 5e-06,
"loss": -0.0095,
"reward": 0.5170632779598237,
"reward_std": 0.8214797399006784,
"rewards/reward_func": 0.5170632779598237,
"step": 9060,
"toxic_reward": 4.2402391791343685
},
{
"clip_ratio": 0.0,
"completion_length": 44.0,
"epoch": 2.143194706994329,
"format_reward": 0.0,
"grad_norm": 4.640130043029785,
"image_reward": 0.2427520751953125,
"kl": 7.810242688655853,
"learning_rate": 5e-06,
"loss": -0.032,
"reward": 0.20102212131023406,
"reward_std": 1.3208093732595443,
"rewards/reward_func": 0.20102212131023406,
"step": 9070,
"toxic_reward": 3.7083510875701906
},
{
"clip_ratio": 0.0,
"completion_length": 54.1,
"epoch": 2.145557655954631,
"format_reward": 0.0,
"grad_norm": 1.3630574941635132,
"image_reward": 0.2402252197265625,
"kl": 10.548876631259919,
"learning_rate": 5e-06,
"loss": 0.0112,
"reward": 0.7412046194076538,
"reward_std": 0.9147299766540528,
"rewards/reward_func": 0.7412046194076538,
"step": 9080,
"toxic_reward": 4.000400519371032
},
{
"clip_ratio": 0.0,
"completion_length": 39.2,
"epoch": 2.1479206049149338,
"format_reward": 0.0,
"grad_norm": 2.9857964515686035,
"image_reward": 0.2373077392578125,
"kl": 11.153948432207107,
"learning_rate": 5e-06,
"loss": -0.0525,
"reward": 0.4043663561344147,
"reward_std": 0.7483500481583178,
"rewards/reward_func": 0.4043663561344147,
"step": 9090,
"toxic_reward": 4.85220890045166
},
{
"clip_ratio": 0.0,
"completion_length": 54.05,
"epoch": 2.1502835538752363,
"format_reward": 0.0,
"grad_norm": 2.3352339267730713,
"image_reward": 0.2479736328125,
"kl": 3.4772801220417024,
"learning_rate": 5e-06,
"loss": 0.0421,
"reward": 0.5415297746658325,
"reward_std": 0.90726547986269,
"rewards/reward_func": 0.5415297746658325,
"step": 9100,
"toxic_reward": 4.584239768981933
},
{
"clip_ratio": 0.0,
"completion_length": 39.125,
"epoch": 2.152646502835539,
"format_reward": 0.0,
"grad_norm": 7.961544990539551,
"image_reward": 0.2632598876953125,
"kl": 2.5744317561388015,
"learning_rate": 5e-06,
"loss": 0.1083,
"reward": 0.5891210317611695,
"reward_std": 1.2839626222848892,
"rewards/reward_func": 0.5891210317611695,
"step": 9110,
"toxic_reward": 4.181219959259034
},
{
"clip_ratio": 0.0,
"completion_length": 49.075,
"epoch": 2.1550094517958414,
"format_reward": 0.0,
"grad_norm": 9.345954895019531,
"image_reward": 0.238812255859375,
"kl": 6.054638743400574,
"learning_rate": 5e-06,
"loss": -0.1109,
"reward": 0.06543984264135361,
"reward_std": 0.519540898501873,
"rewards/reward_func": 0.06543984264135361,
"step": 9120,
"toxic_reward": 4.270166897773743
},
{
"clip_ratio": 0.0,
"completion_length": 40.875,
"epoch": 2.1573724007561434,
"format_reward": 0.0,
"grad_norm": 8.313584327697754,
"image_reward": 0.24713134765625,
"kl": 6.326075008511543,
"learning_rate": 5e-06,
"loss": -0.0767,
"reward": 0.8043414294719696,
"reward_std": 1.0881578013300897,
"rewards/reward_func": 0.8043414294719696,
"step": 9130,
"toxic_reward": 4.155464768409729
},
{
"clip_ratio": 0.0,
"completion_length": 53.075,
"epoch": 2.159735349716446,
"format_reward": 0.0,
"grad_norm": 8.283196449279785,
"image_reward": 0.2358612060546875,
"kl": 2.606276285648346,
"learning_rate": 5e-06,
"loss": 0.1205,
"reward": 0.1571010023355484,
"reward_std": 0.7534957839176059,
"rewards/reward_func": 0.1571010023355484,
"step": 9140,
"toxic_reward": 4.338043940067291
},
{
"clip_ratio": 0.0,
"completion_length": 49.65,
"epoch": 2.1620982986767485,
"format_reward": 0.0,
"grad_norm": 12.468461036682129,
"image_reward": 0.2525299072265625,
"kl": 10.651741808652877,
"learning_rate": 5e-06,
"loss": 0.0658,
"reward": 0.499350106716156,
"reward_std": 0.7922366757877171,
"rewards/reward_func": 0.499350106716156,
"step": 9150,
"toxic_reward": 4.407905173301697
},
{
"clip_ratio": 0.0,
"completion_length": 43.5,
"epoch": 2.164461247637051,
"format_reward": 0.0,
"grad_norm": 1.4810751676559448,
"image_reward": 0.22776641845703124,
"kl": 6.478110730648041,
"learning_rate": 5e-06,
"loss": -0.0464,
"reward": 0.9298590540885925,
"reward_std": 0.8800611793994904,
"rewards/reward_func": 0.9298590540885925,
"step": 9160,
"toxic_reward": 4.576443719863891
},
{
"clip_ratio": 0.0,
"completion_length": 52.875,
"epoch": 2.1668241965973536,
"format_reward": 0.0,
"grad_norm": 9.652629852294922,
"image_reward": 0.260015869140625,
"kl": 1.726886612176895,
"learning_rate": 5e-06,
"loss": -0.127,
"reward": 0.4531150579452515,
"reward_std": 0.8976183220744133,
"rewards/reward_func": 0.4531150579452515,
"step": 9170,
"toxic_reward": 4.432527303695679
},
{
"clip_ratio": 0.0,
"completion_length": 45.0,
"epoch": 2.169187145557656,
"format_reward": 0.0,
"grad_norm": 3.5050952434539795,
"image_reward": 0.2384307861328125,
"kl": 4.27691433429718,
"learning_rate": 5e-06,
"loss": -0.0311,
"reward": 0.6826965510845184,
"reward_std": 0.7877496212720871,
"rewards/reward_func": 0.6826965510845184,
"step": 9180,
"toxic_reward": 4.4837501525878904
},
{
"clip_ratio": 0.0,
"completion_length": 52.1,
"epoch": 2.171550094517958,
"format_reward": 0.0,
"grad_norm": 3.2137303352355957,
"image_reward": 0.24414825439453125,
"kl": 3.9786434292793276,
"learning_rate": 5e-06,
"loss": -0.048,
"reward": 0.34388454258441925,
"reward_std": 0.7642363490536809,
"rewards/reward_func": 0.34388454258441925,
"step": 9190,
"toxic_reward": 4.380149924755097
},
{
"clip_ratio": 0.0,
"completion_length": 45.8,
"epoch": 2.1739130434782608,
"format_reward": 0.0,
"grad_norm": 10.09927749633789,
"image_reward": 0.2380889892578125,
"kl": 3.893726623058319,
"learning_rate": 5e-06,
"loss": 0.0829,
"reward": 0.28469178080558777,
"reward_std": 0.7502976493909955,
"rewards/reward_func": 0.28469178080558777,
"step": 9200,
"toxic_reward": 4.405940270423889
},
{
"clip_ratio": 0.0,
"completion_length": 42.25,
"epoch": 2.1762759924385633,
"format_reward": -0.25,
"grad_norm": 36.55522537231445,
"image_reward": 0.23165105208754538,
"kl": 25.98146269917488,
"learning_rate": 5e-06,
"loss": 0.0734,
"reward": -0.05637494325637817,
"reward_std": 1.5777033947408199,
"rewards/reward_func": -0.05637494325637817,
"step": 9210,
"toxic_reward": 4.138173961639405
},
{
"clip_ratio": 0.0,
"completion_length": 47.45,
"epoch": 2.178638941398866,
"format_reward": 0.0,
"grad_norm": 2.0444726943969727,
"image_reward": 0.2312713623046875,
"kl": 2.639026927947998,
"learning_rate": 5e-06,
"loss": -0.0598,
"reward": 0.5707060933113098,
"reward_std": 1.1906714523211122,
"rewards/reward_func": 0.5707060933113098,
"step": 9220,
"toxic_reward": 3.9467220425605776
},
{
"clip_ratio": 0.0,
"completion_length": 44.725,
"epoch": 2.1810018903591684,
"format_reward": 0.0,
"grad_norm": 14.821864128112793,
"image_reward": 0.260614013671875,
"kl": 2.6187705636024474,
"learning_rate": 5e-06,
"loss": -0.0004,
"reward": 0.776735657453537,
"reward_std": 0.8302984148263931,
"rewards/reward_func": 0.776735657453537,
"step": 9230,
"toxic_reward": 4.346326851844788
},
{
"clip_ratio": 0.0,
"completion_length": 44.95,
"epoch": 2.183364839319471,
"format_reward": 0.0,
"grad_norm": 8.473363876342773,
"image_reward": 0.227203369140625,
"kl": 1.9701256573200225,
"learning_rate": 5e-06,
"loss": 0.0091,
"reward": 0.6466103255748749,
"reward_std": 0.5622012199833989,
"rewards/reward_func": 0.6466103255748749,
"step": 9240,
"toxic_reward": 4.649976348876953
},
{
"clip_ratio": 0.0,
"completion_length": 46.9,
"epoch": 2.185727788279773,
"format_reward": 0.0,
"grad_norm": 16.42177391052246,
"image_reward": 0.2678741455078125,
"kl": 3.791227114200592,
"learning_rate": 5e-06,
"loss": 0.0161,
"reward": 0.5803273111581803,
"reward_std": 0.8187548790127039,
"rewards/reward_func": 0.5803273111581803,
"step": 9250,
"toxic_reward": 4.081698262691498
},
{
"clip_ratio": 0.0,
"completion_length": 50.5,
"epoch": 2.1880907372400755,
"format_reward": 0.0,
"grad_norm": 23.73421859741211,
"image_reward": 0.227972412109375,
"kl": 3.5688813447952272,
"learning_rate": 5e-06,
"loss": 0.052,
"reward": 0.5074087619781494,
"reward_std": 0.9598018784075976,
"rewards/reward_func": 0.5074087619781494,
"step": 9260,
"toxic_reward": 4.401837420463562
},
{
"clip_ratio": 0.0,
"completion_length": 60.425,
"epoch": 2.190453686200378,
"format_reward": 0.0,
"grad_norm": 9.969075202941895,
"image_reward": 0.245391845703125,
"kl": 2.1753955483436584,
"learning_rate": 5e-06,
"loss": -0.0984,
"reward": 0.9439354777336121,
"reward_std": 0.7434614159166812,
"rewards/reward_func": 0.9439354777336121,
"step": 9270,
"toxic_reward": 4.398157930374145
},
{
"clip_ratio": 0.0,
"completion_length": 54.325,
"epoch": 2.1928166351606806,
"format_reward": 0.0,
"grad_norm": 11.830771446228027,
"image_reward": 0.23084869384765624,
"kl": 1.2656208366155624,
"learning_rate": 5e-06,
"loss": 0.0047,
"reward": 0.31412020325660706,
"reward_std": 0.8172964336816222,
"rewards/reward_func": 0.31412020325660706,
"step": 9280,
"toxic_reward": 4.430534148216248
},
{
"clip_ratio": 0.0,
"completion_length": 44.475,
"epoch": 2.195179584120983,
"format_reward": 0.0,
"grad_norm": 5.753904819488525,
"image_reward": 0.24887237548828126,
"kl": 1.3502902746200562,
"learning_rate": 5e-06,
"loss": 0.0623,
"reward": 0.7708447635173797,
"reward_std": 0.9731228679418564,
"rewards/reward_func": 0.7708447635173797,
"step": 9290,
"toxic_reward": 4.273838710784912
},
{
"clip_ratio": 0.0,
"completion_length": 45.45,
"epoch": 2.197542533081285,
"format_reward": 0.0,
"grad_norm": 7.7253947257995605,
"image_reward": 0.2331573486328125,
"kl": 3.059584191441536,
"learning_rate": 5e-06,
"loss": 0.0942,
"reward": 0.43726455271244047,
"reward_std": 1.3362455716356636,
"rewards/reward_func": 0.43726455271244047,
"step": 9300,
"toxic_reward": 4.275756049156189
},
{
"clip_ratio": 0.0,
"completion_length": 40.525,
"epoch": 2.1999054820415878,
"format_reward": 0.0,
"grad_norm": 14.917332649230957,
"image_reward": 0.2485260009765625,
"kl": 3.154623621702194,
"learning_rate": 5e-06,
"loss": -0.0028,
"reward": 0.7619877219200134,
"reward_std": 0.7480236226692796,
"rewards/reward_func": 0.7619877219200134,
"step": 9310,
"toxic_reward": 4.5632892370224
},
{
"clip_ratio": 0.0,
"completion_length": 50.425,
"epoch": 2.2022684310018903,
"format_reward": 0.0,
"grad_norm": 2.2232089042663574,
"image_reward": 0.272552490234375,
"kl": 4.570386919379234,
"learning_rate": 5e-06,
"loss": 0.0767,
"reward": -0.32990662753582,
"reward_std": 0.6452065747231245,
"rewards/reward_func": -0.32990662753582,
"step": 9320,
"toxic_reward": 4.057078433036804
},
{
"clip_ratio": 0.0,
"completion_length": 51.475,
"epoch": 2.204631379962193,
"format_reward": 0.0,
"grad_norm": 12.401063919067383,
"image_reward": 0.22475687563419341,
"kl": 2.3822293996810915,
"learning_rate": 5e-06,
"loss": -0.1368,
"reward": 0.6720861852169037,
"reward_std": 0.5467347849160433,
"rewards/reward_func": 0.6720861852169037,
"step": 9330,
"toxic_reward": 4.748937749862671
},
{
"clip_ratio": 0.0,
"completion_length": 46.725,
"epoch": 2.2069943289224954,
"format_reward": 0.0,
"grad_norm": 18.317251205444336,
"image_reward": 0.2457794189453125,
"kl": 2.1576267421245574,
"learning_rate": 5e-06,
"loss": 0.0443,
"reward": 0.4097582340240479,
"reward_std": 0.6367886804975569,
"rewards/reward_func": 0.4097582340240479,
"step": 9340,
"toxic_reward": 3.8079322576522827
},
{
"clip_ratio": 0.0,
"completion_length": 53.0,
"epoch": 2.209357277882798,
"format_reward": 0.0,
"grad_norm": 9.418509483337402,
"image_reward": 0.2345733642578125,
"kl": 0.8293094992637634,
"learning_rate": 5e-06,
"loss": 0.1013,
"reward": 0.4199859380722046,
"reward_std": 0.8710890758782626,
"rewards/reward_func": 0.4199859380722046,
"step": 9350,
"toxic_reward": 4.7352869510650635
},
{
"clip_ratio": 0.0,
"completion_length": 43.7,
"epoch": 2.2117202268431,
"format_reward": 0.0,
"grad_norm": 16.66223907470703,
"image_reward": 0.25906219482421877,
"kl": 1.3076835095882415,
"learning_rate": 5e-06,
"loss": 0.1376,
"reward": 0.2918867290019989,
"reward_std": 0.2911624666303396,
"rewards/reward_func": 0.2918867290019989,
"step": 9360,
"toxic_reward": 4.793551731109619
},
{
"clip_ratio": 0.0,
"completion_length": 54.2,
"epoch": 2.2140831758034025,
"format_reward": 0.0,
"grad_norm": 6.431090354919434,
"image_reward": 0.214971923828125,
"kl": 2.7103491842746736,
"learning_rate": 5e-06,
"loss": 0.0478,
"reward": 0.3785900384187698,
"reward_std": 0.8829892821609974,
"rewards/reward_func": 0.3785900384187698,
"step": 9370,
"toxic_reward": 4.103424906730652
},
{
"clip_ratio": 0.0,
"completion_length": 44.45,
"epoch": 2.216446124763705,
"format_reward": 0.0,
"grad_norm": 5.423946857452393,
"image_reward": 0.2241668701171875,
"kl": 9.351680633425712,
"learning_rate": 5e-06,
"loss": -0.0282,
"reward": 0.6931559234857559,
"reward_std": 0.9827461183071137,
"rewards/reward_func": 0.6931559234857559,
"step": 9380,
"toxic_reward": 4.327680516242981
},
{
"clip_ratio": 0.0,
"completion_length": 46.75,
"epoch": 2.2188090737240076,
"format_reward": 0.0,
"grad_norm": 12.53814697265625,
"image_reward": 0.24124603271484374,
"kl": 1.539423054456711,
"learning_rate": 5e-06,
"loss": 0.0074,
"reward": 0.5662744238972663,
"reward_std": 0.8970771560445427,
"rewards/reward_func": 0.5662744238972663,
"step": 9390,
"toxic_reward": 3.676301693916321
},
{
"clip_ratio": 0.0,
"completion_length": 53.85,
"epoch": 2.22117202268431,
"format_reward": -0.25,
"grad_norm": 27.224220275878906,
"image_reward": 0.26278177797794344,
"kl": 8.620309627056121,
"learning_rate": 5e-06,
"loss": -0.0561,
"reward": 0.45245649218559264,
"reward_std": 1.51493993550539,
"rewards/reward_func": 0.45245649218559264,
"step": 9400,
"toxic_reward": 4.195838165283203
},
{
"clip_ratio": 0.0,
"completion_length": 46.3,
"epoch": 2.2235349716446127,
"format_reward": 0.0,
"grad_norm": 16.83915901184082,
"image_reward": 0.244732666015625,
"kl": 7.121062386035919,
"learning_rate": 5e-06,
"loss": 0.0277,
"reward": 0.5056580305099487,
"reward_std": 0.6380140800029039,
"rewards/reward_func": 0.5056580305099487,
"step": 9410,
"toxic_reward": 4.542606806755066
},
{
"clip_ratio": 0.0,
"completion_length": 40.525,
"epoch": 2.2258979206049148,
"format_reward": 0.0,
"grad_norm": 14.892727851867676,
"image_reward": 0.24930419921875,
"kl": 5.096332561969757,
"learning_rate": 5e-06,
"loss": -0.0329,
"reward": 0.48427205085754393,
"reward_std": 1.0285473830997944,
"rewards/reward_func": 0.48427205085754393,
"step": 9420,
"toxic_reward": 4.446974515914917
},
{
"clip_ratio": 0.0,
"completion_length": 40.75,
"epoch": 2.2282608695652173,
"format_reward": -0.25,
"grad_norm": 1.2352709770202637,
"image_reward": 0.24373575747013093,
"kl": 16.825757110118865,
"learning_rate": 5e-06,
"loss": -0.0367,
"reward": 0.5291045546531677,
"reward_std": 1.2605504954233766,
"rewards/reward_func": 0.5291045546531677,
"step": 9430,
"toxic_reward": 4.275347375869751
},
{
"clip_ratio": 0.0,
"completion_length": 40.875,
"epoch": 2.23062381852552,
"format_reward": 0.0,
"grad_norm": 22.900882720947266,
"image_reward": 0.25095672607421876,
"kl": 10.578787690401077,
"learning_rate": 5e-06,
"loss": 0.0032,
"reward": 0.26428125500679017,
"reward_std": 0.9156784310936927,
"rewards/reward_func": 0.26428125500679017,
"step": 9440,
"toxic_reward": 3.866254734992981
},
{
"clip_ratio": 0.0,
"completion_length": 39.5,
"epoch": 2.2329867674858224,
"format_reward": -0.25,
"grad_norm": 13.677875518798828,
"image_reward": 0.238702392578125,
"kl": 5.4229684472084045,
"learning_rate": 5e-06,
"loss": -0.0035,
"reward": 0.34162178039550783,
"reward_std": 1.2561071523465217,
"rewards/reward_func": 0.34162178039550783,
"step": 9450,
"toxic_reward": 4.54083218574524
},
{
"clip_ratio": 0.0,
"completion_length": 50.25,
"epoch": 2.235349716446125,
"format_reward": 0.0,
"grad_norm": 7.939948081970215,
"image_reward": 0.245355224609375,
"kl": 2.677640450000763,
"learning_rate": 5e-06,
"loss": -0.0006,
"reward": 0.28101458549499514,
"reward_std": 0.8911348965018988,
"rewards/reward_func": 0.28101458549499514,
"step": 9460,
"toxic_reward": 4.145281267166138
},
{
"clip_ratio": 0.0,
"completion_length": 50.0,
"epoch": 2.237712665406427,
"format_reward": 0.0,
"grad_norm": 1.8240618705749512,
"image_reward": 0.231634521484375,
"kl": 1.528824520111084,
"learning_rate": 5e-06,
"loss": -0.0875,
"reward": 0.37455313801765444,
"reward_std": 0.4142075888812542,
"rewards/reward_func": 0.37455313801765444,
"step": 9470,
"toxic_reward": 4.762905406951904
},
{
"clip_ratio": 0.0,
"completion_length": 45.325,
"epoch": 2.2400756143667295,
"format_reward": 0.0,
"grad_norm": 6.417304515838623,
"image_reward": 0.2490814208984375,
"kl": 1.3931379437446594,
"learning_rate": 5e-06,
"loss": 0.0247,
"reward": 1.1561188876628876,
"reward_std": 0.7829106822609901,
"rewards/reward_func": 1.1561188876628876,
"step": 9480,
"toxic_reward": 4.180247139930725
},
{
"clip_ratio": 0.0,
"completion_length": 46.1,
"epoch": 2.242438563327032,
"format_reward": 0.0,
"grad_norm": 6.768500328063965,
"image_reward": 0.2283935546875,
"kl": 0.9197474420070648,
"learning_rate": 5e-06,
"loss": -0.0378,
"reward": 0.5479820281267166,
"reward_std": 0.8298372723162174,
"rewards/reward_func": 0.5479820281267166,
"step": 9490,
"toxic_reward": 4.205300378799438
},
{
"clip_ratio": 0.0,
"completion_length": 55.375,
"epoch": 2.2448015122873346,
"format_reward": 0.0,
"grad_norm": 26.945127487182617,
"image_reward": 0.26320343017578124,
"kl": 1.3565968126058578,
"learning_rate": 5e-06,
"loss": 0.0841,
"reward": 0.7786126613616944,
"reward_std": 1.1838067084550858,
"rewards/reward_func": 0.7786126613616944,
"step": 9500,
"toxic_reward": 3.8664269924163817
},
{
"clip_ratio": 0.0,
"completion_length": 36.3,
"epoch": 2.247164461247637,
"format_reward": -0.25,
"grad_norm": 5.876742839813232,
"image_reward": 0.2632466644048691,
"kl": 2.275825482606888,
"learning_rate": 5e-06,
"loss": -0.0704,
"reward": 0.1731979250907898,
"reward_std": 1.4120358280837535,
"rewards/reward_func": 0.1731979250907898,
"step": 9510,
"toxic_reward": 4.367373514175415
},
{
"clip_ratio": 0.0,
"completion_length": 51.525,
"epoch": 2.2495274102079397,
"format_reward": 0.0,
"grad_norm": 4.764988422393799,
"image_reward": 0.2396881103515625,
"kl": 5.604674518108368,
"learning_rate": 5e-06,
"loss": 0.0467,
"reward": 0.2715915977954865,
"reward_std": 0.6032503295689822,
"rewards/reward_func": 0.2715915977954865,
"step": 9520,
"toxic_reward": 4.3036177396774296
},
{
"clip_ratio": 0.0,
"completion_length": 46.15,
"epoch": 2.251890359168242,
"format_reward": 0.0,
"grad_norm": 23.23556900024414,
"image_reward": 0.2485931396484375,
"kl": 2.390829586982727,
"learning_rate": 5e-06,
"loss": 0.0824,
"reward": 0.7335005760192871,
"reward_std": 0.5389063037931919,
"rewards/reward_func": 0.7335005760192871,
"step": 9530,
"toxic_reward": 4.7080058574676515
},
{
"clip_ratio": 0.0,
"completion_length": 43.55,
"epoch": 2.2542533081285443,
"format_reward": 0.0,
"grad_norm": 20.620105743408203,
"image_reward": 0.2229095458984375,
"kl": 5.458765661716461,
"learning_rate": 5e-06,
"loss": 0.1314,
"reward": 0.5191788256168366,
"reward_std": 1.1402710743248463,
"rewards/reward_func": 0.5191788256168366,
"step": 9540,
"toxic_reward": 4.577926540374756
},
{
"clip_ratio": 0.0,
"completion_length": 51.75,
"epoch": 2.256616257088847,
"format_reward": 0.0,
"grad_norm": 6.686956405639648,
"image_reward": 0.231097412109375,
"kl": 2.384101688861847,
"learning_rate": 5e-06,
"loss": -0.0264,
"reward": 0.3123217046260834,
"reward_std": 1.1325789090245961,
"rewards/reward_func": 0.3123217046260834,
"step": 9550,
"toxic_reward": 4.343110990524292
},
{
"clip_ratio": 0.0,
"completion_length": 41.8,
"epoch": 2.2589792060491494,
"format_reward": 0.0,
"grad_norm": 11.559769630432129,
"image_reward": 0.236077880859375,
"kl": 2.987881660461426,
"learning_rate": 5e-06,
"loss": 0.0371,
"reward": 1.1903966188430786,
"reward_std": 0.8922829747200012,
"rewards/reward_func": 1.1903966188430786,
"step": 9560,
"toxic_reward": 4.265221381187439
},
{
"clip_ratio": 0.0,
"completion_length": 43.15,
"epoch": 2.261342155009452,
"format_reward": -0.25,
"grad_norm": 8.45358657836914,
"image_reward": 0.2397247314453125,
"kl": 3.8612433314323424,
"learning_rate": 5e-06,
"loss": -0.0161,
"reward": -0.07919068932533264,
"reward_std": 1.113222143240273,
"rewards/reward_func": -0.07919068932533264,
"step": 9570,
"toxic_reward": 4.036798477172852
},
{
"clip_ratio": 0.0,
"completion_length": 53.95,
"epoch": 2.2637051039697544,
"format_reward": 0.0,
"grad_norm": 9.02270221710205,
"image_reward": 0.232049560546875,
"kl": 4.92200248837471,
"learning_rate": 5e-06,
"loss": 0.0855,
"reward": 0.7455990195274353,
"reward_std": 0.8981746949255467,
"rewards/reward_func": 0.7455990195274353,
"step": 9580,
"toxic_reward": 4.544493341445923
},
{
"clip_ratio": 0.0,
"completion_length": 67.2,
"epoch": 2.2660680529300565,
"format_reward": 0.0,
"grad_norm": 19.980321884155273,
"image_reward": 0.2468475341796875,
"kl": 3.3157293617725374,
"learning_rate": 5e-06,
"loss": 0.1177,
"reward": 0.4287958800792694,
"reward_std": 0.8264384102076292,
"rewards/reward_func": 0.4287958800792694,
"step": 9590,
"toxic_reward": 4.492921185493469
},
{
"clip_ratio": 0.0,
"completion_length": 39.425,
"epoch": 2.268431001890359,
"format_reward": 0.0,
"grad_norm": 1.6929293870925903,
"image_reward": 0.2279388427734375,
"kl": 11.748549377918243,
"learning_rate": 5e-06,
"loss": 0.0227,
"reward": 0.8450492799282074,
"reward_std": 0.8821253469213843,
"rewards/reward_func": 0.8450492799282074,
"step": 9600,
"toxic_reward": 4.329080724716187
},
{
"clip_ratio": 0.0,
"completion_length": 44.075,
"epoch": 2.2707939508506616,
"format_reward": -0.25,
"grad_norm": 6.6792683601379395,
"image_reward": 0.2432342529296875,
"kl": 23.715504467487335,
"learning_rate": 5e-06,
"loss": -0.0603,
"reward": 0.6058377146720886,
"reward_std": 1.8178761571645736,
"rewards/reward_func": 0.6058377146720886,
"step": 9610,
"toxic_reward": 3.8193166494369506
},
{
"clip_ratio": 0.0,
"completion_length": 43.6,
"epoch": 2.273156899810964,
"format_reward": -0.25,
"grad_norm": 4.806612014770508,
"image_reward": 0.25201212614774704,
"kl": 2.9662541508674622,
"learning_rate": 5e-06,
"loss": 0.0205,
"reward": 0.14691731929779053,
"reward_std": 1.3215140633285045,
"rewards/reward_func": 0.14691731929779053,
"step": 9620,
"toxic_reward": 4.118840670585632
},
{
"clip_ratio": 0.0,
"completion_length": 56.875,
"epoch": 2.2755198487712667,
"format_reward": 0.0,
"grad_norm": 11.628203392028809,
"image_reward": 0.2614166259765625,
"kl": 3.4263065993785857,
"learning_rate": 5e-06,
"loss": 0.0042,
"reward": 0.7031525075435638,
"reward_std": 0.9792011518031358,
"rewards/reward_func": 0.7031525075435638,
"step": 9630,
"toxic_reward": 4.620864820480347
},
{
"clip_ratio": 0.0,
"completion_length": 33.3,
"epoch": 2.2778827977315688,
"format_reward": 0.0,
"grad_norm": 10.783760070800781,
"image_reward": 0.2508331298828125,
"kl": 12.492328238487243,
"learning_rate": 5e-06,
"loss": 0.0118,
"reward": 0.2551820993423462,
"reward_std": 0.8312258010730147,
"rewards/reward_func": 0.2551820993423462,
"step": 9640,
"toxic_reward": 4.169378912448883
},
{
"clip_ratio": 0.0,
"completion_length": 47.475,
"epoch": 2.2802457466918713,
"format_reward": 0.0,
"grad_norm": 9.430181503295898,
"image_reward": 0.256829833984375,
"kl": 10.857812678813934,
"learning_rate": 5e-06,
"loss": 0.0124,
"reward": 0.6474542915821075,
"reward_std": 1.0158755726995878,
"rewards/reward_func": 0.6474542915821075,
"step": 9650,
"toxic_reward": 4.2302504777908325
},
{
"clip_ratio": 0.0,
"completion_length": 35.225,
"epoch": 2.282608695652174,
"format_reward": 0.0,
"grad_norm": 2.653324604034424,
"image_reward": 0.258251953125,
"kl": 4.948359310626984,
"learning_rate": 5e-06,
"loss": -0.0303,
"reward": 0.8631646454334259,
"reward_std": 1.4457193814218043,
"rewards/reward_func": 0.8631646454334259,
"step": 9660,
"toxic_reward": 4.028263640403748
},
{
"clip_ratio": 0.0,
"completion_length": 50.475,
"epoch": 2.2849716446124764,
"format_reward": 0.0,
"grad_norm": 46.17441940307617,
"image_reward": 0.2468775436282158,
"kl": 4.73446731865406,
"learning_rate": 5e-06,
"loss": -0.0188,
"reward": 0.47216950058937074,
"reward_std": 0.5647860389202833,
"rewards/reward_func": 0.47216950058937074,
"step": 9670,
"toxic_reward": 4.644181919097901
},
{
"clip_ratio": 0.0,
"completion_length": 54.975,
"epoch": 2.287334593572779,
"format_reward": 0.0,
"grad_norm": 6.452030181884766,
"image_reward": 0.24481913298368455,
"kl": 84.93760406374932,
"learning_rate": 5e-06,
"loss": -0.0133,
"reward": 0.45135449171066283,
"reward_std": 0.9933142360066995,
"rewards/reward_func": 0.45135449171066283,
"step": 9680,
"toxic_reward": 4.119644379615783
},
{
"clip_ratio": 0.0,
"completion_length": 50.925,
"epoch": 2.2896975425330814,
"format_reward": 0.0,
"grad_norm": 7.41399621963501,
"image_reward": 0.2643157958984375,
"kl": 2.3054057717323304,
"learning_rate": 5e-06,
"loss": 0.0556,
"reward": 0.8660075426101684,
"reward_std": 1.1269529208540916,
"rewards/reward_func": 0.8660075426101684,
"step": 9690,
"toxic_reward": 4.305045056343078
},
{
"clip_ratio": 0.0,
"completion_length": 56.95,
"epoch": 2.292060491493384,
"format_reward": 0.0,
"grad_norm": 4.1870198249816895,
"image_reward": 0.255328369140625,
"kl": 0.9636234432458878,
"learning_rate": 5e-06,
"loss": 0.0444,
"reward": 0.07925584316253662,
"reward_std": 0.9312605137005449,
"rewards/reward_func": 0.07925584316253662,
"step": 9700,
"toxic_reward": 4.480338978767395
},
{
"clip_ratio": 0.0,
"completion_length": 47.175,
"epoch": 2.294423440453686,
"format_reward": 0.0,
"grad_norm": 17.8924617767334,
"image_reward": 0.2297576904296875,
"kl": 1.2910286843776704,
"learning_rate": 5e-06,
"loss": -0.0594,
"reward": 0.09658912420272828,
"reward_std": 0.534552292432636,
"rewards/reward_func": 0.09658912420272828,
"step": 9710,
"toxic_reward": 4.421128726005554
},
{
"clip_ratio": 0.0,
"completion_length": 51.5,
"epoch": 2.2967863894139886,
"format_reward": 0.0,
"grad_norm": 8.845993995666504,
"image_reward": 0.260223388671875,
"kl": 11.969202554225921,
"learning_rate": 5e-06,
"loss": 0.0984,
"reward": 0.49135610461235046,
"reward_std": 1.074414287507534,
"rewards/reward_func": 0.49135610461235046,
"step": 9720,
"toxic_reward": 4.317939972877502
},
{
"clip_ratio": 0.0,
"completion_length": 54.725,
"epoch": 2.299149338374291,
"format_reward": 0.0,
"grad_norm": 2.5245182514190674,
"image_reward": 0.2630401611328125,
"kl": 18.936833548545838,
"learning_rate": 5e-06,
"loss": 0.0013,
"reward": 0.10787631869316101,
"reward_std": 0.7801851622760296,
"rewards/reward_func": 0.10787631869316101,
"step": 9730,
"toxic_reward": 4.423897337913513
},
{
"clip_ratio": 0.0,
"completion_length": 47.875,
"epoch": 2.3015122873345937,
"format_reward": -0.25,
"grad_norm": 3.0716590881347656,
"image_reward": 0.25508524626493456,
"kl": 2.160058504343033,
"learning_rate": 5e-06,
"loss": -0.0232,
"reward": 0.1063625156879425,
"reward_std": 1.0759605418890714,
"rewards/reward_func": 0.1063625156879425,
"step": 9740,
"toxic_reward": 4.3748561382293705
},
{
"clip_ratio": 0.0,
"completion_length": 53.475,
"epoch": 2.303875236294896,
"format_reward": 0.0,
"grad_norm": 2.622436285018921,
"image_reward": 0.23123779296875,
"kl": 8.901539516448974,
"learning_rate": 5e-06,
"loss": 0.0451,
"reward": 0.41706631779670716,
"reward_std": 0.8451563934795558,
"rewards/reward_func": 0.41706631779670716,
"step": 9750,
"toxic_reward": 4.452003169059753
},
{
"clip_ratio": 0.0,
"completion_length": 42.425,
"epoch": 2.3062381852551983,
"format_reward": 0.0,
"grad_norm": 7.1622514724731445,
"image_reward": 0.2591522216796875,
"kl": 17.092305302619934,
"learning_rate": 5e-06,
"loss": 0.0045,
"reward": 0.43682674169540403,
"reward_std": 1.0151184625923633,
"rewards/reward_func": 0.43682674169540403,
"step": 9760,
"toxic_reward": 3.4172345459461213
},
{
"clip_ratio": 0.0,
"completion_length": 44.35,
"epoch": 2.308601134215501,
"format_reward": 0.0,
"grad_norm": 12.564181327819824,
"image_reward": 0.2551788330078125,
"kl": 8.035814380645752,
"learning_rate": 5e-06,
"loss": -0.057,
"reward": 0.47375474870204926,
"reward_std": 0.6293097786605358,
"rewards/reward_func": 0.47375474870204926,
"step": 9770,
"toxic_reward": 3.9829455375671388
},
{
"clip_ratio": 0.0,
"completion_length": 45.425,
"epoch": 2.3109640831758034,
"format_reward": 0.0,
"grad_norm": 12.808835983276367,
"image_reward": 0.2319427490234375,
"kl": 3.5698849081993105,
"learning_rate": 5e-06,
"loss": 0.0338,
"reward": 0.6212572991847992,
"reward_std": 0.6545703388750553,
"rewards/reward_func": 0.6212572991847992,
"step": 9780,
"toxic_reward": 4.615470266342163
},
{
"clip_ratio": 0.0,
"completion_length": 43.7,
"epoch": 2.313327032136106,
"format_reward": 0.0,
"grad_norm": 11.370565414428711,
"image_reward": 0.2495208740234375,
"kl": 58.05081114768982,
"learning_rate": 5e-06,
"loss": -0.003,
"reward": 0.3863606512546539,
"reward_std": 0.7871608097106219,
"rewards/reward_func": 0.3863606512546539,
"step": 9790,
"toxic_reward": 4.543632507324219
},
{
"clip_ratio": 0.0,
"completion_length": 53.225,
"epoch": 2.3156899810964084,
"format_reward": 0.0,
"grad_norm": 17.273242950439453,
"image_reward": 0.256707763671875,
"kl": 63.41283442378044,
"learning_rate": 5e-06,
"loss": -0.0867,
"reward": -0.17932948917150499,
"reward_std": 0.5697868175804615,
"rewards/reward_func": -0.17932948917150499,
"step": 9800,
"toxic_reward": 3.8667405366897585
},
{
"clip_ratio": 0.0,
"completion_length": 46.575,
"epoch": 2.3180529300567105,
"format_reward": 0.0,
"grad_norm": 2.3364224433898926,
"image_reward": 0.272943115234375,
"kl": 2.4312843918800353,
"learning_rate": 5e-06,
"loss": 0.0565,
"reward": 0.25352796316146853,
"reward_std": 0.8136134160682559,
"rewards/reward_func": 0.25352796316146853,
"step": 9810,
"toxic_reward": 4.419120264053345
},
{
"clip_ratio": 0.0,
"completion_length": 43.975,
"epoch": 2.320415879017013,
"format_reward": 0.0,
"grad_norm": 16.5479793548584,
"image_reward": 0.2720245361328125,
"kl": 4.018320089578628,
"learning_rate": 5e-06,
"loss": -0.052,
"reward": 0.3462803453207016,
"reward_std": 0.6793602051213383,
"rewards/reward_func": 0.3462803453207016,
"step": 9820,
"toxic_reward": 4.1594162940979
},
{
"clip_ratio": 0.0,
"completion_length": 44.5,
"epoch": 2.3227788279773156,
"format_reward": 0.0,
"grad_norm": 16.928800582885742,
"image_reward": 0.2370941162109375,
"kl": 5.093863940238952,
"learning_rate": 5e-06,
"loss": -0.0256,
"reward": -0.17395999431610107,
"reward_std": 0.733401482924819,
"rewards/reward_func": -0.17395999431610107,
"step": 9830,
"toxic_reward": 4.316698336601258
},
{
"clip_ratio": 0.0,
"completion_length": 40.3,
"epoch": 2.325141776937618,
"format_reward": 0.0,
"grad_norm": 5.420310020446777,
"image_reward": 0.2193939208984375,
"kl": 10.776195186376572,
"learning_rate": 5e-06,
"loss": 0.0166,
"reward": 0.9508269459009171,
"reward_std": 1.237346090376377,
"rewards/reward_func": 0.9508269459009171,
"step": 9840,
"toxic_reward": 4.61365122795105
},
{
"clip_ratio": 0.0,
"completion_length": 43.6,
"epoch": 2.3275047258979207,
"format_reward": 0.0,
"grad_norm": 8.147756576538086,
"image_reward": 0.2236907958984375,
"kl": 14.836266088485718,
"learning_rate": 5e-06,
"loss": -0.0104,
"reward": 1.2669459402561187,
"reward_std": 0.8822499677538872,
"rewards/reward_func": 1.2669459402561187,
"step": 9850,
"toxic_reward": 4.57221360206604
},
{
"clip_ratio": 0.0,
"completion_length": 51.225,
"epoch": 2.329867674858223,
"format_reward": 0.0,
"grad_norm": 9.47561264038086,
"image_reward": 0.2329345703125,
"kl": 7.03211784362793,
"learning_rate": 5e-06,
"loss": 0.1103,
"reward": 0.4148245692253113,
"reward_std": 0.4656851476058364,
"rewards/reward_func": 0.4148245692253113,
"step": 9860,
"toxic_reward": 4.482611513137817
},
{
"clip_ratio": 0.0,
"completion_length": 45.45,
"epoch": 2.3322306238185257,
"format_reward": 0.0,
"grad_norm": 5.070466995239258,
"image_reward": 0.242041015625,
"kl": 4.666208404302597,
"learning_rate": 5e-06,
"loss": -0.0147,
"reward": 0.29748362898826597,
"reward_std": 0.7133689053356648,
"rewards/reward_func": 0.29748362898826597,
"step": 9870,
"toxic_reward": 4.607629799842835
},
{
"clip_ratio": 0.0,
"completion_length": 48.7,
"epoch": 2.334593572778828,
"format_reward": 0.0,
"grad_norm": 18.918405532836914,
"image_reward": 0.2441070556640625,
"kl": 1.534792199730873,
"learning_rate": 5e-06,
"loss": -0.0478,
"reward": 1.092936259508133,
"reward_std": 1.04658992420882,
"rewards/reward_func": 1.092936259508133,
"step": 9880,
"toxic_reward": 4.236094212532043
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 2.3369565217391304,
"format_reward": -0.25,
"grad_norm": 2.751826524734497,
"image_reward": 0.239617919921875,
"kl": 4.680880203843117,
"learning_rate": 5e-06,
"loss": 0.0816,
"reward": 0.287129682302475,
"reward_std": 1.596864845789969,
"rewards/reward_func": 0.287129682302475,
"step": 9890,
"toxic_reward": 3.968970334529877
},
{
"clip_ratio": 0.0,
"completion_length": 37.05,
"epoch": 2.339319470699433,
"format_reward": 0.0,
"grad_norm": 7.023763179779053,
"image_reward": 0.22930908203125,
"kl": 2.7311850488185883,
"learning_rate": 5e-06,
"loss": 0.0339,
"reward": 0.4986346364021301,
"reward_std": 1.2056658655405044,
"rewards/reward_func": 0.4986346364021301,
"step": 9900,
"toxic_reward": 4.340516233444214
},
{
"clip_ratio": 0.0,
"completion_length": 41.425,
"epoch": 2.3416824196597354,
"format_reward": -0.5,
"grad_norm": 10.157588958740234,
"image_reward": 0.24829813539981843,
"kl": 2.7676683485507967,
"learning_rate": 5e-06,
"loss": 0.0746,
"reward": -0.3049712359905243,
"reward_std": 1.7655695647001266,
"rewards/reward_func": -0.3049712359905243,
"step": 9910,
"toxic_reward": 4.103359699249268
},
{
"clip_ratio": 0.0,
"completion_length": 45.525,
"epoch": 2.344045368620038,
"format_reward": -0.25,
"grad_norm": 13.108144760131836,
"image_reward": 0.22231547087430953,
"kl": 0.8436797827482223,
"learning_rate": 5e-06,
"loss": -0.0158,
"reward": 0.21243730187416077,
"reward_std": 1.2439281724393367,
"rewards/reward_func": 0.21243730187416077,
"step": 9920,
"toxic_reward": 4.508874106407165
},
{
"clip_ratio": 0.0,
"completion_length": 39.875,
"epoch": 2.34640831758034,
"format_reward": 0.0,
"grad_norm": 7.745576858520508,
"image_reward": 0.2392669677734375,
"kl": 2.106270205974579,
"learning_rate": 5e-06,
"loss": 0.0696,
"reward": 0.02986244559288025,
"reward_std": 0.5784997101873159,
"rewards/reward_func": 0.02986244559288025,
"step": 9930,
"toxic_reward": 4.542821955680847
},
{
"clip_ratio": 0.0,
"completion_length": 45.5,
"epoch": 2.3487712665406426,
"format_reward": 0.0,
"grad_norm": 12.980175018310547,
"image_reward": 0.26973876953125,
"kl": 6.715492057800293,
"learning_rate": 5e-06,
"loss": -0.0504,
"reward": 0.8542561173439026,
"reward_std": 0.8472011580131948,
"rewards/reward_func": 0.8542561173439026,
"step": 9940,
"toxic_reward": 4.496533703804016
},
{
"clip_ratio": 0.0,
"completion_length": 45.775,
"epoch": 2.351134215500945,
"format_reward": 0.0,
"grad_norm": 8.429265975952148,
"image_reward": 0.245904541015625,
"kl": 1.740691715478897,
"learning_rate": 5e-06,
"loss": 0.0812,
"reward": 0.524560397863388,
"reward_std": 0.6684761707670986,
"rewards/reward_func": 0.524560397863388,
"step": 9950,
"toxic_reward": 4.557698893547058
},
{
"clip_ratio": 0.0,
"completion_length": 49.725,
"epoch": 2.3534971644612477,
"format_reward": 0.0,
"grad_norm": 7.811772346496582,
"image_reward": 0.23507537841796874,
"kl": 4.798752707242966,
"learning_rate": 5e-06,
"loss": 0.0502,
"reward": 0.2622858464717865,
"reward_std": 0.7538703501224517,
"rewards/reward_func": 0.2622858464717865,
"step": 9960,
"toxic_reward": 4.338898825645447
},
{
"clip_ratio": 0.0,
"completion_length": 35.6,
"epoch": 2.35586011342155,
"format_reward": 0.0,
"grad_norm": 3.807326555252075,
"image_reward": 0.227703857421875,
"kl": 0.7807338133454322,
"learning_rate": 5e-06,
"loss": 0.0267,
"reward": 1.1823074579238892,
"reward_std": 1.5363767087459563,
"rewards/reward_func": 1.1823074579238892,
"step": 9970,
"toxic_reward": 4.390237951278687
},
{
"clip_ratio": 0.0,
"completion_length": 44.225,
"epoch": 2.3582230623818523,
"format_reward": 0.0,
"grad_norm": 1.2711127996444702,
"image_reward": 0.23590087890625,
"kl": 1.0028429985046388,
"learning_rate": 5e-06,
"loss": -0.0221,
"reward": 0.1531411349773407,
"reward_std": 0.5583895549178124,
"rewards/reward_func": 0.1531411349773407,
"step": 9980,
"toxic_reward": 4.677754878997803
},
{
"clip_ratio": 0.0,
"completion_length": 42.775,
"epoch": 2.360586011342155,
"format_reward": 0.0,
"grad_norm": 2.5527610778808594,
"image_reward": 0.24173583984375,
"kl": 1.1347097665071488,
"learning_rate": 5e-06,
"loss": 0.0127,
"reward": -0.07762867212295532,
"reward_std": 0.6002773646265268,
"rewards/reward_func": -0.07762867212295532,
"step": 9990,
"toxic_reward": 4.172585511207581
},
{
"clip_ratio": 0.0,
"completion_length": 38.05,
"epoch": 2.3629489603024574,
"format_reward": 0.0,
"grad_norm": 4.715011119842529,
"image_reward": 0.23904571533203126,
"kl": 7.07305488884449,
"learning_rate": 5e-06,
"loss": -0.021,
"reward": 0.26526654958724977,
"reward_std": 0.82782434374094,
"rewards/reward_func": 0.26526654958724977,
"step": 10000,
"toxic_reward": 4.33233335018158
}
],
"logging_steps": 10,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 24,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}