Game-RL-Qwen2.5-VL-7B / trainer_state.json
Gabriel166's picture
Upload folder using huggingface_hub
ce75af0 verified
{
"best_metric": 0.34333334282040595,
"best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/qwen2.5vl-7b-grpo_new_v20_5k/v13-20250325-021847/checkpoint-2475",
"epoch": 1.0,
"eval_steps": 250,
"global_step": 2475,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 359.125,
"epoch": 0.00040404040404040404,
"grad_norm": 1.364031546421686,
"kl": 0.0,
"learning_rate": 1.6129032258064515e-09,
"loss": -0.0474996417760849,
"memory(GiB)": 81.93,
"response_clip_ratio": 0.0,
"reward": 0.2083333432674408,
"reward_std": 0.25746434926986694,
"rewards/MultiModalAccuracyORM": 0.2083333432674408,
"step": 1,
"train_speed(iter/s)": 0.005983
},
{
"clip_ratio": 0.0,
"completion_length": 304.95833945274353,
"epoch": 0.00202020202020202,
"grad_norm": 1.6130071483346196,
"kl": 0.00015279650688171387,
"learning_rate": 8.064516129032257e-09,
"loss": -0.0010303221642971039,
"memory(GiB)": 86.73,
"response_clip_ratio": 0.0,
"reward": 0.052083334885537624,
"reward_std": 0.13339675217866898,
"rewards/MultiModalAccuracyORM": 0.052083334885537624,
"step": 5,
"train_speed(iter/s)": 0.019266
},
{
"clip_ratio": 0.0,
"completion_length": 297.46667594909667,
"epoch": 0.00404040404040404,
"grad_norm": 1.760454082663187,
"kl": 0.000270843505859375,
"learning_rate": 1.6129032258064514e-08,
"loss": 0.005405974388122558,
"memory(GiB)": 87.09,
"response_clip_ratio": 0.0,
"reward": 0.14166667312383652,
"reward_std": 0.26492767333984374,
"rewards/MultiModalAccuracyORM": 0.14166667312383652,
"step": 10,
"train_speed(iter/s)": 0.026623
},
{
"clip_ratio": 0.0,
"completion_length": 452.308349609375,
"epoch": 0.006060606060606061,
"grad_norm": 1.1507264780517972,
"kl": 0.0002508640289306641,
"learning_rate": 2.4193548387096773e-08,
"loss": 0.013352996110916138,
"memory(GiB)": 87.09,
"response_clip_ratio": 0.02500000074505806,
"reward": 0.34166667610406876,
"reward_std": 0.36744636595249175,
"rewards/MultiModalAccuracyORM": 0.34166667610406876,
"step": 15,
"train_speed(iter/s)": 0.027725
},
{
"clip_ratio": 0.0,
"completion_length": 291.9916717529297,
"epoch": 0.00808080808080808,
"grad_norm": 1.9440298564534324,
"kl": 0.00028104782104492186,
"learning_rate": 3.225806451612903e-08,
"loss": 0.006416285037994384,
"memory(GiB)": 87.09,
"response_clip_ratio": 0.0,
"reward": 0.2833333373069763,
"reward_std": 0.2916341096162796,
"rewards/MultiModalAccuracyORM": 0.2833333373069763,
"step": 20,
"train_speed(iter/s)": 0.031051
},
{
"clip_ratio": 0.0,
"completion_length": 378.5500061035156,
"epoch": 0.010101010101010102,
"grad_norm": 1.6907685802618988,
"kl": 0.0002666950225830078,
"learning_rate": 4.032258064516129e-08,
"loss": -0.018301564455032348,
"memory(GiB)": 87.09,
"response_clip_ratio": 0.0,
"reward": 0.30833334624767306,
"reward_std": 0.3720185041427612,
"rewards/MultiModalAccuracyORM": 0.30833334624767306,
"step": 25,
"train_speed(iter/s)": 0.032339
},
{
"clip_ratio": 0.0,
"completion_length": 370.2333450317383,
"epoch": 0.012121212121212121,
"grad_norm": 1.5722363224769262,
"kl": 0.0002593994140625,
"learning_rate": 4.8387096774193546e-08,
"loss": -0.027563482522964478,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000596046446,
"reward_std": 0.3226982891559601,
"rewards/MultiModalAccuracyORM": 0.25000000596046446,
"step": 30,
"train_speed(iter/s)": 0.032649
},
{
"clip_ratio": 0.0,
"completion_length": 398.5916778564453,
"epoch": 0.014141414141414142,
"grad_norm": 2.304234213678912,
"kl": 0.00022954940795898436,
"learning_rate": 5.645161290322581e-08,
"loss": 0.048061671853065493,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.1416666716337204,
"reward_std": 0.3226627051830292,
"rewards/MultiModalAccuracyORM": 0.1416666716337204,
"step": 35,
"train_speed(iter/s)": 0.033014
},
{
"clip_ratio": 0.0,
"completion_length": 274.97500972747804,
"epoch": 0.01616161616161616,
"grad_norm": 1.6894032790709004,
"kl": 0.0002648591995239258,
"learning_rate": 6.451612903225806e-08,
"loss": 0.012092837691307068,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666753590107,
"reward_std": 0.222271066904068,
"rewards/MultiModalAccuracyORM": 0.2666666753590107,
"step": 40,
"train_speed(iter/s)": 0.034411
},
{
"clip_ratio": 0.0,
"completion_length": 421.9333435058594,
"epoch": 0.01818181818181818,
"grad_norm": 1.9171038477045215,
"kl": 0.00023059844970703126,
"learning_rate": 7.258064516129032e-08,
"loss": -0.0132610023021698,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333879709244,
"reward_std": 0.2489179015159607,
"rewards/MultiModalAccuracyORM": 0.15833333879709244,
"step": 45,
"train_speed(iter/s)": 0.034702
},
{
"clip_ratio": 0.0,
"completion_length": 444.20001525878905,
"epoch": 0.020202020202020204,
"grad_norm": 1.795783985834061,
"kl": 0.00021610260009765624,
"learning_rate": 8.064516129032257e-08,
"loss": 0.055432689189910886,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333730697633,
"reward_std": 0.320406436920166,
"rewards/MultiModalAccuracyORM": 0.13333333730697633,
"step": 50,
"train_speed(iter/s)": 0.034713
},
{
"clip_ratio": 0.0,
"completion_length": 271.8500068664551,
"epoch": 0.022222222222222223,
"grad_norm": 1.570392013394559,
"kl": 0.00024003982543945311,
"learning_rate": 8.870967741935484e-08,
"loss": 0.0527652382850647,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000968575477,
"reward_std": 0.24862808585166932,
"rewards/MultiModalAccuracyORM": 0.17500000968575477,
"step": 55,
"train_speed(iter/s)": 0.035397
},
{
"clip_ratio": 0.0,
"completion_length": 240.03333892822266,
"epoch": 0.024242424242424242,
"grad_norm": 1.7404447091659765,
"kl": 0.00024061203002929689,
"learning_rate": 9.677419354838709e-08,
"loss": -0.06867231130599975,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667982935904,
"reward_std": 0.33052347004413607,
"rewards/MultiModalAccuracyORM": 0.39166667982935904,
"step": 60,
"train_speed(iter/s)": 0.036121
},
{
"clip_ratio": 0.0,
"completion_length": 449.5083480834961,
"epoch": 0.026262626262626262,
"grad_norm": 1.770871195621109,
"kl": 0.0002596855163574219,
"learning_rate": 1.0483870967741934e-07,
"loss": 0.019220371544361115,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.1416666701436043,
"reward_std": 0.27753120064735415,
"rewards/MultiModalAccuracyORM": 0.1416666701436043,
"step": 65,
"train_speed(iter/s)": 0.035829
},
{
"clip_ratio": 0.0,
"completion_length": 307.05834197998047,
"epoch": 0.028282828282828285,
"grad_norm": 1.1236406922162803,
"kl": 0.00025534629821777344,
"learning_rate": 1.1290322580645162e-07,
"loss": 0.006563323736190796,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833334252238274,
"reward_std": 0.18108985424041749,
"rewards/MultiModalAccuracyORM": 0.15833334252238274,
"step": 70,
"train_speed(iter/s)": 0.036273
},
{
"clip_ratio": 0.0,
"completion_length": 285.05833969116213,
"epoch": 0.030303030303030304,
"grad_norm": 2.2244576725130276,
"kl": 0.00026721954345703124,
"learning_rate": 1.2096774193548387e-07,
"loss": 0.021188412606716157,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333333805203437,
"reward_std": 0.3494287371635437,
"rewards/MultiModalAccuracyORM": 0.28333333805203437,
"step": 75,
"train_speed(iter/s)": 0.036577
},
{
"clip_ratio": 0.0,
"completion_length": 365.70000381469725,
"epoch": 0.03232323232323232,
"grad_norm": 2.238393674944575,
"kl": 0.00026388168334960936,
"learning_rate": 1.2903225806451611e-07,
"loss": 0.029351598024368285,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.01666666716337204,
"reward": 0.22500000521540642,
"reward_std": 0.279270276427269,
"rewards/MultiModalAccuracyORM": 0.22500000521540642,
"step": 80,
"train_speed(iter/s)": 0.036263
},
{
"clip_ratio": 0.0,
"completion_length": 245.05000381469728,
"epoch": 0.03434343434343434,
"grad_norm": 1.5092959560425367,
"kl": 0.00028471946716308595,
"learning_rate": 1.3709677419354838e-07,
"loss": -0.036607831716537476,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333334177732467,
"reward_std": 0.39707074165344236,
"rewards/MultiModalAccuracyORM": 0.28333334177732467,
"step": 85,
"train_speed(iter/s)": 0.035112
},
{
"clip_ratio": 0.0,
"completion_length": 359.3000152587891,
"epoch": 0.03636363636363636,
"grad_norm": 1.983727747725694,
"kl": 0.0002570152282714844,
"learning_rate": 1.4516129032258064e-07,
"loss": 0.02973529100418091,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000447034836,
"reward_std": 0.27928483188152314,
"rewards/MultiModalAccuracyORM": 0.17500000447034836,
"step": 90,
"train_speed(iter/s)": 0.035019
},
{
"clip_ratio": 0.0,
"completion_length": 420.7333511352539,
"epoch": 0.03838383838383838,
"grad_norm": 1.6243054678942601,
"kl": 0.00022783279418945313,
"learning_rate": 1.5322580645161288e-07,
"loss": -0.030441620945930482,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.23333333805203438,
"reward_std": 0.35868159830570223,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 95,
"train_speed(iter/s)": 0.035038
},
{
"clip_ratio": 0.0,
"completion_length": 320.6583419799805,
"epoch": 0.04040404040404041,
"grad_norm": 1.5278965004190905,
"kl": 0.00023970603942871093,
"learning_rate": 1.6129032258064515e-07,
"loss": 0.014825087785720826,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3833333417773247,
"reward_std": 0.24560283720493317,
"rewards/MultiModalAccuracyORM": 0.3833333417773247,
"step": 100,
"train_speed(iter/s)": 0.035336
},
{
"clip_ratio": 0.0,
"completion_length": 367.6000091552734,
"epoch": 0.04242424242424243,
"grad_norm": 2.275003739183734,
"kl": 0.0002989768981933594,
"learning_rate": 1.6935483870967741e-07,
"loss": 0.021370184421539307,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333410322666,
"reward_std": 0.31520852744579314,
"rewards/MultiModalAccuracyORM": 0.3083333410322666,
"step": 105,
"train_speed(iter/s)": 0.035535
},
{
"clip_ratio": 0.0,
"completion_length": 375.37500915527346,
"epoch": 0.044444444444444446,
"grad_norm": 1.3264840189361857,
"kl": 0.00028629302978515624,
"learning_rate": 1.7741935483870968e-07,
"loss": 0.013422733545303345,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1416666701436043,
"reward_std": 0.24885829985141755,
"rewards/MultiModalAccuracyORM": 0.1416666701436043,
"step": 110,
"train_speed(iter/s)": 0.035867
},
{
"clip_ratio": 0.0,
"completion_length": 400.7583488464355,
"epoch": 0.046464646464646465,
"grad_norm": 0.0068729642108505875,
"kl": 0.00022754669189453124,
"learning_rate": 1.8548387096774192e-07,
"loss": 0.007101482152938843,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.01666666716337204,
"reward": 0.24166667386889457,
"reward_std": 0.23854664266109465,
"rewards/MultiModalAccuracyORM": 0.24166667386889457,
"step": 115,
"train_speed(iter/s)": 0.035529
},
{
"clip_ratio": 0.0,
"completion_length": 358.0500122070313,
"epoch": 0.048484848484848485,
"grad_norm": 1.666888807483155,
"kl": 0.00029277801513671875,
"learning_rate": 1.9354838709677418e-07,
"loss": -0.013055479526519776,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833334028720857,
"reward_std": 0.24041947722434998,
"rewards/MultiModalAccuracyORM": 0.20833334028720857,
"step": 120,
"train_speed(iter/s)": 0.035843
},
{
"clip_ratio": 0.0,
"completion_length": 285.0916717529297,
"epoch": 0.050505050505050504,
"grad_norm": 3.6057797063570765,
"kl": 0.00020406246185302734,
"learning_rate": 2e-07,
"loss": 0.029223644733428956,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2750000052154064,
"reward_std": 0.27371591329574585,
"rewards/MultiModalAccuracyORM": 0.2750000052154064,
"step": 125,
"train_speed(iter/s)": 0.036026
},
{
"clip_ratio": 0.0,
"completion_length": 488.7583526611328,
"epoch": 0.052525252525252523,
"grad_norm": 1.7900187922950372,
"kl": 0.00025043487548828127,
"learning_rate": 2e-07,
"loss": 0.0551780104637146,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333484828472,
"reward_std": 0.3641817569732666,
"rewards/MultiModalAccuracyORM": 0.2833333484828472,
"step": 130,
"train_speed(iter/s)": 0.036075
},
{
"clip_ratio": 0.0,
"completion_length": 330.5000072479248,
"epoch": 0.05454545454545454,
"grad_norm": 2.529917707084592,
"kl": 0.0002529144287109375,
"learning_rate": 2e-07,
"loss": 0.02438216805458069,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.3416666708886623,
"reward_std": 0.279270276427269,
"rewards/MultiModalAccuracyORM": 0.3416666708886623,
"step": 135,
"train_speed(iter/s)": 0.036092
},
{
"clip_ratio": 0.0,
"completion_length": 373.6333465576172,
"epoch": 0.05656565656565657,
"grad_norm": 1.3049814649570146,
"kl": 0.0002875804901123047,
"learning_rate": 2e-07,
"loss": -0.022501662373542786,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3833333469927311,
"reward_std": 0.34958777129650115,
"rewards/MultiModalAccuracyORM": 0.3833333469927311,
"step": 140,
"train_speed(iter/s)": 0.036095
},
{
"clip_ratio": 0.0,
"completion_length": 345.41668395996095,
"epoch": 0.05858585858585859,
"grad_norm": 1.8437868971897566,
"kl": 0.00023627281188964844,
"learning_rate": 2e-07,
"loss": 0.06273630857467652,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.3666666768491268,
"reward_std": 0.3914994150400162,
"rewards/MultiModalAccuracyORM": 0.3666666768491268,
"step": 145,
"train_speed(iter/s)": 0.036226
},
{
"clip_ratio": 0.0,
"completion_length": 266.51667098999025,
"epoch": 0.06060606060606061,
"grad_norm": 1.0785517011291799,
"kl": 0.00021938085556030273,
"learning_rate": 2e-07,
"loss": 0.02771698534488678,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.400000012665987,
"reward_std": 0.3516494154930115,
"rewards/MultiModalAccuracyORM": 0.400000012665987,
"step": 150,
"train_speed(iter/s)": 0.036427
},
{
"clip_ratio": 0.0,
"completion_length": 333.3500152587891,
"epoch": 0.06262626262626263,
"grad_norm": 12.619972342482905,
"kl": 0.00030460357666015623,
"learning_rate": 2e-07,
"loss": -0.06058757305145264,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2250000074505806,
"reward_std": 0.37600439190864565,
"rewards/MultiModalAccuracyORM": 0.2250000074505806,
"step": 155,
"train_speed(iter/s)": 0.036609
},
{
"clip_ratio": 0.0,
"completion_length": 358.7416732788086,
"epoch": 0.06464646464646465,
"grad_norm": 1.306377595968382,
"kl": 0.00027475357055664065,
"learning_rate": 2e-07,
"loss": -0.00979010909795761,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.15833333432674407,
"reward_std": 0.28456337153911593,
"rewards/MultiModalAccuracyORM": 0.15833333432674407,
"step": 160,
"train_speed(iter/s)": 0.036431
},
{
"clip_ratio": 0.0,
"completion_length": 324.9916763305664,
"epoch": 0.06666666666666667,
"grad_norm": 0.9830762972924579,
"kl": 0.00030498504638671876,
"learning_rate": 2e-07,
"loss": -0.008201467990875243,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.10000000149011612,
"reward_std": 0.2260383188724518,
"rewards/MultiModalAccuracyORM": 0.10000000149011612,
"step": 165,
"train_speed(iter/s)": 0.036665
},
{
"clip_ratio": 0.0,
"completion_length": 249.37500610351563,
"epoch": 0.06868686868686869,
"grad_norm": 2.1917101699979287,
"kl": 0.00025620460510253904,
"learning_rate": 2e-07,
"loss": 0.016992685198783875,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000037252903,
"reward_std": 0.330559054017067,
"rewards/MultiModalAccuracyORM": 0.3500000037252903,
"step": 170,
"train_speed(iter/s)": 0.036951
},
{
"clip_ratio": 0.0,
"completion_length": 358.87500762939453,
"epoch": 0.0707070707070707,
"grad_norm": 1.0748542635448965,
"kl": 0.0002711296081542969,
"learning_rate": 2e-07,
"loss": 0.010954010486602783,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833333656191827,
"reward_std": 0.22400068640708923,
"rewards/MultiModalAccuracyORM": 0.20833333656191827,
"step": 175,
"train_speed(iter/s)": 0.037203
},
{
"clip_ratio": 0.0,
"completion_length": 313.62500762939453,
"epoch": 0.07272727272727272,
"grad_norm": 2.2725379948331543,
"kl": 0.00025653839111328125,
"learning_rate": 2e-07,
"loss": 0.03469780087471008,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.16666667237877847,
"reward_std": 0.3332285821437836,
"rewards/MultiModalAccuracyORM": 0.16666667237877847,
"step": 180,
"train_speed(iter/s)": 0.037355
},
{
"clip_ratio": 0.0,
"completion_length": 271.96667327880857,
"epoch": 0.07474747474747474,
"grad_norm": 1.4486054691502512,
"kl": 0.0002918243408203125,
"learning_rate": 2e-07,
"loss": -0.009673595428466797,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333373069763,
"reward_std": 0.102961727976799,
"rewards/MultiModalAccuracyORM": 0.3083333373069763,
"step": 185,
"train_speed(iter/s)": 0.037564
},
{
"clip_ratio": 0.0,
"completion_length": 403.25834197998046,
"epoch": 0.07676767676767676,
"grad_norm": 3.170971594101629,
"kl": 0.00025038719177246095,
"learning_rate": 2e-07,
"loss": 0.0012440800666809082,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.01666666716337204,
"reward": 0.24166667014360427,
"reward_std": 0.30789810717105864,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 190,
"train_speed(iter/s)": 0.037415
},
{
"clip_ratio": 0.0,
"completion_length": 294.62500991821287,
"epoch": 0.07878787878787878,
"grad_norm": 1.98318367969525,
"kl": 0.00029687881469726564,
"learning_rate": 2e-07,
"loss": 0.008435648679733277,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.33333334028720857,
"reward_std": 0.24741607010364533,
"rewards/MultiModalAccuracyORM": 0.33333334028720857,
"step": 195,
"train_speed(iter/s)": 0.037342
},
{
"clip_ratio": 0.0,
"completion_length": 374.6333465576172,
"epoch": 0.08080808080808081,
"grad_norm": 1.503273341785427,
"kl": 0.000333404541015625,
"learning_rate": 2e-07,
"loss": 0.005708768963813782,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666667237877845,
"reward_std": 0.3603756338357925,
"rewards/MultiModalAccuracyORM": 0.26666667237877845,
"step": 200,
"train_speed(iter/s)": 0.037521
},
{
"clip_ratio": 0.0,
"completion_length": 379.68334407806395,
"epoch": 0.08282828282828283,
"grad_norm": 0.5199716532978094,
"kl": 0.0004832744598388672,
"learning_rate": 2e-07,
"loss": -0.014856468141078948,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.33937130570411683,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 205,
"train_speed(iter/s)": 0.037585
},
{
"clip_ratio": 0.0,
"completion_length": 305.3916732788086,
"epoch": 0.08484848484848485,
"grad_norm": 2.1287930828371358,
"kl": 0.000292205810546875,
"learning_rate": 2e-07,
"loss": 0.001297689974308014,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667386889457,
"reward_std": 0.32771685123443606,
"rewards/MultiModalAccuracyORM": 0.21666667386889457,
"step": 210,
"train_speed(iter/s)": 0.037751
},
{
"clip_ratio": 0.0,
"completion_length": 345.49167213439944,
"epoch": 0.08686868686868687,
"grad_norm": 1.7796242872827708,
"kl": 0.00042543411254882815,
"learning_rate": 2e-07,
"loss": -0.006988461315631867,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333333656191826,
"reward_std": 0.2692273885011673,
"rewards/MultiModalAccuracyORM": 0.23333333656191826,
"step": 215,
"train_speed(iter/s)": 0.037754
},
{
"clip_ratio": 0.0,
"completion_length": 314.81667442321776,
"epoch": 0.08888888888888889,
"grad_norm": 1.7638027896241226,
"kl": 0.0006679534912109375,
"learning_rate": 2e-07,
"loss": 0.006352822482585907,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333507180214,
"reward_std": 0.25008893609046934,
"rewards/MultiModalAccuracyORM": 0.15833333507180214,
"step": 220,
"train_speed(iter/s)": 0.03785
},
{
"clip_ratio": 0.0,
"completion_length": 311.2750076293945,
"epoch": 0.09090909090909091,
"grad_norm": 0.012708836578688367,
"kl": 0.00029745101928710935,
"learning_rate": 2e-07,
"loss": 0.0504034161567688,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30000000149011613,
"reward_std": 0.3164917230606079,
"rewards/MultiModalAccuracyORM": 0.30000000149011613,
"step": 225,
"train_speed(iter/s)": 0.038015
},
{
"clip_ratio": 0.0,
"completion_length": 265.6750061035156,
"epoch": 0.09292929292929293,
"grad_norm": 2.064611776487197,
"kl": 0.000385284423828125,
"learning_rate": 2e-07,
"loss": 0.07023286819458008,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15000000298023225,
"reward_std": 0.2650228708982468,
"rewards/MultiModalAccuracyORM": 0.15000000298023225,
"step": 230,
"train_speed(iter/s)": 0.03818
},
{
"clip_ratio": 0.0,
"completion_length": 371.5416793823242,
"epoch": 0.09494949494949495,
"grad_norm": 1.949431436305181,
"kl": 0.0002506256103515625,
"learning_rate": 2e-07,
"loss": 0.01011454164981842,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3333333469927311,
"reward_std": 0.3637147039175034,
"rewards/MultiModalAccuracyORM": 0.3333333469927311,
"step": 235,
"train_speed(iter/s)": 0.03819
},
{
"clip_ratio": 0.0,
"completion_length": 360.533341217041,
"epoch": 0.09696969696969697,
"grad_norm": 0.5471178347466235,
"kl": 0.0010341405868530273,
"learning_rate": 2e-07,
"loss": -0.0015352100133895874,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.01666666716337204,
"reward": 0.28333333805203437,
"reward_std": 0.3511823683977127,
"rewards/MultiModalAccuracyORM": 0.28333333805203437,
"step": 240,
"train_speed(iter/s)": 0.037977
},
{
"clip_ratio": 0.0,
"completion_length": 336.3000129699707,
"epoch": 0.09898989898989899,
"grad_norm": 2.3165413137247333,
"kl": 0.00027217864990234373,
"learning_rate": 2e-07,
"loss": 0.0210051491856575,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.32500000968575476,
"reward_std": 0.38450039029121397,
"rewards/MultiModalAccuracyORM": 0.32500000968575476,
"step": 245,
"train_speed(iter/s)": 0.037993
},
{
"epoch": 0.10101010101010101,
"grad_norm": 2.645674704495033,
"learning_rate": 2e-07,
"loss": -0.03384391665458679,
"memory(GiB)": 87.45,
"step": 250,
"train_speed(iter/s)": 0.038032
},
{
"epoch": 0.10101010101010101,
"eval_clip_ratio": 0.0,
"eval_completion_length": 334.34500762939456,
"eval_kl": 0.0004983329772949218,
"eval_loss": 0.023834386840462685,
"eval_response_clip_ratio": 0.003333333432674408,
"eval_reward": 0.24666667267680167,
"eval_reward_std": 0.30061395645141603,
"eval_rewards/MultiModalAccuracyORM": 0.24666667267680167,
"eval_runtime": 585.2435,
"eval_samples_per_second": 0.085,
"eval_steps_per_second": 0.009,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 369.79583778381345,
"epoch": 0.10303030303030303,
"grad_norm": 1.5910045148895993,
"kl": 0.0006116151809692383,
"learning_rate": 2e-07,
"loss": -0.05511324405670166,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667647659776,
"reward_std": 0.3701108664274216,
"rewards/MultiModalAccuracyORM": 0.34166667647659776,
"step": 255,
"train_speed(iter/s)": 0.03329
},
{
"clip_ratio": 0.0,
"completion_length": 287.85,
"epoch": 0.10505050505050505,
"grad_norm": 1.8789057522234565,
"kl": 0.0006687164306640625,
"learning_rate": 2e-07,
"loss": 0.08147464394569397,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000037252903,
"reward_std": 0.3494287371635437,
"rewards/MultiModalAccuracyORM": 0.3500000037252903,
"step": 260,
"train_speed(iter/s)": 0.033421
},
{
"clip_ratio": 0.0,
"completion_length": 327.0,
"epoch": 0.10707070707070707,
"grad_norm": 1.685788699755795,
"kl": 0.00030879974365234376,
"learning_rate": 2e-07,
"loss": 0.0021983295679092406,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333358168602,
"reward_std": 0.3010816007852554,
"rewards/MultiModalAccuracyORM": 0.2083333358168602,
"step": 265,
"train_speed(iter/s)": 0.033374
},
{
"clip_ratio": 0.0,
"completion_length": 380.5,
"epoch": 0.10909090909090909,
"grad_norm": 2.9700739773322695,
"kl": 0.00040111541748046877,
"learning_rate": 2e-07,
"loss": -0.004064649343490601,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333730697632,
"reward_std": 0.33526621460914613,
"rewards/MultiModalAccuracyORM": 0.15833333730697632,
"step": 270,
"train_speed(iter/s)": 0.033364
},
{
"clip_ratio": 0.0,
"completion_length": 324.25,
"epoch": 0.1111111111111111,
"grad_norm": 1.5939506920216808,
"kl": 0.00045032501220703124,
"learning_rate": 2e-07,
"loss": 0.026332959532737732,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.2526735752820969,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 275,
"train_speed(iter/s)": 0.033468
},
{
"clip_ratio": 0.0,
"completion_length": 496.5,
"epoch": 0.11313131313131314,
"grad_norm": 1.3058289755881347,
"kl": 0.000375831127166748,
"learning_rate": 2e-07,
"loss": 0.027166426181793213,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.32500001341104506,
"reward_std": 0.37195890247821806,
"rewards/MultiModalAccuracyORM": 0.32500001341104506,
"step": 280,
"train_speed(iter/s)": 0.033383
},
{
"clip_ratio": 0.0,
"completion_length": 361.05,
"epoch": 0.11515151515151516,
"grad_norm": 0.5211592745612927,
"kl": 0.0004334449768066406,
"learning_rate": 2e-07,
"loss": -0.001045474410057068,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000149011612,
"reward_std": 0.1808116167783737,
"rewards/MultiModalAccuracyORM": 0.22500000149011612,
"step": 285,
"train_speed(iter/s)": 0.03333
},
{
"clip_ratio": 0.0,
"completion_length": 345.25,
"epoch": 0.11717171717171718,
"grad_norm": 1.9995357461573446,
"kl": 0.0005333900451660156,
"learning_rate": 2e-07,
"loss": -0.00281745046377182,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333656191825,
"reward_std": 0.3385071337223053,
"rewards/MultiModalAccuracyORM": 0.18333333656191825,
"step": 290,
"train_speed(iter/s)": 0.033413
},
{
"clip_ratio": 0.0,
"completion_length": 387.1,
"epoch": 0.1191919191919192,
"grad_norm": 3.694756818436622,
"kl": 0.0010143280029296874,
"learning_rate": 2e-07,
"loss": -0.003062787652015686,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333879709244,
"reward_std": 0.314164274930954,
"rewards/MultiModalAccuracyORM": 0.15833333879709244,
"step": 295,
"train_speed(iter/s)": 0.03345
},
{
"clip_ratio": 0.0,
"completion_length": 393.25,
"epoch": 0.12121212121212122,
"grad_norm": 1.5577866137872902,
"kl": 0.00044269561767578124,
"learning_rate": 2e-07,
"loss": -0.022827643156051635,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2583333447575569,
"reward_std": 0.3393001317977905,
"rewards/MultiModalAccuracyORM": 0.2583333447575569,
"step": 300,
"train_speed(iter/s)": 0.033327
},
{
"clip_ratio": 0.0,
"completion_length": 416.25,
"epoch": 0.12323232323232323,
"grad_norm": 0.8793802822161716,
"kl": 0.00045299530029296875,
"learning_rate": 2e-07,
"loss": 0.039026769995689395,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2333333395421505,
"reward_std": 0.33277973234653474,
"rewards/MultiModalAccuracyORM": 0.2333333395421505,
"step": 305,
"train_speed(iter/s)": 0.032887
},
{
"clip_ratio": 0.0,
"completion_length": 334.3,
"epoch": 0.12525252525252525,
"grad_norm": 1.9841151826732792,
"kl": 0.0006313323974609375,
"learning_rate": 2e-07,
"loss": -0.006224775314331054,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334252238275,
"reward_std": 0.31441850066184995,
"rewards/MultiModalAccuracyORM": 0.23333334252238275,
"step": 310,
"train_speed(iter/s)": 0.032913
},
{
"clip_ratio": 0.0,
"completion_length": 537.7,
"epoch": 0.12727272727272726,
"grad_norm": 1.2729907719968943,
"kl": 0.0007027626037597656,
"learning_rate": 2e-07,
"loss": 0.014832744002342224,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.11666666939854622,
"reward_std": 0.25891573131084444,
"rewards/MultiModalAccuracyORM": 0.11666666939854622,
"step": 315,
"train_speed(iter/s)": 0.032886
},
{
"clip_ratio": 0.0,
"completion_length": 282.8,
"epoch": 0.1292929292929293,
"grad_norm": 0.9148877498687834,
"kl": 0.000760650634765625,
"learning_rate": 2e-07,
"loss": 0.06303757429122925,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.2323044866323471,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 320,
"train_speed(iter/s)": 0.032974
},
{
"clip_ratio": 0.0,
"completion_length": 404.8,
"epoch": 0.13131313131313133,
"grad_norm": 2.00474803214382,
"kl": 0.0007790565490722656,
"learning_rate": 2e-07,
"loss": 0.02660681903362274,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333656191826,
"reward_std": 0.2486636757850647,
"rewards/MultiModalAccuracyORM": 0.13333333656191826,
"step": 325,
"train_speed(iter/s)": 0.033068
},
{
"clip_ratio": 0.0,
"completion_length": 333.8,
"epoch": 0.13333333333333333,
"grad_norm": 1.6448765146368245,
"kl": 0.0005625724792480469,
"learning_rate": 2e-07,
"loss": 0.024477413296699523,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.19166667237877846,
"reward_std": 0.2629852324724197,
"rewards/MultiModalAccuracyORM": 0.19166667237877846,
"step": 330,
"train_speed(iter/s)": 0.0332
},
{
"clip_ratio": 0.0,
"completion_length": 330.25,
"epoch": 0.13535353535353536,
"grad_norm": 2.2001765187520776,
"kl": 0.0006697654724121093,
"learning_rate": 2e-07,
"loss": 0.07480921745300292,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500001043081285,
"reward_std": 0.37195890247821806,
"rewards/MultiModalAccuracyORM": 0.27500001043081285,
"step": 335,
"train_speed(iter/s)": 0.033276
},
{
"clip_ratio": 0.0,
"completion_length": 386.1,
"epoch": 0.13737373737373737,
"grad_norm": 0.6836764259374134,
"kl": 0.0006744384765625,
"learning_rate": 2e-07,
"loss": 0.050872421264648436,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 340,
"train_speed(iter/s)": 0.033397
},
{
"clip_ratio": 0.0,
"completion_length": 430.0,
"epoch": 0.1393939393939394,
"grad_norm": 0.02974363962833146,
"kl": 0.0007775306701660156,
"learning_rate": 2e-07,
"loss": -0.00942653715610504,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1500000037252903,
"reward_std": 0.1933199405670166,
"rewards/MultiModalAccuracyORM": 0.1500000037252903,
"step": 345,
"train_speed(iter/s)": 0.033409
},
{
"clip_ratio": 0.0,
"completion_length": 406.15,
"epoch": 0.1414141414141414,
"grad_norm": 2.153809687333121,
"kl": 0.00106048583984375,
"learning_rate": 2e-07,
"loss": -0.04788823127746582,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3833333425223827,
"reward_std": 0.3908641755580902,
"rewards/MultiModalAccuracyORM": 0.3833333425223827,
"step": 350,
"train_speed(iter/s)": 0.033484
},
{
"clip_ratio": 0.0,
"completion_length": 273.95,
"epoch": 0.14343434343434344,
"grad_norm": 2.9003800421035084,
"kl": 0.001187896728515625,
"learning_rate": 2e-07,
"loss": -0.025590839982032775,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1500000014901161,
"reward_std": 0.24484840035438538,
"rewards/MultiModalAccuracyORM": 0.1500000014901161,
"step": 355,
"train_speed(iter/s)": 0.033613
},
{
"clip_ratio": 0.0,
"completion_length": 258.15,
"epoch": 0.14545454545454545,
"grad_norm": 1.3041121484800926,
"kl": 0.001438140869140625,
"learning_rate": 2e-07,
"loss": 0.10738253593444824,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333656191825,
"reward_std": 0.3196970522403717,
"rewards/MultiModalAccuracyORM": 0.18333333656191825,
"step": 360,
"train_speed(iter/s)": 0.033727
},
{
"clip_ratio": 0.0,
"completion_length": 380.15,
"epoch": 0.14747474747474748,
"grad_norm": 0.8360441109730193,
"kl": 0.00127105712890625,
"learning_rate": 2e-07,
"loss": -0.003975853323936462,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.05000000149011612,
"reward_std": 0.13558491468429565,
"rewards/MultiModalAccuracyORM": 0.05000000149011612,
"step": 365,
"train_speed(iter/s)": 0.033745
},
{
"clip_ratio": 0.0,
"completion_length": 296.6,
"epoch": 0.1494949494949495,
"grad_norm": 2.3979328705343153,
"kl": 0.001323699951171875,
"learning_rate": 2e-07,
"loss": -0.048431962728500366,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000521540644,
"reward_std": 0.35312480926513673,
"rewards/MultiModalAccuracyORM": 0.25000000521540644,
"step": 370,
"train_speed(iter/s)": 0.033877
},
{
"clip_ratio": 0.0,
"completion_length": 345.2,
"epoch": 0.15151515151515152,
"grad_norm": 1.5241819642025198,
"kl": 0.0015224456787109376,
"learning_rate": 2e-07,
"loss": 0.08156558275222778,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666768491268,
"reward_std": 0.30183603167533873,
"rewards/MultiModalAccuracyORM": 0.3916666768491268,
"step": 375,
"train_speed(iter/s)": 0.033941
},
{
"clip_ratio": 0.0,
"completion_length": 318.7,
"epoch": 0.15353535353535352,
"grad_norm": 1.4091270455051919,
"kl": 0.0014804840087890626,
"learning_rate": 2e-07,
"loss": -0.005422207713127136,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.29863070249557494,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 380,
"train_speed(iter/s)": 0.03401
},
{
"clip_ratio": 0.0,
"completion_length": 332.4,
"epoch": 0.15555555555555556,
"grad_norm": 1.7741695775671322,
"kl": 0.0017261505126953125,
"learning_rate": 2e-07,
"loss": 0.013069793581962585,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35833334028720853,
"reward_std": 0.41791602969169617,
"rewards/MultiModalAccuracyORM": 0.35833334028720853,
"step": 385,
"train_speed(iter/s)": 0.034132
},
{
"clip_ratio": 0.0,
"completion_length": 325.15,
"epoch": 0.15757575757575756,
"grad_norm": 2.1621073881433954,
"kl": 0.001946258544921875,
"learning_rate": 2e-07,
"loss": 0.018825350701808928,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2000000074505806,
"reward_std": 0.329024064540863,
"rewards/MultiModalAccuracyORM": 0.2000000074505806,
"step": 390,
"train_speed(iter/s)": 0.034186
},
{
"clip_ratio": 0.0,
"completion_length": 393.7,
"epoch": 0.1595959595959596,
"grad_norm": 1.8573956206789706,
"kl": 0.0013622283935546876,
"learning_rate": 2e-07,
"loss": 0.01834181547164917,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2583333380520344,
"reward_std": 0.33226497769355773,
"rewards/MultiModalAccuracyORM": 0.2583333380520344,
"step": 395,
"train_speed(iter/s)": 0.034168
},
{
"clip_ratio": 0.0,
"completion_length": 319.05,
"epoch": 0.16161616161616163,
"grad_norm": 2.2110728171395646,
"kl": 0.0019084930419921875,
"learning_rate": 2e-07,
"loss": 0.019550779461860658,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000521540642,
"reward_std": 0.3008869707584381,
"rewards/MultiModalAccuracyORM": 0.20000000521540642,
"step": 400,
"train_speed(iter/s)": 0.034255
},
{
"clip_ratio": 0.0,
"completion_length": 263.5,
"epoch": 0.16363636363636364,
"grad_norm": 2.2884019112467,
"kl": 0.00233917236328125,
"learning_rate": 2e-07,
"loss": 0.00730045884847641,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30000000819563866,
"reward_std": 0.32297652661800386,
"rewards/MultiModalAccuracyORM": 0.30000000819563866,
"step": 405,
"train_speed(iter/s)": 0.034354
},
{
"clip_ratio": 0.0,
"completion_length": 366.95,
"epoch": 0.16565656565656567,
"grad_norm": 3.384921120442682,
"kl": 0.001834869384765625,
"learning_rate": 2e-07,
"loss": 0.02867870032787323,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30833334401249884,
"reward_std": 0.3604020655155182,
"rewards/MultiModalAccuracyORM": 0.30833334401249884,
"step": 410,
"train_speed(iter/s)": 0.034303
},
{
"clip_ratio": 0.0,
"completion_length": 380.85,
"epoch": 0.16767676767676767,
"grad_norm": 2.578682884841481,
"kl": 0.0019824981689453127,
"learning_rate": 2e-07,
"loss": 0.007520823180675507,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833333656191827,
"reward_std": 0.24105713665485382,
"rewards/MultiModalAccuracyORM": 0.20833333656191827,
"step": 415,
"train_speed(iter/s)": 0.034306
},
{
"clip_ratio": 0.0,
"completion_length": 223.9,
"epoch": 0.1696969696969697,
"grad_norm": 2.841135168153006,
"kl": 0.003629302978515625,
"learning_rate": 2e-07,
"loss": 0.008403807878494263,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4166666746139526,
"reward_std": 0.31846399009227755,
"rewards/MultiModalAccuracyORM": 0.4166666746139526,
"step": 420,
"train_speed(iter/s)": 0.034394
},
{
"clip_ratio": 0.0,
"completion_length": 406.0,
"epoch": 0.1717171717171717,
"grad_norm": 1.3952154788825455,
"kl": 0.0026947021484375,
"learning_rate": 2e-07,
"loss": 0.016321972012519836,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3750000104308128,
"reward_std": 0.3541358977556229,
"rewards/MultiModalAccuracyORM": 0.3750000104308128,
"step": 425,
"train_speed(iter/s)": 0.034427
},
{
"clip_ratio": 0.0,
"completion_length": 426.85,
"epoch": 0.17373737373737375,
"grad_norm": 2.642228792263709,
"kl": 0.0035511016845703124,
"learning_rate": 2e-07,
"loss": 0.04757256805896759,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000059604645,
"reward_std": 0.27122943103313446,
"rewards/MultiModalAccuracyORM": 0.3000000059604645,
"step": 430,
"train_speed(iter/s)": 0.034492
},
{
"clip_ratio": 0.0,
"completion_length": 357.9,
"epoch": 0.17575757575757575,
"grad_norm": 2.3061110590781433,
"kl": 0.0025909423828125,
"learning_rate": 2e-07,
"loss": -0.02955559492111206,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000819563863,
"reward_std": 0.42218015491962435,
"rewards/MultiModalAccuracyORM": 0.27500000819563863,
"step": 435,
"train_speed(iter/s)": 0.034539
},
{
"clip_ratio": 0.0,
"completion_length": 353.9,
"epoch": 0.17777777777777778,
"grad_norm": 0.03487250614691778,
"kl": 0.00295562744140625,
"learning_rate": 2e-07,
"loss": 0.03084596395492554,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333432674407,
"reward_std": 0.2657532900571823,
"rewards/MultiModalAccuracyORM": 0.15833333432674407,
"step": 440,
"train_speed(iter/s)": 0.034613
},
{
"clip_ratio": 0.0,
"completion_length": 357.5,
"epoch": 0.1797979797979798,
"grad_norm": 1.8186333166660678,
"kl": 0.0029296875,
"learning_rate": 2e-07,
"loss": -0.008677978813648225,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333358168602,
"reward_std": 0.23004821836948394,
"rewards/MultiModalAccuracyORM": 0.3083333358168602,
"step": 445,
"train_speed(iter/s)": 0.034594
},
{
"clip_ratio": 0.0,
"completion_length": 277.7,
"epoch": 0.18181818181818182,
"grad_norm": 1.5483724144717876,
"kl": 0.003802490234375,
"learning_rate": 2e-07,
"loss": -0.010931169986724854,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667461395264,
"reward_std": 0.36794900298118594,
"rewards/MultiModalAccuracyORM": 0.21666667461395264,
"step": 450,
"train_speed(iter/s)": 0.034617
},
{
"clip_ratio": 0.0,
"completion_length": 442.1,
"epoch": 0.18383838383838383,
"grad_norm": 0.8802169915779423,
"kl": 0.00302734375,
"learning_rate": 2e-07,
"loss": -0.04651644229888916,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15000000596046448,
"reward_std": 0.2963056802749634,
"rewards/MultiModalAccuracyORM": 0.15000000596046448,
"step": 455,
"train_speed(iter/s)": 0.034674
},
{
"clip_ratio": 0.0,
"completion_length": 329.5,
"epoch": 0.18585858585858586,
"grad_norm": 1.6049021687383316,
"kl": 0.00660247802734375,
"learning_rate": 2e-07,
"loss": 0.008616116642951966,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.25741389989852903,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 460,
"train_speed(iter/s)": 0.034754
},
{
"clip_ratio": 0.0,
"completion_length": 316.5,
"epoch": 0.18787878787878787,
"grad_norm": 2.893110887441056,
"kl": 0.002629852294921875,
"learning_rate": 2e-07,
"loss": 0.0028022266924381256,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000001415610315,
"reward_std": 0.4707459330558777,
"rewards/MultiModalAccuracyORM": 0.40000001415610315,
"step": 465,
"train_speed(iter/s)": 0.034821
},
{
"clip_ratio": 0.0,
"completion_length": 279.5,
"epoch": 0.1898989898989899,
"grad_norm": 2.1102869760511584,
"kl": 0.0035003662109375,
"learning_rate": 2e-07,
"loss": 0.0047733023762702945,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000670552254,
"reward_std": 0.3082119345664978,
"rewards/MultiModalAccuracyORM": 0.20000000670552254,
"step": 470,
"train_speed(iter/s)": 0.034862
},
{
"clip_ratio": 0.0,
"completion_length": 312.1,
"epoch": 0.1919191919191919,
"grad_norm": 2.403767582762209,
"kl": 0.00347442626953125,
"learning_rate": 2e-07,
"loss": 0.0637534499168396,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5083333484828472,
"reward_std": 0.34557787179946897,
"rewards/MultiModalAccuracyORM": 0.5083333484828472,
"step": 475,
"train_speed(iter/s)": 0.03495
},
{
"clip_ratio": 0.0,
"completion_length": 348.65,
"epoch": 0.19393939393939394,
"grad_norm": 0.6979791277265925,
"kl": 0.00365142822265625,
"learning_rate": 2e-07,
"loss": -0.04180996119976044,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666667088866236,
"reward_std": 0.32826719582080843,
"rewards/MultiModalAccuracyORM": 0.26666667088866236,
"step": 480,
"train_speed(iter/s)": 0.034951
},
{
"clip_ratio": 0.0,
"completion_length": 269.85,
"epoch": 0.19595959595959597,
"grad_norm": 0.0525932465492366,
"kl": 0.00377197265625,
"learning_rate": 2e-07,
"loss": -0.014869007468223571,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4000000052154064,
"reward_std": 0.20967912971973418,
"rewards/MultiModalAccuracyORM": 0.4000000052154064,
"step": 485,
"train_speed(iter/s)": 0.035021
},
{
"clip_ratio": 0.0,
"completion_length": 294.1,
"epoch": 0.19797979797979798,
"grad_norm": 1.6281647114218305,
"kl": 0.004177093505859375,
"learning_rate": 2e-07,
"loss": 0.015925824642181396,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.3227578908205032,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 490,
"train_speed(iter/s)": 0.035088
},
{
"clip_ratio": 0.0,
"completion_length": 329.75,
"epoch": 0.2,
"grad_norm": 1.984961473458151,
"kl": 0.00326995849609375,
"learning_rate": 2e-07,
"loss": -0.0037449508905410766,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000819563863,
"reward_std": 0.2855509877204895,
"rewards/MultiModalAccuracyORM": 0.27500000819563863,
"step": 495,
"train_speed(iter/s)": 0.035113
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.6734714455829673,
"learning_rate": 2e-07,
"loss": -0.013085539638996124,
"memory(GiB)": 87.45,
"step": 500,
"train_speed(iter/s)": 0.035182
},
{
"epoch": 0.20202020202020202,
"eval_clip_ratio": 0.0,
"eval_completion_length": 363.1450085449219,
"eval_kl": 0.003147125244140625,
"eval_loss": 0.024374496191740036,
"eval_response_clip_ratio": 0.003333333432674408,
"eval_reward": 0.26666667237877845,
"eval_reward_std": 0.28797652542591096,
"eval_rewards/MultiModalAccuracyORM": 0.26666667237877845,
"eval_runtime": 597.4581,
"eval_samples_per_second": 0.084,
"eval_steps_per_second": 0.008,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 345.4,
"epoch": 0.20404040404040405,
"grad_norm": 2.0097245676314053,
"kl": 0.002962684631347656,
"learning_rate": 2e-07,
"loss": 0.008341678977012634,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22916666902601718,
"reward_std": 0.28844616413116453,
"rewards/MultiModalAccuracyORM": 0.22916666902601718,
"step": 505,
"train_speed(iter/s)": 0.033026
},
{
"clip_ratio": 0.0,
"completion_length": 478.15,
"epoch": 0.20606060606060606,
"grad_norm": 0.04671524557136776,
"kl": 0.004395294189453125,
"learning_rate": 2e-07,
"loss": 0.019101715087890624,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833333656191827,
"reward_std": 0.22704698145389557,
"rewards/MultiModalAccuracyORM": 0.20833333656191827,
"step": 510,
"train_speed(iter/s)": 0.033029
},
{
"clip_ratio": 0.0,
"completion_length": 390.65,
"epoch": 0.2080808080808081,
"grad_norm": 1.7656462373703843,
"kl": 0.003029632568359375,
"learning_rate": 2e-07,
"loss": 0.04230659604072571,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000521540642,
"reward_std": 0.248858305811882,
"rewards/MultiModalAccuracyORM": 0.22500000521540642,
"step": 515,
"train_speed(iter/s)": 0.032925
},
{
"clip_ratio": 0.0,
"completion_length": 313.8,
"epoch": 0.2101010101010101,
"grad_norm": 1.2593604182587,
"kl": 0.0040802001953125,
"learning_rate": 2e-07,
"loss": -0.0020169973373413085,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4583333432674408,
"reward_std": 0.4390155434608459,
"rewards/MultiModalAccuracyORM": 0.4583333432674408,
"step": 520,
"train_speed(iter/s)": 0.032885
},
{
"clip_ratio": 0.0,
"completion_length": 410.7,
"epoch": 0.21212121212121213,
"grad_norm": 10.635733115288671,
"kl": 0.006873321533203125,
"learning_rate": 2e-07,
"loss": 0.013639546930789948,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000447034837,
"reward_std": 0.29108133912086487,
"rewards/MultiModalAccuracyORM": 0.25000000447034837,
"step": 525,
"train_speed(iter/s)": 0.032731
},
{
"clip_ratio": 0.0,
"completion_length": 335.65,
"epoch": 0.21414141414141413,
"grad_norm": 2.2605304578434664,
"kl": 0.00481109619140625,
"learning_rate": 2e-07,
"loss": 0.029361778497695924,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667610406877,
"reward_std": 0.3948740750551224,
"rewards/MultiModalAccuracyORM": 0.29166667610406877,
"step": 530,
"train_speed(iter/s)": 0.032671
},
{
"clip_ratio": 0.0,
"completion_length": 300.95,
"epoch": 0.21616161616161617,
"grad_norm": 3.233553935601456,
"kl": 0.005239105224609375,
"learning_rate": 2e-07,
"loss": -0.02358839809894562,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.32500000670552254,
"reward_std": 0.39305841624736787,
"rewards/MultiModalAccuracyORM": 0.32500000670552254,
"step": 535,
"train_speed(iter/s)": 0.032679
},
{
"clip_ratio": 0.0,
"completion_length": 347.15,
"epoch": 0.21818181818181817,
"grad_norm": 1.4435208932830024,
"kl": 0.0038543701171875,
"learning_rate": 2e-07,
"loss": 0.012015002965927123,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.14166666939854622,
"reward_std": 0.2184889554977417,
"rewards/MultiModalAccuracyORM": 0.14166666939854622,
"step": 540,
"train_speed(iter/s)": 0.032663
},
{
"clip_ratio": 0.0,
"completion_length": 280.7,
"epoch": 0.2202020202020202,
"grad_norm": 2.124111886424564,
"kl": 0.00633544921875,
"learning_rate": 2e-07,
"loss": 0.016453295946121216,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.43333334773778914,
"reward_std": 0.40082641541957853,
"rewards/MultiModalAccuracyORM": 0.43333334773778914,
"step": 545,
"train_speed(iter/s)": 0.032702
},
{
"clip_ratio": 0.0,
"completion_length": 405.4,
"epoch": 0.2222222222222222,
"grad_norm": 2.528384017814939,
"kl": 0.004555511474609375,
"learning_rate": 2e-07,
"loss": -0.013006833195686341,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2333333410322666,
"reward_std": 0.3478317141532898,
"rewards/MultiModalAccuracyORM": 0.2333333410322666,
"step": 550,
"train_speed(iter/s)": 0.032503
},
{
"clip_ratio": 0.0,
"completion_length": 363.4,
"epoch": 0.22424242424242424,
"grad_norm": 2.5915001907307977,
"kl": 0.00524749755859375,
"learning_rate": 2e-07,
"loss": 0.02111098766326904,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2916666753590107,
"reward_std": 0.3644451290369034,
"rewards/MultiModalAccuracyORM": 0.2916666753590107,
"step": 555,
"train_speed(iter/s)": 0.032469
},
{
"clip_ratio": 0.0,
"completion_length": 419.65,
"epoch": 0.22626262626262628,
"grad_norm": 1.5712795723400375,
"kl": 0.004864501953125,
"learning_rate": 2e-07,
"loss": 0.06747217178344726,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667386889457,
"reward_std": 0.30639870166778566,
"rewards/MultiModalAccuracyORM": 0.21666667386889457,
"step": 560,
"train_speed(iter/s)": 0.032374
},
{
"clip_ratio": 0.0,
"completion_length": 330.25,
"epoch": 0.22828282828282828,
"grad_norm": 2.1872516406963483,
"kl": 0.0059844970703125,
"learning_rate": 2e-07,
"loss": -0.01907222718000412,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333656191825,
"reward_std": 0.27402731478214265,
"rewards/MultiModalAccuracyORM": 0.18333333656191825,
"step": 565,
"train_speed(iter/s)": 0.03236
},
{
"clip_ratio": 0.0,
"completion_length": 252.8,
"epoch": 0.23030303030303031,
"grad_norm": 1.9388301349526922,
"kl": 0.00828857421875,
"learning_rate": 2e-07,
"loss": 0.0710361123085022,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333333507180214,
"reward_std": 0.26670235097408296,
"rewards/MultiModalAccuracyORM": 0.28333333507180214,
"step": 570,
"train_speed(iter/s)": 0.032388
},
{
"clip_ratio": 0.0,
"completion_length": 475.4,
"epoch": 0.23232323232323232,
"grad_norm": 2.0643763651689424,
"kl": 0.0043544769287109375,
"learning_rate": 2e-07,
"loss": 0.038624811172485354,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667386889455,
"reward_std": 0.38400964736938475,
"rewards/MultiModalAccuracyORM": 0.29166667386889455,
"step": 575,
"train_speed(iter/s)": 0.032305
},
{
"clip_ratio": 0.0,
"completion_length": 366.45,
"epoch": 0.23434343434343435,
"grad_norm": 2.5185952971698566,
"kl": 0.00495452880859375,
"learning_rate": 2e-07,
"loss": 0.02923307418823242,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667461395264,
"reward_std": 0.35012357234954833,
"rewards/MultiModalAccuracyORM": 0.36666667461395264,
"step": 580,
"train_speed(iter/s)": 0.032206
},
{
"clip_ratio": 0.0,
"completion_length": 419.9,
"epoch": 0.23636363636363636,
"grad_norm": 1.8128917450324007,
"kl": 0.0055450439453125,
"learning_rate": 2e-07,
"loss": 0.013245610892772675,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333333879709244,
"reward_std": 0.23710441291332246,
"rewards/MultiModalAccuracyORM": 0.28333333879709244,
"step": 585,
"train_speed(iter/s)": 0.032276
},
{
"clip_ratio": 0.0,
"completion_length": 355.4,
"epoch": 0.2383838383838384,
"grad_norm": 4.329439973170006,
"kl": 0.00757293701171875,
"learning_rate": 2e-07,
"loss": -0.0028860807418823243,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000819563867,
"reward_std": 0.30661733746528624,
"rewards/MultiModalAccuracyORM": 0.25000000819563867,
"step": 590,
"train_speed(iter/s)": 0.032341
},
{
"clip_ratio": 0.0,
"completion_length": 411.8,
"epoch": 0.2404040404040404,
"grad_norm": 1.8156019329792383,
"kl": 0.005291748046875,
"learning_rate": 2e-07,
"loss": -0.004809608310461044,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4166666753590107,
"reward_std": 0.40967183113098143,
"rewards/MultiModalAccuracyORM": 0.4166666753590107,
"step": 595,
"train_speed(iter/s)": 0.032425
},
{
"clip_ratio": 0.0,
"completion_length": 518.65,
"epoch": 0.24242424242424243,
"grad_norm": 1.6812944635615767,
"kl": 0.0045440673828125,
"learning_rate": 2e-07,
"loss": 0.0016623079776763917,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667312383653,
"reward_std": 0.35134140253067014,
"rewards/MultiModalAccuracyORM": 0.21666667312383653,
"step": 600,
"train_speed(iter/s)": 0.032411
},
{
"clip_ratio": 0.0,
"completion_length": 344.4,
"epoch": 0.24444444444444444,
"grad_norm": 2.089820121690527,
"kl": 0.00710906982421875,
"learning_rate": 2e-07,
"loss": -0.028999322652816774,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666738688946,
"reward_std": 0.33552044034004214,
"rewards/MultiModalAccuracyORM": 0.2666666738688946,
"step": 605,
"train_speed(iter/s)": 0.032494
},
{
"clip_ratio": 0.0,
"completion_length": 246.75,
"epoch": 0.24646464646464647,
"grad_norm": 2.728310100204588,
"kl": 0.00543060302734375,
"learning_rate": 2e-07,
"loss": 0.03924176394939423,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1916666716337204,
"reward_std": 0.27078639566898344,
"rewards/MultiModalAccuracyORM": 0.1916666716337204,
"step": 610,
"train_speed(iter/s)": 0.032569
},
{
"clip_ratio": 0.0,
"completion_length": 403.2,
"epoch": 0.24848484848484848,
"grad_norm": 1.3175052417192106,
"kl": 0.00468902587890625,
"learning_rate": 2e-07,
"loss": 0.038245481252670285,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000067055225,
"reward_std": 0.4048719048500061,
"rewards/MultiModalAccuracyORM": 0.2500000067055225,
"step": 615,
"train_speed(iter/s)": 0.032501
},
{
"clip_ratio": 0.0,
"completion_length": 344.7,
"epoch": 0.2505050505050505,
"grad_norm": 1.9529912685373527,
"kl": 0.00550537109375,
"learning_rate": 2e-07,
"loss": 0.011770330369472504,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666753590107,
"reward_std": 0.2895964771509171,
"rewards/MultiModalAccuracyORM": 0.3916666753590107,
"step": 620,
"train_speed(iter/s)": 0.032477
},
{
"clip_ratio": 0.0,
"completion_length": 382.75,
"epoch": 0.25252525252525254,
"grad_norm": 0.05113023046556139,
"kl": 0.00566864013671875,
"learning_rate": 2e-07,
"loss": 0.01361861228942871,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2333333358168602,
"reward_std": 0.275274920463562,
"rewards/MultiModalAccuracyORM": 0.2333333358168602,
"step": 625,
"train_speed(iter/s)": 0.032463
},
{
"clip_ratio": 0.0,
"completion_length": 307.15,
"epoch": 0.2545454545454545,
"grad_norm": 2.556743977258531,
"kl": 0.005108642578125,
"learning_rate": 2e-07,
"loss": 0.014950770139694213,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3333333425223827,
"reward_std": 0.34713688492774963,
"rewards/MultiModalAccuracyORM": 0.3333333425223827,
"step": 630,
"train_speed(iter/s)": 0.032484
},
{
"clip_ratio": 0.0,
"completion_length": 406.15,
"epoch": 0.25656565656565655,
"grad_norm": 2.2423462644187624,
"kl": 0.004283905029296875,
"learning_rate": 2e-07,
"loss": 0.008650130033493042,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1833333395421505,
"reward_std": 0.28752902448177337,
"rewards/MultiModalAccuracyORM": 0.1833333395421505,
"step": 635,
"train_speed(iter/s)": 0.032416
},
{
"clip_ratio": 0.0,
"completion_length": 347.15,
"epoch": 0.2585858585858586,
"grad_norm": 2.7318256637713327,
"kl": 0.0051483154296875,
"learning_rate": 2e-07,
"loss": 0.021026265621185303,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000111758709,
"reward_std": 0.29385479390621183,
"rewards/MultiModalAccuracyORM": 0.2500000111758709,
"step": 640,
"train_speed(iter/s)": 0.032398
},
{
"clip_ratio": 0.0,
"completion_length": 363.9,
"epoch": 0.2606060606060606,
"grad_norm": 0.04170508484645814,
"kl": 0.00531463623046875,
"learning_rate": 2e-07,
"loss": -0.04355872869491577,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000819563867,
"reward_std": 0.3089067697525024,
"rewards/MultiModalAccuracyORM": 0.25000000819563867,
"step": 645,
"train_speed(iter/s)": 0.032375
},
{
"clip_ratio": 0.0,
"completion_length": 407.55,
"epoch": 0.26262626262626265,
"grad_norm": 1.2451580073322923,
"kl": 0.003839111328125,
"learning_rate": 2e-07,
"loss": 0.00021180734038352966,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666667461395266,
"reward_std": 0.2676923930644989,
"rewards/MultiModalAccuracyORM": 0.26666667461395266,
"step": 650,
"train_speed(iter/s)": 0.032327
},
{
"clip_ratio": 0.0,
"completion_length": 432.75,
"epoch": 0.26464646464646463,
"grad_norm": 1.9808716749773743,
"kl": 0.00391082763671875,
"learning_rate": 2e-07,
"loss": 0.026480630040168762,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000670552253,
"reward_std": 0.2817953139543533,
"rewards/MultiModalAccuracyORM": 0.22500000670552253,
"step": 655,
"train_speed(iter/s)": 0.032322
},
{
"clip_ratio": 0.0,
"completion_length": 463.85,
"epoch": 0.26666666666666666,
"grad_norm": 1.1399233339835215,
"kl": 0.004100799560546875,
"learning_rate": 2e-07,
"loss": -0.02441052794456482,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1833333380520344,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.1833333380520344,
"step": 660,
"train_speed(iter/s)": 0.032385
},
{
"clip_ratio": 0.0,
"completion_length": 254.3,
"epoch": 0.2686868686868687,
"grad_norm": 2.4222117834215964,
"kl": 0.0057952880859375,
"learning_rate": 2e-07,
"loss": 0.01856023073196411,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2166666716337204,
"reward_std": 0.3348231792449951,
"rewards/MultiModalAccuracyORM": 0.2166666716337204,
"step": 665,
"train_speed(iter/s)": 0.032448
},
{
"clip_ratio": 0.0,
"completion_length": 360.55,
"epoch": 0.27070707070707073,
"grad_norm": 2.596880019981878,
"kl": 0.0034820556640625,
"learning_rate": 2e-07,
"loss": -0.004870015382766724,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000001192092893,
"reward_std": 0.3786772578954697,
"rewards/MultiModalAccuracyORM": 0.40000001192092893,
"step": 670,
"train_speed(iter/s)": 0.032507
},
{
"clip_ratio": 0.0,
"completion_length": 547.2,
"epoch": 0.2727272727272727,
"grad_norm": 1.261892143617939,
"kl": 0.003546142578125,
"learning_rate": 2e-07,
"loss": 0.018378911912441252,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.25365822613239286,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 675,
"train_speed(iter/s)": 0.032509
},
{
"clip_ratio": 0.0,
"completion_length": 389.15,
"epoch": 0.27474747474747474,
"grad_norm": 1.5125590979703638,
"kl": 0.00487823486328125,
"learning_rate": 2e-07,
"loss": -0.004463189840316772,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000596046446,
"reward_std": 0.2488823115825653,
"rewards/MultiModalAccuracyORM": 0.25000000596046446,
"step": 680,
"train_speed(iter/s)": 0.03256
},
{
"clip_ratio": 0.0,
"completion_length": 461.15,
"epoch": 0.2767676767676768,
"grad_norm": 0.0206379809755319,
"kl": 0.00426177978515625,
"learning_rate": 2e-07,
"loss": 0.021875476837158202,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.11666666865348815,
"reward_std": 0.26496326327323916,
"rewards/MultiModalAccuracyORM": 0.11666666865348815,
"step": 685,
"train_speed(iter/s)": 0.032514
},
{
"clip_ratio": 0.0,
"completion_length": 333.5,
"epoch": 0.2787878787878788,
"grad_norm": 2.5475669372401737,
"kl": 0.00420379638671875,
"learning_rate": 2e-07,
"loss": 0.004043090343475342,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15000000447034836,
"reward_std": 0.30210480093955994,
"rewards/MultiModalAccuracyORM": 0.15000000447034836,
"step": 690,
"train_speed(iter/s)": 0.032521
},
{
"clip_ratio": 0.0,
"completion_length": 416.7,
"epoch": 0.2808080808080808,
"grad_norm": 1.5500150159182102,
"kl": 0.00518798828125,
"learning_rate": 2e-07,
"loss": -0.023865307867527007,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1833333343267441,
"reward_std": 0.1683032989501953,
"rewards/MultiModalAccuracyORM": 0.1833333343267441,
"step": 695,
"train_speed(iter/s)": 0.032416
},
{
"clip_ratio": 0.0,
"completion_length": 372.35,
"epoch": 0.2828282828282828,
"grad_norm": 1.9962407432487237,
"kl": 0.005457305908203125,
"learning_rate": 2e-07,
"loss": -0.028327393531799316,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.2250000037252903,
"reward_std": 0.2099333554506302,
"rewards/MultiModalAccuracyORM": 0.2250000037252903,
"step": 700,
"train_speed(iter/s)": 0.032361
},
{
"clip_ratio": 0.0,
"completion_length": 313.4,
"epoch": 0.28484848484848485,
"grad_norm": 1.6074003724487615,
"kl": 0.00528717041015625,
"learning_rate": 2e-07,
"loss": 0.014926820993423462,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333410322666,
"reward_std": 0.27223809361457824,
"rewards/MultiModalAccuracyORM": 0.3083333410322666,
"step": 705,
"train_speed(iter/s)": 0.032343
},
{
"clip_ratio": 0.0,
"completion_length": 286.5,
"epoch": 0.2868686868686869,
"grad_norm": 1.6995014935336248,
"kl": 0.0051483154296875,
"learning_rate": 2e-07,
"loss": -0.019916635751724244,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.24885829985141755,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 710,
"train_speed(iter/s)": 0.032338
},
{
"clip_ratio": 0.0,
"completion_length": 414.3,
"epoch": 0.28888888888888886,
"grad_norm": 2.5308810289000134,
"kl": 0.00496978759765625,
"learning_rate": 2e-07,
"loss": 0.01712719202041626,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000000447034834,
"reward_std": 0.3906455457210541,
"rewards/MultiModalAccuracyORM": 0.40000000447034834,
"step": 715,
"train_speed(iter/s)": 0.03227
},
{
"clip_ratio": 0.0,
"completion_length": 464.2,
"epoch": 0.2909090909090909,
"grad_norm": 3.1179537828506865,
"kl": 0.00511016845703125,
"learning_rate": 2e-07,
"loss": -0.0032517150044441222,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500001043081283,
"reward_std": 0.3038526177406311,
"rewards/MultiModalAccuracyORM": 0.22500001043081283,
"step": 720,
"train_speed(iter/s)": 0.032189
},
{
"clip_ratio": 0.0,
"completion_length": 339.9,
"epoch": 0.29292929292929293,
"grad_norm": 1.3264200657485663,
"kl": 0.0060546875,
"learning_rate": 2e-07,
"loss": 0.005654716491699218,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2250000059604645,
"reward_std": 0.2988493382930756,
"rewards/MultiModalAccuracyORM": 0.2250000059604645,
"step": 725,
"train_speed(iter/s)": 0.032186
},
{
"clip_ratio": 0.0,
"completion_length": 471.65,
"epoch": 0.29494949494949496,
"grad_norm": 0.5240042260688945,
"kl": 0.005621719360351563,
"learning_rate": 2e-07,
"loss": 0.010572614520788193,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1750000037252903,
"reward_std": 0.2945852130651474,
"rewards/MultiModalAccuracyORM": 0.1750000037252903,
"step": 730,
"train_speed(iter/s)": 0.032129
},
{
"clip_ratio": 0.0,
"completion_length": 445.2,
"epoch": 0.296969696969697,
"grad_norm": 2.049661779713074,
"kl": 0.00519866943359375,
"learning_rate": 2e-07,
"loss": 0.022058649361133574,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333410322666,
"reward_std": 0.38726511001586916,
"rewards/MultiModalAccuracyORM": 0.2833333410322666,
"step": 735,
"train_speed(iter/s)": 0.032089
},
{
"clip_ratio": 0.0,
"completion_length": 391.25,
"epoch": 0.298989898989899,
"grad_norm": 0.962602559613357,
"kl": 0.0046844482421875,
"learning_rate": 2e-07,
"loss": -0.0028517723083496095,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.23328913748264313,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 740,
"train_speed(iter/s)": 0.032072
},
{
"clip_ratio": 0.0,
"completion_length": 364.5,
"epoch": 0.301010101010101,
"grad_norm": 2.0529334461639337,
"kl": 0.00500030517578125,
"learning_rate": 2e-07,
"loss": 0.0314439594745636,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334177732468,
"reward_std": 0.3212204694747925,
"rewards/MultiModalAccuracyORM": 0.23333334177732468,
"step": 745,
"train_speed(iter/s)": 0.032037
},
{
"epoch": 0.30303030303030304,
"grad_norm": 1.3580773911974338,
"learning_rate": 2e-07,
"loss": -0.007335931062698364,
"memory(GiB)": 87.45,
"step": 750,
"train_speed(iter/s)": 0.032014
},
{
"epoch": 0.30303030303030304,
"eval_clip_ratio": 0.0,
"eval_completion_length": 352.49667709350587,
"eval_kl": 0.00640625,
"eval_loss": 0.002320815809071064,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.2716666729748249,
"eval_reward_std": 0.33371097803115846,
"eval_rewards/MultiModalAccuracyORM": 0.2716666729748249,
"eval_runtime": 876.1057,
"eval_samples_per_second": 0.057,
"eval_steps_per_second": 0.006,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 392.55,
"epoch": 0.30505050505050507,
"grad_norm": 2.1426610619194815,
"kl": 0.00631256103515625,
"learning_rate": 2e-07,
"loss": -0.040098315477371214,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333656191826,
"reward_std": 0.22312387079000473,
"rewards/MultiModalAccuracyORM": 0.13333333656191826,
"step": 755,
"train_speed(iter/s)": 0.029206
},
{
"clip_ratio": 0.0,
"completion_length": 468.4,
"epoch": 0.30707070707070705,
"grad_norm": 0.8717248302301553,
"kl": 0.00636749267578125,
"learning_rate": 2e-07,
"loss": 0.015009742975234986,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333358168602,
"reward_std": 0.2940850019454956,
"rewards/MultiModalAccuracyORM": 0.2833333358168602,
"step": 760,
"train_speed(iter/s)": 0.029162
},
{
"clip_ratio": 0.0,
"completion_length": 307.8,
"epoch": 0.3090909090909091,
"grad_norm": 2.4403464428155925,
"kl": 0.0062957763671875,
"learning_rate": 2e-07,
"loss": 0.019652032852172853,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.45000001043081284,
"reward_std": 0.3222792655229568,
"rewards/MultiModalAccuracyORM": 0.45000001043081284,
"step": 765,
"train_speed(iter/s)": 0.029211
},
{
"clip_ratio": 0.0,
"completion_length": 448.35,
"epoch": 0.3111111111111111,
"grad_norm": 1.6980769345505524,
"kl": 0.0074066162109375,
"learning_rate": 2e-07,
"loss": 0.018609333038330077,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667833924295,
"reward_std": 0.4026396483182907,
"rewards/MultiModalAccuracyORM": 0.31666667833924295,
"step": 770,
"train_speed(iter/s)": 0.029164
},
{
"clip_ratio": 0.0,
"completion_length": 406.35,
"epoch": 0.31313131313131315,
"grad_norm": 1.4345330108808567,
"kl": 0.00540924072265625,
"learning_rate": 2e-07,
"loss": 0.034766983985900876,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.26666667088866236,
"reward_std": 0.3167103588581085,
"rewards/MultiModalAccuracyORM": 0.26666667088866236,
"step": 775,
"train_speed(iter/s)": 0.029127
},
{
"clip_ratio": 0.0,
"completion_length": 441.5,
"epoch": 0.3151515151515151,
"grad_norm": 1.0920815430357467,
"kl": 0.0054931640625,
"learning_rate": 2e-07,
"loss": -7.512569427490235e-05,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.10833333656191826,
"reward_std": 0.22400068640708923,
"rewards/MultiModalAccuracyORM": 0.10833333656191826,
"step": 780,
"train_speed(iter/s)": 0.029106
},
{
"clip_ratio": 0.0,
"completion_length": 435.7,
"epoch": 0.31717171717171716,
"grad_norm": 1.3732918705207908,
"kl": 0.00477752685546875,
"learning_rate": 2e-07,
"loss": 0.015651023387908934,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2416666679084301,
"reward_std": 0.23479096889495848,
"rewards/MultiModalAccuracyORM": 0.2416666679084301,
"step": 785,
"train_speed(iter/s)": 0.029052
},
{
"clip_ratio": 0.0,
"completion_length": 453.35,
"epoch": 0.3191919191919192,
"grad_norm": 2.1057593122144005,
"kl": 0.00804595947265625,
"learning_rate": 2e-07,
"loss": -0.0006304442882537842,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.3619014710187912,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 790,
"train_speed(iter/s)": 0.029057
},
{
"clip_ratio": 0.0,
"completion_length": 408.2,
"epoch": 0.3212121212121212,
"grad_norm": 2.3354800713445654,
"kl": 0.0078216552734375,
"learning_rate": 2e-07,
"loss": 0.0310418963432312,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667759418486,
"reward_std": 0.40556674003601073,
"rewards/MultiModalAccuracyORM": 0.36666667759418486,
"step": 795,
"train_speed(iter/s)": 0.029005
},
{
"clip_ratio": 0.0,
"completion_length": 409.45,
"epoch": 0.32323232323232326,
"grad_norm": 2.4825567652901444,
"kl": 0.0077880859375,
"learning_rate": 2e-07,
"loss": 0.021943604946136473,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3666666761040688,
"reward_std": 0.3470627337694168,
"rewards/MultiModalAccuracyORM": 0.3666666761040688,
"step": 800,
"train_speed(iter/s)": 0.028978
},
{
"clip_ratio": 0.0,
"completion_length": 324.9,
"epoch": 0.32525252525252524,
"grad_norm": 3.589672291824819,
"kl": 0.00778350830078125,
"learning_rate": 2e-07,
"loss": 0.008873769640922546,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.44790194034576414,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 805,
"train_speed(iter/s)": 0.028982
},
{
"clip_ratio": 0.0,
"completion_length": 287.2,
"epoch": 0.32727272727272727,
"grad_norm": 2.1262920297539925,
"kl": 0.0073883056640625,
"learning_rate": 2e-07,
"loss": -0.04254024624824524,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333447575569,
"reward_std": 0.3589002341032028,
"rewards/MultiModalAccuracyORM": 0.2833333447575569,
"step": 810,
"train_speed(iter/s)": 0.029003
},
{
"clip_ratio": 0.0,
"completion_length": 323.9,
"epoch": 0.3292929292929293,
"grad_norm": 2.6338345195445965,
"kl": 0.0073974609375,
"learning_rate": 2e-07,
"loss": 0.008789122104644775,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35833333879709245,
"reward_std": 0.3563657283782959,
"rewards/MultiModalAccuracyORM": 0.35833333879709245,
"step": 815,
"train_speed(iter/s)": 0.028993
},
{
"clip_ratio": 0.0,
"completion_length": 287.85,
"epoch": 0.33131313131313134,
"grad_norm": 2.540831778543349,
"kl": 0.0085357666015625,
"learning_rate": 2e-07,
"loss": 0.0017573148012161254,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.33755565285682676,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 820,
"train_speed(iter/s)": 0.02904
},
{
"clip_ratio": 0.0,
"completion_length": 363.3,
"epoch": 0.3333333333333333,
"grad_norm": 2.280326105508933,
"kl": 0.011834716796875,
"learning_rate": 2e-07,
"loss": -0.016002975404262543,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000596046447,
"reward_std": 0.24662604331970214,
"rewards/MultiModalAccuracyORM": 0.17500000596046447,
"step": 825,
"train_speed(iter/s)": 0.02907
},
{
"clip_ratio": 0.0,
"completion_length": 334.05,
"epoch": 0.33535353535353535,
"grad_norm": 1.64256260222623,
"kl": 0.00889892578125,
"learning_rate": 2e-07,
"loss": -0.008859094977378846,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000149011612,
"reward_std": 0.3164917230606079,
"rewards/MultiModalAccuracyORM": 0.20000000149011612,
"step": 830,
"train_speed(iter/s)": 0.029109
},
{
"clip_ratio": 0.0,
"completion_length": 347.15,
"epoch": 0.3373737373737374,
"grad_norm": 0.09646041600368084,
"kl": 0.0067840576171875,
"learning_rate": 2e-07,
"loss": 0.02341327965259552,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2750000096857548,
"reward_std": 0.3184880018234253,
"rewards/MultiModalAccuracyORM": 0.2750000096857548,
"step": 835,
"train_speed(iter/s)": 0.028908
},
{
"clip_ratio": 0.0,
"completion_length": 434.35,
"epoch": 0.3393939393939394,
"grad_norm": 0.886588445568382,
"kl": 0.0066741943359375,
"learning_rate": 2e-07,
"loss": 0.011455638706684113,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667312383653,
"reward_std": 0.3142238825559616,
"rewards/MultiModalAccuracyORM": 0.34166667312383653,
"step": 840,
"train_speed(iter/s)": 0.02878
},
{
"clip_ratio": 0.0,
"completion_length": 448.75,
"epoch": 0.3414141414141414,
"grad_norm": 0.0732846157739433,
"kl": 0.00753173828125,
"learning_rate": 2e-07,
"loss": 0.010994693636894226,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833333507180213,
"reward_std": 0.19786564111709595,
"rewards/MultiModalAccuracyORM": 0.20833333507180213,
"step": 845,
"train_speed(iter/s)": 0.028759
},
{
"clip_ratio": 0.0,
"completion_length": 325.0,
"epoch": 0.3434343434343434,
"grad_norm": 2.016101823545884,
"kl": 0.00940399169921875,
"learning_rate": 2e-07,
"loss": 0.0015551522374153137,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667386889457,
"reward_std": 0.37221312820911406,
"rewards/MultiModalAccuracyORM": 0.21666667386889457,
"step": 850,
"train_speed(iter/s)": 0.028759
},
{
"clip_ratio": 0.0,
"completion_length": 367.75,
"epoch": 0.34545454545454546,
"grad_norm": 1.4804689213107514,
"kl": 0.0074249267578125,
"learning_rate": 2e-07,
"loss": 0.008444362878799438,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.37500000521540644,
"reward_std": 0.33937130570411683,
"rewards/MultiModalAccuracyORM": 0.37500000521540644,
"step": 855,
"train_speed(iter/s)": 0.02876
},
{
"clip_ratio": 0.0,
"completion_length": 317.1,
"epoch": 0.3474747474747475,
"grad_norm": 2.368905519842238,
"kl": 0.008038330078125,
"learning_rate": 2e-07,
"loss": 0.026756054162979125,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000111758709,
"reward_std": 0.44455128610134126,
"rewards/MultiModalAccuracyORM": 0.3500000111758709,
"step": 860,
"train_speed(iter/s)": 0.028787
},
{
"clip_ratio": 0.0,
"completion_length": 276.85,
"epoch": 0.34949494949494947,
"grad_norm": 2.3043935598394203,
"kl": 0.0070343017578125,
"learning_rate": 2e-07,
"loss": 0.059600555896759035,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333432674408,
"reward_std": 0.3885723173618317,
"rewards/MultiModalAccuracyORM": 0.2833333432674408,
"step": 865,
"train_speed(iter/s)": 0.028814
},
{
"clip_ratio": 0.0,
"completion_length": 417.95,
"epoch": 0.3515151515151515,
"grad_norm": 1.9471040249213727,
"kl": 0.0069305419921875,
"learning_rate": 2e-07,
"loss": 0.028457581996917725,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000596046447,
"reward_std": 0.2940494120121002,
"rewards/MultiModalAccuracyORM": 0.17500000596046447,
"step": 870,
"train_speed(iter/s)": 0.028708
},
{
"clip_ratio": 0.0,
"completion_length": 363.5,
"epoch": 0.35353535353535354,
"grad_norm": 2.196604109706096,
"kl": 0.0058319091796875,
"learning_rate": 2e-07,
"loss": 0.04532061517238617,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666775941849,
"reward_std": 0.42524099349975586,
"rewards/MultiModalAccuracyORM": 0.3916666775941849,
"step": 875,
"train_speed(iter/s)": 0.028627
},
{
"clip_ratio": 0.0,
"completion_length": 349.05,
"epoch": 0.35555555555555557,
"grad_norm": 1.9101064459839039,
"kl": 0.0102691650390625,
"learning_rate": 2e-07,
"loss": 0.04224415421485901,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.3391170799732208,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 880,
"train_speed(iter/s)": 0.028551
},
{
"clip_ratio": 0.0,
"completion_length": 317.75,
"epoch": 0.3575757575757576,
"grad_norm": 1.7650856984522036,
"kl": 0.0097930908203125,
"learning_rate": 2e-07,
"loss": 0.031351178884506226,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3333333425223827,
"reward_std": 0.27555315792560575,
"rewards/MultiModalAccuracyORM": 0.3333333425223827,
"step": 885,
"train_speed(iter/s)": 0.028585
},
{
"clip_ratio": 0.0,
"completion_length": 287.05,
"epoch": 0.3595959595959596,
"grad_norm": 2.4394117877960615,
"kl": 0.0123748779296875,
"learning_rate": 2e-07,
"loss": 0.01872892677783966,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000521540642,
"reward_std": 0.30489687621593475,
"rewards/MultiModalAccuracyORM": 0.22500000521540642,
"step": 890,
"train_speed(iter/s)": 0.028637
},
{
"clip_ratio": 0.0,
"completion_length": 476.05,
"epoch": 0.3616161616161616,
"grad_norm": 2.3682785721081854,
"kl": 0.00737762451171875,
"learning_rate": 2e-07,
"loss": 0.02124558687210083,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000089406967,
"reward_std": 0.41817026138305663,
"rewards/MultiModalAccuracyORM": 0.3500000089406967,
"step": 895,
"train_speed(iter/s)": 0.028688
},
{
"clip_ratio": 0.0,
"completion_length": 429.85,
"epoch": 0.36363636363636365,
"grad_norm": 1.3234500775547358,
"kl": 0.007550048828125,
"learning_rate": 2e-07,
"loss": 0.025475236773490905,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333656191825,
"reward_std": 0.2260383188724518,
"rewards/MultiModalAccuracyORM": 0.18333333656191825,
"step": 900,
"train_speed(iter/s)": 0.028712
},
{
"clip_ratio": 0.0,
"completion_length": 439.75,
"epoch": 0.3656565656565657,
"grad_norm": 3.0802331121314785,
"kl": 0.0105621337890625,
"learning_rate": 2e-07,
"loss": 0.06260026693344116,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166666865348815,
"reward_std": 0.3003867596387863,
"rewards/MultiModalAccuracyORM": 0.24166666865348815,
"step": 905,
"train_speed(iter/s)": 0.028645
},
{
"clip_ratio": 0.0,
"completion_length": 287.9,
"epoch": 0.36767676767676766,
"grad_norm": 3.596137864021678,
"kl": 0.01011199951171875,
"learning_rate": 2e-07,
"loss": 0.007353886961936951,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4833333432674408,
"reward_std": 0.38523324131965636,
"rewards/MultiModalAccuracyORM": 0.4833333432674408,
"step": 910,
"train_speed(iter/s)": 0.028662
},
{
"clip_ratio": 0.0,
"completion_length": 296.65,
"epoch": 0.3696969696969697,
"grad_norm": 1.4417889638729746,
"kl": 0.01177978515625,
"learning_rate": 2e-07,
"loss": -0.006625932455062866,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3416666783392429,
"reward_std": 0.37195890843868257,
"rewards/MultiModalAccuracyORM": 0.3416666783392429,
"step": 915,
"train_speed(iter/s)": 0.028677
},
{
"clip_ratio": 0.0,
"completion_length": 431.05,
"epoch": 0.3717171717171717,
"grad_norm": 2.8875811253312333,
"kl": 0.01148529052734375,
"learning_rate": 2e-07,
"loss": -3.943443298339844e-05,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667237877847,
"reward_std": 0.3604352355003357,
"rewards/MultiModalAccuracyORM": 0.41666667237877847,
"step": 920,
"train_speed(iter/s)": 0.028643
},
{
"clip_ratio": 0.0,
"completion_length": 330.3,
"epoch": 0.37373737373737376,
"grad_norm": 1.8636332228250176,
"kl": 0.0091461181640625,
"learning_rate": 2e-07,
"loss": 0.004881632328033447,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333402872086,
"reward_std": 0.3440760403871536,
"rewards/MultiModalAccuracyORM": 0.2833333402872086,
"step": 925,
"train_speed(iter/s)": 0.028644
},
{
"clip_ratio": 0.0,
"completion_length": 357.4,
"epoch": 0.37575757575757573,
"grad_norm": 2.1407505535783242,
"kl": 0.00869598388671875,
"learning_rate": 2e-07,
"loss": 0.05731675624847412,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833334103226663,
"reward_std": 0.41186849772930145,
"rewards/MultiModalAccuracyORM": 0.25833334103226663,
"step": 930,
"train_speed(iter/s)": 0.028644
},
{
"clip_ratio": 0.0,
"completion_length": 322.9,
"epoch": 0.37777777777777777,
"grad_norm": 3.79021329286614,
"kl": 0.009942626953125,
"learning_rate": 2e-07,
"loss": 0.0477484941482544,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666716337204,
"reward_std": 0.2674737572669983,
"rewards/MultiModalAccuracyORM": 0.3166666716337204,
"step": 935,
"train_speed(iter/s)": 0.028648
},
{
"clip_ratio": 0.0,
"completion_length": 283.45,
"epoch": 0.3797979797979798,
"grad_norm": 2.2451102482111724,
"kl": 0.012542724609375,
"learning_rate": 2e-07,
"loss": -1.335442066192627e-05,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000052154064,
"reward_std": 0.25591449439525604,
"rewards/MultiModalAccuracyORM": 0.3500000052154064,
"step": 940,
"train_speed(iter/s)": 0.028624
},
{
"clip_ratio": 0.0,
"completion_length": 346.15,
"epoch": 0.38181818181818183,
"grad_norm": 1.4018775780145751,
"kl": 0.0094390869140625,
"learning_rate": 2e-07,
"loss": -0.003527042269706726,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.31088480055332185,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 945,
"train_speed(iter/s)": 0.028624
},
{
"clip_ratio": 0.0,
"completion_length": 310.7,
"epoch": 0.3838383838383838,
"grad_norm": 3.8112599620979117,
"kl": 0.01011962890625,
"learning_rate": 2e-07,
"loss": 0.01941452920436859,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667610406876,
"reward_std": 0.34228681921958926,
"rewards/MultiModalAccuracyORM": 0.34166667610406876,
"step": 950,
"train_speed(iter/s)": 0.028616
},
{
"clip_ratio": 0.0,
"completion_length": 260.35,
"epoch": 0.38585858585858585,
"grad_norm": 1.8716114512263384,
"kl": 0.0135040283203125,
"learning_rate": 2e-07,
"loss": 0.01583598256111145,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666701436043,
"reward_std": 0.26291108727455137,
"rewards/MultiModalAccuracyORM": 0.3916666701436043,
"step": 955,
"train_speed(iter/s)": 0.028656
},
{
"clip_ratio": 0.0,
"completion_length": 310.0,
"epoch": 0.3878787878787879,
"grad_norm": 2.6882447296010508,
"kl": 0.0098663330078125,
"learning_rate": 2e-07,
"loss": 0.008884111046791076,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333410322666,
"reward_std": 0.3393357157707214,
"rewards/MultiModalAccuracyORM": 0.2833333410322666,
"step": 960,
"train_speed(iter/s)": 0.02864
},
{
"clip_ratio": 0.0,
"completion_length": 288.3,
"epoch": 0.3898989898989899,
"grad_norm": 2.477942143166408,
"kl": 0.013421630859375,
"learning_rate": 2e-07,
"loss": 0.013846510648727417,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667088866236,
"reward_std": 0.3041278898715973,
"rewards/MultiModalAccuracyORM": 0.39166667088866236,
"step": 965,
"train_speed(iter/s)": 0.028659
},
{
"clip_ratio": 0.0,
"completion_length": 288.35,
"epoch": 0.39191919191919194,
"grad_norm": 1.7487986972843892,
"kl": 0.008868408203125,
"learning_rate": 2e-07,
"loss": 0.041995507478713986,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000037252903,
"reward_std": 0.3277524411678314,
"rewards/MultiModalAccuracyORM": 0.3250000037252903,
"step": 970,
"train_speed(iter/s)": 0.028675
},
{
"clip_ratio": 0.0,
"completion_length": 326.9,
"epoch": 0.3939393939393939,
"grad_norm": 1.040945452450775,
"kl": 0.00943603515625,
"learning_rate": 2e-07,
"loss": 0.004313239455223083,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333656191825,
"reward_std": 0.2722140818834305,
"rewards/MultiModalAccuracyORM": 0.18333333656191825,
"step": 975,
"train_speed(iter/s)": 0.028693
},
{
"clip_ratio": 0.0,
"completion_length": 298.5,
"epoch": 0.39595959595959596,
"grad_norm": 1.987178230745996,
"kl": 0.0092681884765625,
"learning_rate": 2e-07,
"loss": 0.01756379157304764,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1833333358168602,
"reward_std": 0.3274982154369354,
"rewards/MultiModalAccuracyORM": 0.1833333358168602,
"step": 980,
"train_speed(iter/s)": 0.028714
},
{
"clip_ratio": 0.0,
"completion_length": 297.4,
"epoch": 0.397979797979798,
"grad_norm": 1.9999919818314047,
"kl": 0.012908935546875,
"learning_rate": 2e-07,
"loss": 0.04084535539150238,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.32500001341104506,
"reward_std": 0.2752989321947098,
"rewards/MultiModalAccuracyORM": 0.32500001341104506,
"step": 985,
"train_speed(iter/s)": 0.028744
},
{
"clip_ratio": 0.0,
"completion_length": 506.15,
"epoch": 0.4,
"grad_norm": 0.038170370656060805,
"kl": 0.010888671875,
"learning_rate": 2e-07,
"loss": 0.07128549218177796,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000819563867,
"reward_std": 0.30416645109653473,
"rewards/MultiModalAccuracyORM": 0.25000000819563867,
"step": 990,
"train_speed(iter/s)": 0.028708
},
{
"clip_ratio": 0.0,
"completion_length": 433.45,
"epoch": 0.402020202020202,
"grad_norm": 2.632502419980814,
"kl": 0.0100616455078125,
"learning_rate": 2e-07,
"loss": 0.016613197326660157,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667610406875,
"reward_std": 0.37174026668071747,
"rewards/MultiModalAccuracyORM": 0.39166667610406875,
"step": 995,
"train_speed(iter/s)": 0.028687
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.07099216395354724,
"learning_rate": 2e-07,
"loss": 0.02232474982738495,
"memory(GiB)": 87.45,
"step": 1000,
"train_speed(iter/s)": 0.028672
},
{
"epoch": 0.40404040404040403,
"eval_clip_ratio": 0.0,
"eval_completion_length": 346.9533413696289,
"eval_kl": 0.013145751953125,
"eval_loss": -0.00028896695584990084,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.281666671782732,
"eval_reward_std": 0.3010890519618988,
"eval_rewards/MultiModalAccuracyORM": 0.281666671782732,
"eval_runtime": 1406.863,
"eval_samples_per_second": 0.036,
"eval_steps_per_second": 0.004,
"step": 1000
},
{
"clip_ratio": 0.0,
"completion_length": 415.175,
"epoch": 0.40606060606060607,
"grad_norm": 1.905945440484278,
"kl": 0.009429931640625,
"learning_rate": 2e-07,
"loss": -0.0033631980419158935,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2708333387970924,
"reward_std": 0.24963780641555786,
"rewards/MultiModalAccuracyORM": 0.2708333387970924,
"step": 1005,
"train_speed(iter/s)": 0.027262
},
{
"clip_ratio": 0.0,
"completion_length": 341.5,
"epoch": 0.4080808080808081,
"grad_norm": 1.6755020591769207,
"kl": 0.0134246826171875,
"learning_rate": 2e-07,
"loss": 0.05349223613739014,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000037252903,
"reward_std": 0.3494287371635437,
"rewards/MultiModalAccuracyORM": 0.2500000037252903,
"step": 1010,
"train_speed(iter/s)": 0.027279
},
{
"clip_ratio": 0.0,
"completion_length": 351.25,
"epoch": 0.4101010101010101,
"grad_norm": 2.8913380726136872,
"kl": 0.0107147216796875,
"learning_rate": 2e-07,
"loss": -0.02667723298072815,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.38333334103226663,
"reward_std": 0.4211569488048553,
"rewards/MultiModalAccuracyORM": 0.38333334103226663,
"step": 1015,
"train_speed(iter/s)": 0.027304
},
{
"clip_ratio": 0.0,
"completion_length": 339.65,
"epoch": 0.4121212121212121,
"grad_norm": 4.180952848080379,
"kl": 0.0100433349609375,
"learning_rate": 2e-07,
"loss": 0.00991852581501007,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333447575569,
"reward_std": 0.3088736057281494,
"rewards/MultiModalAccuracyORM": 0.2083333447575569,
"step": 1020,
"train_speed(iter/s)": 0.027315
},
{
"clip_ratio": 0.0,
"completion_length": 367.55,
"epoch": 0.41414141414141414,
"grad_norm": 1.9667254904423306,
"kl": 0.0121246337890625,
"learning_rate": 2e-07,
"loss": 0.01899299621582031,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333507180214,
"reward_std": 0.3071291267871857,
"rewards/MultiModalAccuracyORM": 0.15833333507180214,
"step": 1025,
"train_speed(iter/s)": 0.027329
},
{
"clip_ratio": 0.0,
"completion_length": 435.4,
"epoch": 0.4161616161616162,
"grad_norm": 1.7062594547415575,
"kl": 0.0100616455078125,
"learning_rate": 2e-07,
"loss": 0.004674983024597168,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500001043081283,
"reward_std": 0.3679134130477905,
"rewards/MultiModalAccuracyORM": 0.22500001043081283,
"step": 1030,
"train_speed(iter/s)": 0.027298
},
{
"clip_ratio": 0.0,
"completion_length": 350.0,
"epoch": 0.41818181818181815,
"grad_norm": 72.23734764401382,
"kl": 0.011712646484375,
"learning_rate": 2e-07,
"loss": 0.05118045210838318,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666775941849,
"reward_std": 0.34735551476478577,
"rewards/MultiModalAccuracyORM": 0.2666666775941849,
"step": 1035,
"train_speed(iter/s)": 0.027303
},
{
"clip_ratio": 0.0,
"completion_length": 311.15,
"epoch": 0.4202020202020202,
"grad_norm": 1.6715902563969363,
"kl": 0.0135772705078125,
"learning_rate": 2e-07,
"loss": 0.045872822403907776,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2416666716337204,
"reward_std": 0.287842845916748,
"rewards/MultiModalAccuracyORM": 0.2416666716337204,
"step": 1040,
"train_speed(iter/s)": 0.027298
},
{
"clip_ratio": 0.0,
"completion_length": 353.15,
"epoch": 0.4222222222222222,
"grad_norm": 2.734745023688755,
"kl": 0.012158203125,
"learning_rate": 2e-07,
"loss": 0.05562522411346436,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.4314686059951782,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 1045,
"train_speed(iter/s)": 0.027328
},
{
"clip_ratio": 0.0,
"completion_length": 359.3,
"epoch": 0.42424242424242425,
"grad_norm": 0.07598134741536419,
"kl": 0.009765625,
"learning_rate": 2e-07,
"loss": 0.008748695254325867,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.18326250910758973,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 1050,
"train_speed(iter/s)": 0.027308
},
{
"clip_ratio": 0.0,
"completion_length": 392.2,
"epoch": 0.4262626262626263,
"grad_norm": 9.627726509942965,
"kl": 0.0136199951171875,
"learning_rate": 2e-07,
"loss": 0.03634963035583496,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666768491268,
"reward_std": 0.36670139729976653,
"rewards/MultiModalAccuracyORM": 0.3166666768491268,
"step": 1055,
"train_speed(iter/s)": 0.027311
},
{
"clip_ratio": 0.0,
"completion_length": 289.7,
"epoch": 0.42828282828282827,
"grad_norm": 1.2371668114044378,
"kl": 0.0134979248046875,
"learning_rate": 2e-07,
"loss": 0.04366698265075684,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333373069763,
"reward_std": 0.3498693466186523,
"rewards/MultiModalAccuracyORM": 0.2083333373069763,
"step": 1060,
"train_speed(iter/s)": 0.027334
},
{
"clip_ratio": 0.0,
"completion_length": 321.05,
"epoch": 0.4303030303030303,
"grad_norm": 2.52858518092475,
"kl": 0.0135711669921875,
"learning_rate": 2e-07,
"loss": 0.065219247341156,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000111758709,
"reward_std": 0.37853889763355253,
"rewards/MultiModalAccuracyORM": 0.3000000111758709,
"step": 1065,
"train_speed(iter/s)": 0.027352
},
{
"clip_ratio": 0.0,
"completion_length": 287.5,
"epoch": 0.43232323232323233,
"grad_norm": 2.3424705728855995,
"kl": 0.0116546630859375,
"learning_rate": 2e-07,
"loss": 0.03819225430488586,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.3227578908205032,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 1070,
"train_speed(iter/s)": 0.027305
},
{
"clip_ratio": 0.0,
"completion_length": 345.55,
"epoch": 0.43434343434343436,
"grad_norm": 2.798437729299758,
"kl": 0.014569091796875,
"learning_rate": 2e-07,
"loss": 0.004848736524581909,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000819563863,
"reward_std": 0.31416428089141846,
"rewards/MultiModalAccuracyORM": 0.27500000819563863,
"step": 1075,
"train_speed(iter/s)": 0.027334
},
{
"clip_ratio": 0.0,
"completion_length": 300.8,
"epoch": 0.43636363636363634,
"grad_norm": 1.7741031757506147,
"kl": 0.0157135009765625,
"learning_rate": 2e-07,
"loss": 0.00888105109333992,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.312698033452034,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 1080,
"train_speed(iter/s)": 0.027339
},
{
"clip_ratio": 0.0,
"completion_length": 308.6,
"epoch": 0.4383838383838384,
"grad_norm": 2.06880703867489,
"kl": 0.0158050537109375,
"learning_rate": 2e-07,
"loss": -0.05194641947746277,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.22603832483291625,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 1085,
"train_speed(iter/s)": 0.027329
},
{
"clip_ratio": 0.0,
"completion_length": 223.4,
"epoch": 0.4404040404040404,
"grad_norm": 2.4630209071132656,
"kl": 0.015411376953125,
"learning_rate": 2e-07,
"loss": -0.018011474609375,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20833333507180213,
"reward_std": 0.3071291267871857,
"rewards/MultiModalAccuracyORM": 0.20833333507180213,
"step": 1090,
"train_speed(iter/s)": 0.027372
},
{
"clip_ratio": 0.0,
"completion_length": 264.2,
"epoch": 0.44242424242424244,
"grad_norm": 2.265643619288025,
"kl": 0.01461181640625,
"learning_rate": 2e-07,
"loss": 0.04221695959568024,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666701436043,
"reward_std": 0.3329358011484146,
"rewards/MultiModalAccuracyORM": 0.2666666701436043,
"step": 1095,
"train_speed(iter/s)": 0.027407
},
{
"clip_ratio": 0.0,
"completion_length": 357.95,
"epoch": 0.4444444444444444,
"grad_norm": 2.894324596003934,
"kl": 0.009808349609375,
"learning_rate": 2e-07,
"loss": 0.02248055934906006,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4166666828095913,
"reward_std": 0.44607712924480436,
"rewards/MultiModalAccuracyORM": 0.4166666828095913,
"step": 1100,
"train_speed(iter/s)": 0.027442
},
{
"clip_ratio": 0.0,
"completion_length": 247.95,
"epoch": 0.44646464646464645,
"grad_norm": 0.9507289625656876,
"kl": 0.0140777587890625,
"learning_rate": 2e-07,
"loss": -0.0001364484429359436,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.46666667237877846,
"reward_std": 0.24261614382267,
"rewards/MultiModalAccuracyORM": 0.46666667237877846,
"step": 1105,
"train_speed(iter/s)": 0.027471
},
{
"clip_ratio": 0.0,
"completion_length": 238.75,
"epoch": 0.4484848484848485,
"grad_norm": 4.493560880958603,
"kl": 0.01422119140625,
"learning_rate": 2e-07,
"loss": 0.00024300813674926758,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4500000149011612,
"reward_std": 0.345323646068573,
"rewards/MultiModalAccuracyORM": 0.4500000149011612,
"step": 1110,
"train_speed(iter/s)": 0.027312
},
{
"clip_ratio": 0.0,
"completion_length": 368.15,
"epoch": 0.4505050505050505,
"grad_norm": 1.866809698039603,
"kl": 0.0131317138671875,
"learning_rate": 2e-07,
"loss": -0.007444334030151367,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000298023224,
"reward_std": 0.3281930506229401,
"rewards/MultiModalAccuracyORM": 0.20000000298023224,
"step": 1115,
"train_speed(iter/s)": 0.027296
},
{
"clip_ratio": 0.0,
"completion_length": 386.45,
"epoch": 0.45252525252525255,
"grad_norm": 0.04083454065583723,
"kl": 0.0086578369140625,
"learning_rate": 2e-07,
"loss": 0.009036242961883545,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000447034835,
"reward_std": 0.24710224866867064,
"rewards/MultiModalAccuracyORM": 0.22500000447034835,
"step": 1120,
"train_speed(iter/s)": 0.027256
},
{
"clip_ratio": 0.0,
"completion_length": 299.75,
"epoch": 0.45454545454545453,
"grad_norm": 2.1257862237671588,
"kl": 0.01603851318359375,
"learning_rate": 2e-07,
"loss": -0.014222325384616851,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4333333432674408,
"reward_std": 0.4078585982322693,
"rewards/MultiModalAccuracyORM": 0.4333333432674408,
"step": 1125,
"train_speed(iter/s)": 0.027299
},
{
"clip_ratio": 0.0,
"completion_length": 338.15,
"epoch": 0.45656565656565656,
"grad_norm": 48.10712707725128,
"kl": 0.0124542236328125,
"learning_rate": 2e-07,
"loss": 0.009453803300857544,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30833333879709246,
"reward_std": 0.32858102321624755,
"rewards/MultiModalAccuracyORM": 0.30833333879709246,
"step": 1130,
"train_speed(iter/s)": 0.027339
},
{
"clip_ratio": 0.0,
"completion_length": 434.6,
"epoch": 0.4585858585858586,
"grad_norm": 0.8869001794016839,
"kl": 0.01041259765625,
"learning_rate": 2e-07,
"loss": -0.002349555492401123,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333730697633,
"reward_std": 0.29003951847553255,
"rewards/MultiModalAccuracyORM": 0.13333333730697633,
"step": 1135,
"train_speed(iter/s)": 0.027364
},
{
"clip_ratio": 0.0,
"completion_length": 287.0,
"epoch": 0.46060606060606063,
"grad_norm": 2.2315283680448346,
"kl": 0.0132476806640625,
"learning_rate": 2e-07,
"loss": -0.010060985386371613,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000745058059,
"reward_std": 0.3043610692024231,
"rewards/MultiModalAccuracyORM": 0.17500000745058059,
"step": 1140,
"train_speed(iter/s)": 0.027393
},
{
"clip_ratio": 0.0,
"completion_length": 449.2,
"epoch": 0.4626262626262626,
"grad_norm": 0.04850876090724914,
"kl": 0.0081451416015625,
"learning_rate": 2e-07,
"loss": -0.022587394714355467,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.10833333656191826,
"reward_std": 0.20343697369098662,
"rewards/MultiModalAccuracyORM": 0.10833333656191826,
"step": 1145,
"train_speed(iter/s)": 0.027421
},
{
"clip_ratio": 0.0,
"completion_length": 376.05,
"epoch": 0.46464646464646464,
"grad_norm": 2.2096178690715,
"kl": 0.0104400634765625,
"learning_rate": 2e-07,
"loss": 0.01734369993209839,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000074505806,
"reward_std": 0.33700530230998993,
"rewards/MultiModalAccuracyORM": 0.3250000074505806,
"step": 1150,
"train_speed(iter/s)": 0.027419
},
{
"clip_ratio": 0.0,
"completion_length": 308.65,
"epoch": 0.4666666666666667,
"grad_norm": 1.3995623416059861,
"kl": 0.020782470703125,
"learning_rate": 2e-07,
"loss": 0.004217700660228729,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.20594746768474578,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 1155,
"train_speed(iter/s)": 0.027419
},
{
"clip_ratio": 0.0,
"completion_length": 229.45,
"epoch": 0.4686868686868687,
"grad_norm": 7.604841869136694,
"kl": 0.017425537109375,
"learning_rate": 2e-07,
"loss": 0.04910666048526764,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000029802322,
"reward_std": 0.3408561676740646,
"rewards/MultiModalAccuracyORM": 0.3000000029802322,
"step": 1160,
"train_speed(iter/s)": 0.02739
},
{
"clip_ratio": 0.0,
"completion_length": 279.1,
"epoch": 0.4707070707070707,
"grad_norm": 1.7338556861412973,
"kl": 0.009881591796875,
"learning_rate": 2e-07,
"loss": -0.02307046055793762,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000149011613,
"reward_std": 0.18081162869930267,
"rewards/MultiModalAccuracyORM": 0.17500000149011613,
"step": 1165,
"train_speed(iter/s)": 0.027388
},
{
"clip_ratio": 0.0,
"completion_length": 352.15,
"epoch": 0.4727272727272727,
"grad_norm": 1.2587552234540058,
"kl": 0.0092010498046875,
"learning_rate": 2e-07,
"loss": -0.05895323753356933,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666668429970743,
"reward_std": 0.40890581607818605,
"rewards/MultiModalAccuracyORM": 0.41666668429970743,
"step": 1170,
"train_speed(iter/s)": 0.027373
},
{
"clip_ratio": 0.0,
"completion_length": 381.2,
"epoch": 0.47474747474747475,
"grad_norm": 0.06683334066144007,
"kl": 0.01002349853515625,
"learning_rate": 2e-07,
"loss": 0.02935360074043274,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2416666716337204,
"reward_std": 0.27523933053016664,
"rewards/MultiModalAccuracyORM": 0.2416666716337204,
"step": 1175,
"train_speed(iter/s)": 0.027312
},
{
"clip_ratio": 0.0,
"completion_length": 443.15,
"epoch": 0.4767676767676768,
"grad_norm": 27.070556493942583,
"kl": 0.00930938720703125,
"learning_rate": 2e-07,
"loss": 0.0851466953754425,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000223517416,
"reward_std": 0.3342405825853348,
"rewards/MultiModalAccuracyORM": 0.25000000223517416,
"step": 1180,
"train_speed(iter/s)": 0.027331
},
{
"clip_ratio": 0.0,
"completion_length": 403.55,
"epoch": 0.47878787878787876,
"grad_norm": 1.5534177345271625,
"kl": 0.0102996826171875,
"learning_rate": 2e-07,
"loss": 0.028819066286087037,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.266666679084301,
"reward_std": 0.3129431068897247,
"rewards/MultiModalAccuracyORM": 0.266666679084301,
"step": 1185,
"train_speed(iter/s)": 0.027335
},
{
"clip_ratio": 0.0,
"completion_length": 327.65,
"epoch": 0.4808080808080808,
"grad_norm": 2.8838868478156816,
"kl": 0.02685546875,
"learning_rate": 2e-07,
"loss": 0.006991004943847657,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000447034836,
"reward_std": 0.2323400765657425,
"rewards/MultiModalAccuracyORM": 0.17500000447034836,
"step": 1190,
"train_speed(iter/s)": 0.027082
},
{
"clip_ratio": 0.0,
"completion_length": 500.2,
"epoch": 0.48282828282828283,
"grad_norm": 2.6317167816627993,
"kl": 0.014031982421875,
"learning_rate": 2e-07,
"loss": -0.003238886594772339,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.20000000596046447,
"reward_std": 0.30388820767402647,
"rewards/MultiModalAccuracyORM": 0.20000000596046447,
"step": 1195,
"train_speed(iter/s)": 0.026955
},
{
"clip_ratio": 0.0,
"completion_length": 259.35,
"epoch": 0.48484848484848486,
"grad_norm": 53.95756362621299,
"kl": 0.0124114990234375,
"learning_rate": 2e-07,
"loss": -0.00888831913471222,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000029802322,
"reward_std": 0.29782613217830656,
"rewards/MultiModalAccuracyORM": 0.3000000029802322,
"step": 1200,
"train_speed(iter/s)": 0.026995
},
{
"clip_ratio": 0.0,
"completion_length": 287.0,
"epoch": 0.4868686868686869,
"grad_norm": 1.8840812265683782,
"kl": 0.016448974609375,
"learning_rate": 2e-07,
"loss": 0.024408812820911407,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667610406876,
"reward_std": 0.4253006011247635,
"rewards/MultiModalAccuracyORM": 0.34166667610406876,
"step": 1205,
"train_speed(iter/s)": 0.02702
},
{
"clip_ratio": 0.0,
"completion_length": 263.2,
"epoch": 0.4888888888888889,
"grad_norm": 2.267475237086073,
"kl": 0.01165771484375,
"learning_rate": 2e-07,
"loss": -0.02959960699081421,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4416666813194752,
"reward_std": 0.3111630380153656,
"rewards/MultiModalAccuracyORM": 0.4416666813194752,
"step": 1210,
"train_speed(iter/s)": 0.027058
},
{
"clip_ratio": 0.0,
"completion_length": 341.55,
"epoch": 0.4909090909090909,
"grad_norm": 1.53249738300366,
"kl": 0.01207275390625,
"learning_rate": 2e-07,
"loss": 0.01664416640996933,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000089406967,
"reward_std": 0.39155901670455934,
"rewards/MultiModalAccuracyORM": 0.3500000089406967,
"step": 1215,
"train_speed(iter/s)": 0.027075
},
{
"clip_ratio": 0.0,
"completion_length": 346.35,
"epoch": 0.49292929292929294,
"grad_norm": 2.838473944184638,
"kl": 0.0138153076171875,
"learning_rate": 2e-07,
"loss": 0.011857110261917114,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.16666667237877847,
"reward_std": 0.32422170639038084,
"rewards/MultiModalAccuracyORM": 0.16666667237877847,
"step": 1220,
"train_speed(iter/s)": 0.027075
},
{
"clip_ratio": 0.0,
"completion_length": 340.2,
"epoch": 0.494949494949495,
"grad_norm": 2.239419757076915,
"kl": 0.0130462646484375,
"learning_rate": 2e-07,
"loss": 0.03971967101097107,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667088866234,
"reward_std": 0.23224489092826844,
"rewards/MultiModalAccuracyORM": 0.36666667088866234,
"step": 1225,
"train_speed(iter/s)": 0.027083
},
{
"clip_ratio": 0.0,
"completion_length": 244.8,
"epoch": 0.49696969696969695,
"grad_norm": 2.1763944900135637,
"kl": 0.0342437744140625,
"learning_rate": 2e-07,
"loss": -0.010297659039497375,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3583333410322666,
"reward_std": 0.3408351272344589,
"rewards/MultiModalAccuracyORM": 0.3583333410322666,
"step": 1230,
"train_speed(iter/s)": 0.027096
},
{
"clip_ratio": 0.0,
"completion_length": 293.45,
"epoch": 0.498989898989899,
"grad_norm": 6.002103596814289,
"kl": 0.020233154296875,
"learning_rate": 2e-07,
"loss": 0.08779069185256957,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000000670552255,
"reward_std": 0.35311026573181153,
"rewards/MultiModalAccuracyORM": 0.40000000670552255,
"step": 1235,
"train_speed(iter/s)": 0.027106
},
{
"clip_ratio": 0.0,
"completion_length": 468.45,
"epoch": 0.501010101010101,
"grad_norm": 1.7067044601090864,
"kl": 0.00786285400390625,
"learning_rate": 2e-07,
"loss": 0.05108952522277832,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000089406967,
"reward_std": 0.3033378630876541,
"rewards/MultiModalAccuracyORM": 0.3500000089406967,
"step": 1240,
"train_speed(iter/s)": 0.027091
},
{
"clip_ratio": 0.0,
"completion_length": 397.4,
"epoch": 0.503030303030303,
"grad_norm": 0.8938521798548926,
"kl": 0.009466552734375,
"learning_rate": 2e-07,
"loss": -0.01685338616371155,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333656191826,
"reward_std": 0.2292436480522156,
"rewards/MultiModalAccuracyORM": 0.13333333656191826,
"step": 1245,
"train_speed(iter/s)": 0.02707
},
{
"epoch": 0.5050505050505051,
"grad_norm": 3.702370322108623,
"learning_rate": 2e-07,
"loss": 0.036279809474945066,
"memory(GiB)": 87.45,
"step": 1250,
"train_speed(iter/s)": 0.027086
},
{
"epoch": 0.5050505050505051,
"eval_clip_ratio": 0.0,
"eval_completion_length": 321.4716763305664,
"eval_kl": 0.015718994140625,
"eval_loss": 0.013520264066755772,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.3033333399891853,
"eval_reward_std": 0.3383384072780609,
"eval_rewards/MultiModalAccuracyORM": 0.3033333399891853,
"eval_runtime": 765.5729,
"eval_samples_per_second": 0.065,
"eval_steps_per_second": 0.007,
"step": 1250
},
{
"clip_ratio": 0.0,
"completion_length": 349.475,
"epoch": 0.5070707070707071,
"grad_norm": 1.4811421198816048,
"kl": 0.01293487548828125,
"learning_rate": 2e-07,
"loss": 0.03056705594062805,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35833334140479567,
"reward_std": 0.38048321902751925,
"rewards/MultiModalAccuracyORM": 0.35833334140479567,
"step": 1255,
"train_speed(iter/s)": 0.026435
},
{
"clip_ratio": 0.0,
"completion_length": 209.15,
"epoch": 0.509090909090909,
"grad_norm": 2.0552411044504764,
"kl": 0.0252899169921875,
"learning_rate": 2e-07,
"loss": 0.028329643607139587,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334177732465,
"reward_std": 0.281466943025589,
"rewards/MultiModalAccuracyORM": 0.33333334177732465,
"step": 1260,
"train_speed(iter/s)": 0.026464
},
{
"clip_ratio": 0.0,
"completion_length": 380.4,
"epoch": 0.5111111111111111,
"grad_norm": 2.615766039038286,
"kl": 0.01002197265625,
"learning_rate": 2e-07,
"loss": 0.002955615520477295,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000521540644,
"reward_std": 0.2292436480522156,
"rewards/MultiModalAccuracyORM": 0.25000000521540644,
"step": 1265,
"train_speed(iter/s)": 0.02646
},
{
"clip_ratio": 0.0,
"completion_length": 338.85,
"epoch": 0.5131313131313131,
"grad_norm": 1.9893529067484352,
"kl": 0.011163330078125,
"learning_rate": 2e-07,
"loss": 0.018701747059822083,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.300000012665987,
"reward_std": 0.3127244770526886,
"rewards/MultiModalAccuracyORM": 0.300000012665987,
"step": 1270,
"train_speed(iter/s)": 0.026466
},
{
"clip_ratio": 0.0,
"completion_length": 253.65,
"epoch": 0.5151515151515151,
"grad_norm": 1.6843559930041148,
"kl": 0.0115509033203125,
"learning_rate": 2e-07,
"loss": 0.012320590019226075,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5000000111758709,
"reward_std": 0.345323646068573,
"rewards/MultiModalAccuracyORM": 0.5000000111758709,
"step": 1275,
"train_speed(iter/s)": 0.026464
},
{
"clip_ratio": 0.0,
"completion_length": 308.25,
"epoch": 0.5171717171717172,
"grad_norm": 3.0894548096911407,
"kl": 0.010302734375,
"learning_rate": 2e-07,
"loss": -0.02475722283124924,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000074505806,
"reward_std": 0.21999078691005708,
"rewards/MultiModalAccuracyORM": 0.2500000074505806,
"step": 1280,
"train_speed(iter/s)": 0.026449
},
{
"clip_ratio": 0.0,
"completion_length": 286.45,
"epoch": 0.5191919191919192,
"grad_norm": 0.056162470903676515,
"kl": 0.010772705078125,
"learning_rate": 2e-07,
"loss": -0.0004087850451469421,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667386889457,
"reward_std": 0.23105688095092775,
"rewards/MultiModalAccuracyORM": 0.21666667386889457,
"step": 1285,
"train_speed(iter/s)": 0.026422
},
{
"clip_ratio": 0.0,
"completion_length": 285.55,
"epoch": 0.5212121212121212,
"grad_norm": 1.7176303706462466,
"kl": 0.011578369140625,
"learning_rate": 2e-07,
"loss": 0.023639577627182006,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666693985462,
"reward_std": 0.28077210783958434,
"rewards/MultiModalAccuracyORM": 0.2666666693985462,
"step": 1290,
"train_speed(iter/s)": 0.026422
},
{
"clip_ratio": 0.0,
"completion_length": 285.65,
"epoch": 0.5232323232323233,
"grad_norm": 1.244445708488179,
"kl": 0.0103790283203125,
"learning_rate": 2e-07,
"loss": -0.017145507037639618,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3833333395421505,
"reward_std": 0.4086130350828171,
"rewards/MultiModalAccuracyORM": 0.3833333395421505,
"step": 1295,
"train_speed(iter/s)": 0.026442
},
{
"clip_ratio": 0.0,
"completion_length": 321.2,
"epoch": 0.5252525252525253,
"grad_norm": 1.7914388567184454,
"kl": 0.0092559814453125,
"learning_rate": 2e-07,
"loss": 0.054825717210769655,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333417773247,
"reward_std": 0.3536572724580765,
"rewards/MultiModalAccuracyORM": 0.3083333417773247,
"step": 1300,
"train_speed(iter/s)": 0.026415
},
{
"clip_ratio": 0.0,
"completion_length": 251.9,
"epoch": 0.5272727272727272,
"grad_norm": 2.6174114359405976,
"kl": 0.010308837890625,
"learning_rate": 2e-07,
"loss": -0.019986753165721894,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4500000074505806,
"reward_std": 0.3099655658006668,
"rewards/MultiModalAccuracyORM": 0.4500000074505806,
"step": 1305,
"train_speed(iter/s)": 0.026387
},
{
"clip_ratio": 0.0,
"completion_length": 264.9,
"epoch": 0.5292929292929293,
"grad_norm": 32.625329420627345,
"kl": 0.00882568359375,
"learning_rate": 2e-07,
"loss": 0.008027985692024231,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667312383654,
"reward_std": 0.40485736131668093,
"rewards/MultiModalAccuracyORM": 0.41666667312383654,
"step": 1310,
"train_speed(iter/s)": 0.026366
},
{
"clip_ratio": 0.0,
"completion_length": 356.3,
"epoch": 0.5313131313131313,
"grad_norm": 1.6706902692989012,
"kl": 0.0086761474609375,
"learning_rate": 2e-07,
"loss": 0.028931498527526855,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333334624767303,
"reward_std": 0.3558539390563965,
"rewards/MultiModalAccuracyORM": 0.28333334624767303,
"step": 1315,
"train_speed(iter/s)": 0.026338
},
{
"clip_ratio": 0.0,
"completion_length": 274.25,
"epoch": 0.5333333333333333,
"grad_norm": 1.8800912459209826,
"kl": 0.0249176025390625,
"learning_rate": 2e-07,
"loss": 0.048329290747642514,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 1320,
"train_speed(iter/s)": 0.026308
},
{
"clip_ratio": 0.0,
"completion_length": 374.3,
"epoch": 0.5353535353535354,
"grad_norm": 3.1086990293234904,
"kl": 0.01292724609375,
"learning_rate": 2e-07,
"loss": 0.006182897090911865,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667312383653,
"reward_std": 0.3867922484874725,
"rewards/MultiModalAccuracyORM": 0.34166667312383653,
"step": 1325,
"train_speed(iter/s)": 0.026274
},
{
"clip_ratio": 0.0,
"completion_length": 404.3,
"epoch": 0.5373737373737374,
"grad_norm": 0.08070215671871471,
"kl": 0.0099578857421875,
"learning_rate": 2e-07,
"loss": 0.062343114614486696,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000059604645,
"reward_std": 0.22625695466995238,
"rewards/MultiModalAccuracyORM": 0.3000000059604645,
"step": 1330,
"train_speed(iter/s)": 0.026241
},
{
"clip_ratio": 0.0,
"completion_length": 360.75,
"epoch": 0.5393939393939394,
"grad_norm": 3.4146119265895893,
"kl": 0.0290008544921875,
"learning_rate": 2e-07,
"loss": -0.02337663769721985,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000000819563863,
"reward_std": 0.31852359175682066,
"rewards/MultiModalAccuracyORM": 0.40000000819563863,
"step": 1335,
"train_speed(iter/s)": 0.026231
},
{
"clip_ratio": 0.0,
"completion_length": 300.85,
"epoch": 0.5414141414141415,
"grad_norm": 1.014030648475331,
"kl": 0.0152801513671875,
"learning_rate": 2e-07,
"loss": 0.03424631953239441,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3333333425223827,
"reward_std": 0.22807018756866454,
"rewards/MultiModalAccuracyORM": 0.3333333425223827,
"step": 1340,
"train_speed(iter/s)": 0.026218
},
{
"clip_ratio": 0.0,
"completion_length": 297.5,
"epoch": 0.5434343434343434,
"grad_norm": 2.579076344272663,
"kl": 0.0294189453125,
"learning_rate": 2e-07,
"loss": -0.004431784152984619,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.425000012665987,
"reward_std": 0.3433456152677536,
"rewards/MultiModalAccuracyORM": 0.425000012665987,
"step": 1345,
"train_speed(iter/s)": 0.026212
},
{
"clip_ratio": 0.0,
"completion_length": 365.2,
"epoch": 0.5454545454545454,
"grad_norm": 0.09604007460689165,
"kl": 0.0132415771484375,
"learning_rate": 2e-07,
"loss": 0.011541323363780975,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333730697632,
"reward_std": 0.24961273670196532,
"rewards/MultiModalAccuracyORM": 0.15833333730697632,
"step": 1350,
"train_speed(iter/s)": 0.026199
},
{
"clip_ratio": 0.0,
"completion_length": 294.2,
"epoch": 0.5474747474747474,
"grad_norm": 2.8630066616840306,
"kl": 0.0131500244140625,
"learning_rate": 2e-07,
"loss": 0.0038095355033874513,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333484828472,
"reward_std": 0.371958914399147,
"rewards/MultiModalAccuracyORM": 0.4083333484828472,
"step": 1355,
"train_speed(iter/s)": 0.026195
},
{
"clip_ratio": 0.0,
"completion_length": 290.35,
"epoch": 0.5494949494949495,
"grad_norm": 2.8462264230542202,
"kl": 0.0113922119140625,
"learning_rate": 2e-07,
"loss": -0.013850301504135132,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.13333333879709244,
"reward_std": 0.23857065439224243,
"rewards/MultiModalAccuracyORM": 0.13333333879709244,
"step": 1360,
"train_speed(iter/s)": 0.026178
},
{
"clip_ratio": 0.0,
"completion_length": 352.8,
"epoch": 0.5515151515151515,
"grad_norm": 1.9037157526983224,
"kl": 0.0115966796875,
"learning_rate": 2e-07,
"loss": 0.061475354433059695,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333334252238274,
"reward_std": 0.37644500732421876,
"rewards/MultiModalAccuracyORM": 0.28333334252238274,
"step": 1365,
"train_speed(iter/s)": 0.026169
},
{
"clip_ratio": 0.0,
"completion_length": 290.25,
"epoch": 0.5535353535353535,
"grad_norm": 1.5230914677267515,
"kl": 0.012347412109375,
"learning_rate": 2e-07,
"loss": 0.02505878210067749,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.26496326327323916,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 1370,
"train_speed(iter/s)": 0.026162
},
{
"clip_ratio": 0.0,
"completion_length": 304.7,
"epoch": 0.5555555555555556,
"grad_norm": 1.9879722073308892,
"kl": 0.0135223388671875,
"learning_rate": 2e-07,
"loss": 0.010433109104633331,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.18332211077213287,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 1375,
"train_speed(iter/s)": 0.026157
},
{
"clip_ratio": 0.0,
"completion_length": 319.9,
"epoch": 0.5575757575757576,
"grad_norm": 2.649637312336083,
"kl": 0.012469482421875,
"learning_rate": 2e-07,
"loss": 0.009650683403015137,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333333805203438,
"reward_std": 0.3890485167503357,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 1380,
"train_speed(iter/s)": 0.02615
},
{
"clip_ratio": 0.0,
"completion_length": 355.05,
"epoch": 0.5595959595959596,
"grad_norm": 0.05006149717815439,
"kl": 0.016656494140625,
"learning_rate": 2e-07,
"loss": -0.007993972301483155,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667386889455,
"reward_std": 0.3541334718465805,
"rewards/MultiModalAccuracyORM": 0.29166667386889455,
"step": 1385,
"train_speed(iter/s)": 0.026129
},
{
"clip_ratio": 0.0,
"completion_length": 155.65,
"epoch": 0.5616161616161616,
"grad_norm": 0.08079407011077554,
"kl": 0.01981201171875,
"learning_rate": 2e-07,
"loss": 0.03422499895095825,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833334103226663,
"reward_std": 0.3597048044204712,
"rewards/MultiModalAccuracyORM": 0.25833334103226663,
"step": 1390,
"train_speed(iter/s)": 0.026124
},
{
"clip_ratio": 0.0,
"completion_length": 411.3,
"epoch": 0.5636363636363636,
"grad_norm": 2.595093461800728,
"kl": 0.016748046875,
"learning_rate": 2e-07,
"loss": 0.0661674439907074,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667610406877,
"reward_std": 0.41412476599216463,
"rewards/MultiModalAccuracyORM": 0.41666667610406877,
"step": 1395,
"train_speed(iter/s)": 0.02611
},
{
"clip_ratio": 0.0,
"completion_length": 312.3,
"epoch": 0.5656565656565656,
"grad_norm": 1.8524460034780388,
"kl": 0.0219970703125,
"learning_rate": 2e-07,
"loss": 0.0748141050338745,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333410322666,
"reward_std": 0.3222051203250885,
"rewards/MultiModalAccuracyORM": 0.2833333410322666,
"step": 1400,
"train_speed(iter/s)": 0.026104
},
{
"clip_ratio": 0.0,
"completion_length": 284.7,
"epoch": 0.5676767676767677,
"grad_norm": 1.8645433263018287,
"kl": 0.020556640625,
"learning_rate": 2e-07,
"loss": -0.019703832268714905,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333402872086,
"reward_std": 0.23230449855327606,
"rewards/MultiModalAccuracyORM": 0.2833333402872086,
"step": 1405,
"train_speed(iter/s)": 0.026096
},
{
"clip_ratio": 0.0,
"completion_length": 301.15,
"epoch": 0.5696969696969697,
"grad_norm": 2.007508731899272,
"kl": 0.014324951171875,
"learning_rate": 2e-07,
"loss": 0.026613450050354003,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000000819563863,
"reward_std": 0.26928699016571045,
"rewards/MultiModalAccuracyORM": 0.40000000819563863,
"step": 1410,
"train_speed(iter/s)": 0.026082
},
{
"clip_ratio": 0.0,
"completion_length": 384.6,
"epoch": 0.5717171717171717,
"grad_norm": 1.3049808616717113,
"kl": 0.0161651611328125,
"learning_rate": 2e-07,
"loss": -0.019157709181308748,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833334103226663,
"reward_std": 0.3352662205696106,
"rewards/MultiModalAccuracyORM": 0.25833334103226663,
"step": 1415,
"train_speed(iter/s)": 0.026066
},
{
"clip_ratio": 0.0,
"completion_length": 356.4,
"epoch": 0.5737373737373738,
"grad_norm": 1.7990652267186868,
"kl": 0.021240234375,
"learning_rate": 2e-07,
"loss": 0.043132427334785464,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000298023224,
"reward_std": 0.2159808874130249,
"rewards/MultiModalAccuracyORM": 0.17500000298023224,
"step": 1420,
"train_speed(iter/s)": 0.026059
},
{
"clip_ratio": 0.0,
"completion_length": 288.85,
"epoch": 0.5757575757575758,
"grad_norm": 1.3873829792776142,
"kl": 0.017431640625,
"learning_rate": 2e-07,
"loss": 0.010021258890628815,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.45833334475755694,
"reward_std": 0.2770525634288788,
"rewards/MultiModalAccuracyORM": 0.45833334475755694,
"step": 1425,
"train_speed(iter/s)": 0.026059
},
{
"clip_ratio": 0.0,
"completion_length": 296.05,
"epoch": 0.5777777777777777,
"grad_norm": 1.6565432442769377,
"kl": 0.0139190673828125,
"learning_rate": 2e-07,
"loss": 0.016829773783683777,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667312383654,
"reward_std": 0.40086200535297395,
"rewards/MultiModalAccuracyORM": 0.29166667312383654,
"step": 1430,
"train_speed(iter/s)": 0.026063
},
{
"clip_ratio": 0.0,
"completion_length": 370.7,
"epoch": 0.5797979797979798,
"grad_norm": 1.2410328295318487,
"kl": 0.015863037109375,
"learning_rate": 2e-07,
"loss": -0.04091094434261322,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666731238365,
"reward_std": 0.3603756338357925,
"rewards/MultiModalAccuracyORM": 0.3166666731238365,
"step": 1435,
"train_speed(iter/s)": 0.026053
},
{
"clip_ratio": 0.0,
"completion_length": 305.7,
"epoch": 0.5818181818181818,
"grad_norm": 2.659138324217993,
"kl": 0.01724853515625,
"learning_rate": 2e-07,
"loss": 0.08770001530647278,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3333333425223827,
"reward_std": 0.4456100821495056,
"rewards/MultiModalAccuracyORM": 0.3333333425223827,
"step": 1440,
"train_speed(iter/s)": 0.026045
},
{
"clip_ratio": 0.0,
"completion_length": 321.15,
"epoch": 0.5838383838383838,
"grad_norm": 2.6855533659279462,
"kl": 0.015350341796875,
"learning_rate": 2e-07,
"loss": -0.03101794719696045,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1666666716337204,
"reward_std": 0.2644129186868668,
"rewards/MultiModalAccuracyORM": 0.1666666716337204,
"step": 1445,
"train_speed(iter/s)": 0.026039
},
{
"clip_ratio": 0.0,
"completion_length": 353.85,
"epoch": 0.5858585858585859,
"grad_norm": 0.8787033948980154,
"kl": 0.018048095703125,
"learning_rate": 2e-07,
"loss": 0.021743962168693544,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1500000014901161,
"reward_std": 0.2496483266353607,
"rewards/MultiModalAccuracyORM": 0.1500000014901161,
"step": 1450,
"train_speed(iter/s)": 0.026027
},
{
"clip_ratio": 0.0,
"completion_length": 329.65,
"epoch": 0.5878787878787879,
"grad_norm": 2.6089377973235917,
"kl": 0.0154541015625,
"learning_rate": 2e-07,
"loss": -0.0126606285572052,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833333730697633,
"reward_std": 0.287842845916748,
"rewards/MultiModalAccuracyORM": 0.25833333730697633,
"step": 1455,
"train_speed(iter/s)": 0.026012
},
{
"clip_ratio": 0.0,
"completion_length": 353.55,
"epoch": 0.5898989898989899,
"grad_norm": 3.1599228273908895,
"kl": 0.017535400390625,
"learning_rate": 2e-07,
"loss": 0.03227808475494385,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000089406967,
"reward_std": 0.2754935622215271,
"rewards/MultiModalAccuracyORM": 0.3500000089406967,
"step": 1460,
"train_speed(iter/s)": 0.025992
},
{
"clip_ratio": 0.0,
"completion_length": 247.35,
"epoch": 0.591919191919192,
"grad_norm": 3.772779516485284,
"kl": 0.016162109375,
"learning_rate": 2e-07,
"loss": -0.006427288055419922,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667088866234,
"reward_std": 0.3214506834745407,
"rewards/MultiModalAccuracyORM": 0.21666667088866234,
"step": 1465,
"train_speed(iter/s)": 0.02598
},
{
"clip_ratio": 0.0,
"completion_length": 300.7,
"epoch": 0.593939393939394,
"grad_norm": 1.9048234622524929,
"kl": 0.019964599609375,
"learning_rate": 2e-07,
"loss": 0.02089463174343109,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334550261495,
"reward_std": 0.39707074165344236,
"rewards/MultiModalAccuracyORM": 0.33333334550261495,
"step": 1470,
"train_speed(iter/s)": 0.025963
},
{
"clip_ratio": 0.0,
"completion_length": 372.2,
"epoch": 0.5959595959595959,
"grad_norm": 1.7167051608215667,
"kl": 0.0126953125,
"learning_rate": 2e-07,
"loss": 0.0002398371696472168,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.3485645651817322,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 1475,
"train_speed(iter/s)": 0.025929
},
{
"clip_ratio": 0.0,
"completion_length": 286.9,
"epoch": 0.597979797979798,
"grad_norm": 2.018355689891589,
"kl": 0.014324951171875,
"learning_rate": 2e-07,
"loss": 0.025476664304733276,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5250000104308128,
"reward_std": 0.3463323086500168,
"rewards/MultiModalAccuracyORM": 0.5250000104308128,
"step": 1480,
"train_speed(iter/s)": 0.025899
},
{
"clip_ratio": 0.0,
"completion_length": 286.35,
"epoch": 0.6,
"grad_norm": 1.9564498539046626,
"kl": 0.013104248046875,
"learning_rate": 2e-07,
"loss": -0.0017219483852386475,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666753590107,
"reward_std": 0.3392761141061783,
"rewards/MultiModalAccuracyORM": 0.2666666753590107,
"step": 1485,
"train_speed(iter/s)": 0.025884
},
{
"clip_ratio": 0.0,
"completion_length": 371.55,
"epoch": 0.602020202020202,
"grad_norm": 3.3586873596373836,
"kl": 0.0190948486328125,
"learning_rate": 2e-07,
"loss": -0.015026980638504028,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334550261495,
"reward_std": 0.43529842495918275,
"rewards/MultiModalAccuracyORM": 0.33333334550261495,
"step": 1490,
"train_speed(iter/s)": 0.02585
},
{
"clip_ratio": 0.0,
"completion_length": 352.1,
"epoch": 0.604040404040404,
"grad_norm": 1.5566031738878978,
"kl": 0.0152313232421875,
"learning_rate": 2e-07,
"loss": 0.05221402645111084,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4666666716337204,
"reward_std": 0.3853524446487427,
"rewards/MultiModalAccuracyORM": 0.4666666716337204,
"step": 1495,
"train_speed(iter/s)": 0.025826
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.092725055214899,
"learning_rate": 2e-07,
"loss": 0.044440290331840514,
"memory(GiB)": 87.45,
"step": 1500,
"train_speed(iter/s)": 0.025794
},
{
"epoch": 0.6060606060606061,
"eval_clip_ratio": 0.0,
"eval_completion_length": 332.07667766571046,
"eval_kl": 0.03210205078125,
"eval_loss": 0.03433879837393761,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.32333334147930143,
"eval_reward_std": 0.34949765503406527,
"eval_rewards/MultiModalAccuracyORM": 0.32333334147930143,
"eval_runtime": 946.9078,
"eval_samples_per_second": 0.053,
"eval_steps_per_second": 0.005,
"step": 1500
},
{
"clip_ratio": 0.0,
"completion_length": 307.3,
"epoch": 0.6080808080808081,
"grad_norm": 1.594406200527781,
"kl": 0.01402130126953125,
"learning_rate": 2e-07,
"loss": 0.011821150779724121,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30000000447034836,
"reward_std": 0.29021300822496415,
"rewards/MultiModalAccuracyORM": 0.30000000447034836,
"step": 1505,
"train_speed(iter/s)": 0.02519
},
{
"clip_ratio": 0.0,
"completion_length": 310.35,
"epoch": 0.6101010101010101,
"grad_norm": 1.872354266566466,
"kl": 0.01295166015625,
"learning_rate": 2e-07,
"loss": 0.040472963452339174,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667237877845,
"reward_std": 0.24481281042098998,
"rewards/MultiModalAccuracyORM": 0.39166667237877845,
"step": 1510,
"train_speed(iter/s)": 0.025138
},
{
"clip_ratio": 0.0,
"completion_length": 324.25,
"epoch": 0.6121212121212121,
"grad_norm": 2.2298458448624032,
"kl": 0.017498779296875,
"learning_rate": 2e-07,
"loss": -0.003679761290550232,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666753590107,
"reward_std": 0.33752005696296694,
"rewards/MultiModalAccuracyORM": 0.2666666753590107,
"step": 1515,
"train_speed(iter/s)": 0.02512
},
{
"clip_ratio": 0.0,
"completion_length": 495.3,
"epoch": 0.6141414141414141,
"grad_norm": 2.1057358539094637,
"kl": 0.013360595703125,
"learning_rate": 2e-07,
"loss": -0.040804427862167356,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.11666667088866234,
"reward_std": 0.22625695466995238,
"rewards/MultiModalAccuracyORM": 0.11666667088866234,
"step": 1520,
"train_speed(iter/s)": 0.025038
},
{
"clip_ratio": 0.0,
"completion_length": 355.45,
"epoch": 0.6161616161616161,
"grad_norm": 1.7271901034384924,
"kl": 0.01778564453125,
"learning_rate": 2e-07,
"loss": 0.04612007737159729,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667759418485,
"reward_std": 0.385197651386261,
"rewards/MultiModalAccuracyORM": 0.29166667759418485,
"step": 1525,
"train_speed(iter/s)": 0.025
},
{
"clip_ratio": 0.0,
"completion_length": 331.4,
"epoch": 0.6181818181818182,
"grad_norm": 2.251271699623951,
"kl": 0.015338134765625,
"learning_rate": 2e-07,
"loss": 0.07724932432174683,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40833334177732467,
"reward_std": 0.39786076843738555,
"rewards/MultiModalAccuracyORM": 0.40833334177732467,
"step": 1530,
"train_speed(iter/s)": 0.024971
},
{
"clip_ratio": 0.0,
"completion_length": 469.35,
"epoch": 0.6202020202020202,
"grad_norm": 3.517799255266591,
"kl": 0.021319580078125,
"learning_rate": 2e-07,
"loss": -0.042039293050765994,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2000000074505806,
"reward_std": 0.27122943103313446,
"rewards/MultiModalAccuracyORM": 0.2000000074505806,
"step": 1535,
"train_speed(iter/s)": 0.02492
},
{
"clip_ratio": 0.0,
"completion_length": 414.3,
"epoch": 0.6222222222222222,
"grad_norm": 2.5032184616862736,
"kl": 0.023309326171875,
"learning_rate": 2e-07,
"loss": 0.004111546277999878,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.36037562787532806,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 1540,
"train_speed(iter/s)": 0.024886
},
{
"clip_ratio": 0.0,
"completion_length": 379.65,
"epoch": 0.6242424242424243,
"grad_norm": 1.3788944987112297,
"kl": 0.018865966796875,
"learning_rate": 2e-07,
"loss": 0.03875549137592316,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.14166666939854622,
"reward_std": 0.275529146194458,
"rewards/MultiModalAccuracyORM": 0.14166666939854622,
"step": 1545,
"train_speed(iter/s)": 0.02484
},
{
"clip_ratio": 0.0,
"completion_length": 418.9,
"epoch": 0.6262626262626263,
"grad_norm": 1.8495513561932837,
"kl": 0.02667236328125,
"learning_rate": 2e-07,
"loss": 0.006523740291595459,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333656191825,
"reward_std": 0.3144540905952454,
"rewards/MultiModalAccuracyORM": 0.15833333656191825,
"step": 1550,
"train_speed(iter/s)": 0.024776
},
{
"clip_ratio": 0.0,
"completion_length": 346.2,
"epoch": 0.6282828282828283,
"grad_norm": 1.753463603338966,
"kl": 0.030621337890625,
"learning_rate": 2e-07,
"loss": -0.08293852806091309,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000149011614,
"reward_std": 0.23083824515342713,
"rewards/MultiModalAccuracyORM": 0.25000000149011614,
"step": 1555,
"train_speed(iter/s)": 0.02475
},
{
"clip_ratio": 0.0,
"completion_length": 382.7,
"epoch": 0.6303030303030303,
"grad_norm": 2.663595112199716,
"kl": 0.0219970703125,
"learning_rate": 2e-07,
"loss": -0.002608485519886017,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35000000447034835,
"reward_std": 0.27756678462028506,
"rewards/MultiModalAccuracyORM": 0.35000000447034835,
"step": 1560,
"train_speed(iter/s)": 0.024719
},
{
"clip_ratio": 0.0,
"completion_length": 285.55,
"epoch": 0.6323232323232323,
"grad_norm": 1.803682568463378,
"kl": 0.02052001953125,
"learning_rate": 2e-07,
"loss": -0.031521540880203244,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333440124989,
"reward_std": 0.24935851097106934,
"rewards/MultiModalAccuracyORM": 0.2833333440124989,
"step": 1565,
"train_speed(iter/s)": 0.024694
},
{
"clip_ratio": 0.0,
"completion_length": 464.5,
"epoch": 0.6343434343434343,
"grad_norm": 1.9551331787297712,
"kl": 0.012725830078125,
"learning_rate": 2e-07,
"loss": 0.016904991865158082,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2250000059604645,
"reward_std": 0.3863160490989685,
"rewards/MultiModalAccuracyORM": 0.2250000059604645,
"step": 1570,
"train_speed(iter/s)": 0.024625
},
{
"clip_ratio": 0.0,
"completion_length": 280.85,
"epoch": 0.6363636363636364,
"grad_norm": 2.19696821448914,
"kl": 0.016156005859375,
"learning_rate": 2e-07,
"loss": 0.00793578326702118,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.38333333656191826,
"reward_std": 0.2260383188724518,
"rewards/MultiModalAccuracyORM": 0.38333333656191826,
"step": 1575,
"train_speed(iter/s)": 0.024598
},
{
"clip_ratio": 0.0,
"completion_length": 346.35,
"epoch": 0.6383838383838384,
"grad_norm": 0.10124868688137513,
"kl": 0.016912841796875,
"learning_rate": 2e-07,
"loss": -0.007649339735507965,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666666865348815,
"reward_std": 0.23634997606277466,
"rewards/MultiModalAccuracyORM": 0.26666666865348815,
"step": 1580,
"train_speed(iter/s)": 0.024579
},
{
"clip_ratio": 0.0,
"completion_length": 409.0,
"epoch": 0.6404040404040404,
"grad_norm": 1.6301877045933517,
"kl": 0.012744140625,
"learning_rate": 2e-07,
"loss": 0.013163220882415772,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000052154064,
"reward_std": 0.3906099498271942,
"rewards/MultiModalAccuracyORM": 0.3250000052154064,
"step": 1585,
"train_speed(iter/s)": 0.024565
},
{
"clip_ratio": 0.0,
"completion_length": 359.05,
"epoch": 0.6424242424242425,
"grad_norm": 2.155746940066879,
"kl": 0.016387939453125,
"learning_rate": 2e-07,
"loss": -0.006454774737358093,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2333333373069763,
"reward_std": 0.2855865776538849,
"rewards/MultiModalAccuracyORM": 0.2333333373069763,
"step": 1590,
"train_speed(iter/s)": 0.024526
},
{
"clip_ratio": 0.0,
"completion_length": 369.0,
"epoch": 0.6444444444444445,
"grad_norm": 2.831254989761031,
"kl": 0.0135467529296875,
"learning_rate": 2e-07,
"loss": 0.04445863664150238,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000149011612,
"reward_std": 0.28787843585014344,
"rewards/MultiModalAccuracyORM": 0.20000000149011612,
"step": 1595,
"train_speed(iter/s)": 0.024495
},
{
"clip_ratio": 0.0,
"completion_length": 279.45,
"epoch": 0.6464646464646465,
"grad_norm": 1.4752518445027274,
"kl": 0.017083740234375,
"learning_rate": 2e-07,
"loss": 0.03578461408615112,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666668131947516,
"reward_std": 0.33704385757446287,
"rewards/MultiModalAccuracyORM": 0.36666668131947516,
"step": 1600,
"train_speed(iter/s)": 0.024476
},
{
"clip_ratio": 0.0,
"completion_length": 355.65,
"epoch": 0.6484848484848484,
"grad_norm": 0.9187218241799472,
"kl": 0.016937255859375,
"learning_rate": 2e-07,
"loss": 0.02192138433456421,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000111758709,
"reward_std": 0.3222196638584137,
"rewards/MultiModalAccuracyORM": 0.3500000111758709,
"step": 1605,
"train_speed(iter/s)": 0.024426
},
{
"clip_ratio": 0.0,
"completion_length": 388.35,
"epoch": 0.6505050505050505,
"grad_norm": 1.7973159194566164,
"kl": 0.0144775390625,
"learning_rate": 2e-07,
"loss": 0.01784837543964386,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.12500000298023223,
"reward_std": 0.2689731627702713,
"rewards/MultiModalAccuracyORM": 0.12500000298023223,
"step": 1610,
"train_speed(iter/s)": 0.024388
},
{
"clip_ratio": 0.0,
"completion_length": 397.55,
"epoch": 0.6525252525252525,
"grad_norm": 2.0318711993448617,
"kl": 0.018182373046875,
"learning_rate": 2e-07,
"loss": -0.02051687240600586,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4333333380520344,
"reward_std": 0.261207589507103,
"rewards/MultiModalAccuracyORM": 0.4333333380520344,
"step": 1615,
"train_speed(iter/s)": 0.024346
},
{
"clip_ratio": 0.0,
"completion_length": 339.3,
"epoch": 0.6545454545454545,
"grad_norm": 1.9030819605130962,
"kl": 0.0175079345703125,
"learning_rate": 2e-07,
"loss": 0.06623161435127259,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.37500000596046446,
"reward_std": 0.24885829985141755,
"rewards/MultiModalAccuracyORM": 0.37500000596046446,
"step": 1620,
"train_speed(iter/s)": 0.024315
},
{
"clip_ratio": 0.0,
"completion_length": 280.4,
"epoch": 0.6565656565656566,
"grad_norm": 2.08045815446475,
"kl": 0.0169708251953125,
"learning_rate": 2e-07,
"loss": -0.013642898201942444,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000029802322,
"reward_std": 0.3378098726272583,
"rewards/MultiModalAccuracyORM": 0.2500000029802322,
"step": 1625,
"train_speed(iter/s)": 0.024289
},
{
"clip_ratio": 0.0,
"completion_length": 400.75,
"epoch": 0.6585858585858586,
"grad_norm": 1.436661872799103,
"kl": 0.0193359375,
"learning_rate": 2e-07,
"loss": 0.02239292562007904,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333805203437,
"reward_std": 0.2629852324724197,
"rewards/MultiModalAccuracyORM": 0.15833333805203437,
"step": 1630,
"train_speed(iter/s)": 0.024248
},
{
"clip_ratio": 0.0,
"completion_length": 270.4,
"epoch": 0.6606060606060606,
"grad_norm": 2.5008411774286494,
"kl": 0.020758056640625,
"learning_rate": 2e-07,
"loss": 0.02127687931060791,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333432674408,
"reward_std": 0.3023863762617111,
"rewards/MultiModalAccuracyORM": 0.4083333432674408,
"step": 1635,
"train_speed(iter/s)": 0.024227
},
{
"clip_ratio": 0.0,
"completion_length": 324.8,
"epoch": 0.6626262626262627,
"grad_norm": 2.6410537415459125,
"kl": 0.02030029296875,
"learning_rate": 2e-07,
"loss": 0.05219934582710266,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667684912684,
"reward_std": 0.35006397068500517,
"rewards/MultiModalAccuracyORM": 0.41666667684912684,
"step": 1640,
"train_speed(iter/s)": 0.024199
},
{
"clip_ratio": 0.0,
"completion_length": 356.0,
"epoch": 0.6646464646464646,
"grad_norm": 2.4569826375450914,
"kl": 0.01795654296875,
"learning_rate": 2e-07,
"loss": 0.013086378574371338,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.45000001341104506,
"reward_std": 0.3337643891572952,
"rewards/MultiModalAccuracyORM": 0.45000001341104506,
"step": 1645,
"train_speed(iter/s)": 0.024168
},
{
"clip_ratio": 0.0,
"completion_length": 304.8,
"epoch": 0.6666666666666666,
"grad_norm": 1.9280627341583514,
"kl": 0.015191650390625,
"learning_rate": 2e-07,
"loss": 0.01907120943069458,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15000000447034836,
"reward_std": 0.30035116970539094,
"rewards/MultiModalAccuracyORM": 0.15000000447034836,
"step": 1650,
"train_speed(iter/s)": 0.024141
},
{
"clip_ratio": 0.0,
"completion_length": 419.0,
"epoch": 0.6686868686868687,
"grad_norm": 2.6312715310589687,
"kl": 0.015863037109375,
"learning_rate": 2e-07,
"loss": -0.04063203632831573,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333656191825,
"reward_std": 0.25741389989852903,
"rewards/MultiModalAccuracyORM": 0.15833333656191825,
"step": 1655,
"train_speed(iter/s)": 0.024076
},
{
"clip_ratio": 0.0,
"completion_length": 394.05,
"epoch": 0.6707070707070707,
"grad_norm": 0.9566291807644657,
"kl": 0.015057373046875,
"learning_rate": 2e-07,
"loss": 0.018163633346557618,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666667610406875,
"reward_std": 0.28446817994117735,
"rewards/MultiModalAccuracyORM": 0.26666667610406875,
"step": 1660,
"train_speed(iter/s)": 0.024043
},
{
"clip_ratio": 0.0,
"completion_length": 326.85,
"epoch": 0.6727272727272727,
"grad_norm": 1.9521868347750622,
"kl": 0.019769287109375,
"learning_rate": 2e-07,
"loss": -5.202591419219971e-05,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667535901069,
"reward_std": 0.23481498062610626,
"rewards/MultiModalAccuracyORM": 0.21666667535901069,
"step": 1665,
"train_speed(iter/s)": 0.024026
},
{
"clip_ratio": 0.0,
"completion_length": 316.4,
"epoch": 0.6747474747474748,
"grad_norm": 2.1472683375029757,
"kl": 0.01842041015625,
"learning_rate": 2e-07,
"loss": 0.08016844987869262,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667461395264,
"reward_std": 0.29655990600585935,
"rewards/MultiModalAccuracyORM": 0.24166667461395264,
"step": 1670,
"train_speed(iter/s)": 0.024002
},
{
"clip_ratio": 0.0,
"completion_length": 294.15,
"epoch": 0.6767676767676768,
"grad_norm": 2.136669782149022,
"kl": 0.012603759765625,
"learning_rate": 2e-07,
"loss": 0.03559441566467285,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333334103226662,
"reward_std": 0.31266487538814547,
"rewards/MultiModalAccuracyORM": 0.18333334103226662,
"step": 1675,
"train_speed(iter/s)": 0.023983
},
{
"clip_ratio": 0.0,
"completion_length": 349.7,
"epoch": 0.6787878787878788,
"grad_norm": 2.5120224393696056,
"kl": 0.033984375,
"learning_rate": 2e-07,
"loss": -0.02109343409538269,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333410322666,
"reward_std": 0.2629852324724197,
"rewards/MultiModalAccuracyORM": 0.2083333410322666,
"step": 1680,
"train_speed(iter/s)": 0.02395
},
{
"clip_ratio": 0.0,
"completion_length": 224.85,
"epoch": 0.6808080808080809,
"grad_norm": 2.7291188101039268,
"kl": 0.0185638427734375,
"learning_rate": 2e-07,
"loss": 0.06400806307792664,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.441666679084301,
"reward_std": 0.3586460083723068,
"rewards/MultiModalAccuracyORM": 0.441666679084301,
"step": 1685,
"train_speed(iter/s)": 0.023931
},
{
"clip_ratio": 0.0,
"completion_length": 204.25,
"epoch": 0.6828282828282828,
"grad_norm": 2.473418035792826,
"kl": 0.03394775390625,
"learning_rate": 2e-07,
"loss": 0.042749062180519104,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000044703484,
"reward_std": 0.30718872845172884,
"rewards/MultiModalAccuracyORM": 0.3250000044703484,
"step": 1690,
"train_speed(iter/s)": 0.023921
},
{
"clip_ratio": 0.0,
"completion_length": 321.45,
"epoch": 0.6848484848484848,
"grad_norm": 1.4363881715878042,
"kl": 0.023870849609375,
"learning_rate": 2e-07,
"loss": 0.007241478562355042,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000022351742,
"reward_std": 0.3244373768568039,
"rewards/MultiModalAccuracyORM": 0.3500000022351742,
"step": 1695,
"train_speed(iter/s)": 0.023909
},
{
"clip_ratio": 0.0,
"completion_length": 315.0,
"epoch": 0.6868686868686869,
"grad_norm": 2.953319073134284,
"kl": 0.023052978515625,
"learning_rate": 2e-07,
"loss": -0.010269761085510254,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.32500000596046447,
"reward_std": 0.3144540905952454,
"rewards/MultiModalAccuracyORM": 0.32500000596046447,
"step": 1700,
"train_speed(iter/s)": 0.023894
},
{
"clip_ratio": 0.0,
"completion_length": 281.35,
"epoch": 0.6888888888888889,
"grad_norm": 2.565868939994401,
"kl": 0.02255859375,
"learning_rate": 2e-07,
"loss": 0.018953490257263183,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3583333417773247,
"reward_std": 0.30840655863285066,
"rewards/MultiModalAccuracyORM": 0.3583333417773247,
"step": 1705,
"train_speed(iter/s)": 0.023885
},
{
"clip_ratio": 0.0,
"completion_length": 375.9,
"epoch": 0.6909090909090909,
"grad_norm": 0.6694533298035624,
"kl": 0.01932373046875,
"learning_rate": 2e-07,
"loss": 0.008337923884391784,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667535901069,
"reward_std": 0.2652415007352829,
"rewards/MultiModalAccuracyORM": 0.21666667535901069,
"step": 1710,
"train_speed(iter/s)": 0.02387
},
{
"clip_ratio": 0.0,
"completion_length": 275.95,
"epoch": 0.692929292929293,
"grad_norm": 1.567189433294113,
"kl": 0.030279541015625,
"learning_rate": 2e-07,
"loss": 0.03896563053131104,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667759418485,
"reward_std": 0.35563530325889586,
"rewards/MultiModalAccuracyORM": 0.41666667759418485,
"step": 1715,
"train_speed(iter/s)": 0.023857
},
{
"clip_ratio": 0.0,
"completion_length": 263.45,
"epoch": 0.694949494949495,
"grad_norm": 1.8167696383064045,
"kl": 0.0214141845703125,
"learning_rate": 2e-07,
"loss": 0.020650827884674074,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4333333484828472,
"reward_std": 0.39936017990112305,
"rewards/MultiModalAccuracyORM": 0.4333333484828472,
"step": 1720,
"train_speed(iter/s)": 0.023843
},
{
"clip_ratio": 0.0,
"completion_length": 363.55,
"epoch": 0.696969696969697,
"grad_norm": 2.213186558232037,
"kl": 0.0275634765625,
"learning_rate": 2e-07,
"loss": -0.008746334910392761,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.341666679084301,
"reward_std": 0.35490245223045347,
"rewards/MultiModalAccuracyORM": 0.341666679084301,
"step": 1725,
"train_speed(iter/s)": 0.023835
},
{
"clip_ratio": 0.0,
"completion_length": 343.9,
"epoch": 0.6989898989898989,
"grad_norm": 2.601045176615316,
"kl": 0.021826171875,
"learning_rate": 2e-07,
"loss": -0.03737230598926544,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667461395265,
"reward_std": 0.38179769814014436,
"rewards/MultiModalAccuracyORM": 0.31666667461395265,
"step": 1730,
"train_speed(iter/s)": 0.023822
},
{
"clip_ratio": 0.0,
"completion_length": 356.3,
"epoch": 0.701010101010101,
"grad_norm": 0.9407841462948962,
"kl": 0.0240234375,
"learning_rate": 2e-07,
"loss": -0.0031855762004852294,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2750000089406967,
"reward_std": 0.2511385798454285,
"rewards/MultiModalAccuracyORM": 0.2750000089406967,
"step": 1735,
"train_speed(iter/s)": 0.023807
},
{
"clip_ratio": 0.0,
"completion_length": 262.95,
"epoch": 0.703030303030303,
"grad_norm": 2.6759259468484413,
"kl": 0.0215087890625,
"learning_rate": 2e-07,
"loss": 0.025629484653472902,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.34166667312383653,
"reward_std": 0.3390218883752823,
"rewards/MultiModalAccuracyORM": 0.34166667312383653,
"step": 1740,
"train_speed(iter/s)": 0.023819
},
{
"clip_ratio": 0.0,
"completion_length": 280.85,
"epoch": 0.705050505050505,
"grad_norm": 1.6215662631256935,
"kl": 0.043084716796875,
"learning_rate": 2e-07,
"loss": 0.01873619556427002,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333395421505,
"reward_std": 0.23631438612937927,
"rewards/MultiModalAccuracyORM": 0.3083333395421505,
"step": 1745,
"train_speed(iter/s)": 0.023814
},
{
"epoch": 0.7070707070707071,
"grad_norm": 3.313730122510265,
"learning_rate": 2e-07,
"loss": -0.041856271028518674,
"memory(GiB)": 87.45,
"step": 1750,
"train_speed(iter/s)": 0.023773
},
{
"epoch": 0.7070707070707071,
"eval_clip_ratio": 0.0,
"eval_completion_length": 318.58167419433596,
"eval_kl": 0.0221929931640625,
"eval_loss": 0.0349855050444603,
"eval_response_clip_ratio": 0.001666666716337204,
"eval_reward": 0.2950000064074993,
"eval_reward_std": 0.3137217426300049,
"eval_rewards/MultiModalAccuracyORM": 0.2950000064074993,
"eval_runtime": 782.5117,
"eval_samples_per_second": 0.064,
"eval_steps_per_second": 0.006,
"step": 1750
},
{
"clip_ratio": 0.0,
"completion_length": 379.775,
"epoch": 0.7090909090909091,
"grad_norm": 1.446054468361364,
"kl": 0.0215576171875,
"learning_rate": 2e-07,
"loss": 0.013345304131507873,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333333805203438,
"reward_std": 0.3380433991551399,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 1755,
"train_speed(iter/s)": 0.023419
},
{
"clip_ratio": 0.0,
"completion_length": 307.45,
"epoch": 0.7111111111111111,
"grad_norm": 1.3947630704883345,
"kl": 0.018634033203125,
"learning_rate": 2e-07,
"loss": 0.010007500648498535,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000521540643,
"reward_std": 0.27148365676403047,
"rewards/MultiModalAccuracyORM": 0.17500000521540643,
"step": 1760,
"train_speed(iter/s)": 0.023454
},
{
"clip_ratio": 0.0,
"completion_length": 283.5,
"epoch": 0.7131313131313132,
"grad_norm": 2.218781010019711,
"kl": 0.021832275390625,
"learning_rate": 2e-07,
"loss": -0.013157431781291962,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333656191825,
"reward_std": 0.2652770906686783,
"rewards/MultiModalAccuracyORM": 0.15833333656191825,
"step": 1765,
"train_speed(iter/s)": 0.023491
},
{
"clip_ratio": 0.0,
"completion_length": 256.75,
"epoch": 0.7151515151515152,
"grad_norm": 1.7430710535513718,
"kl": 0.01793212890625,
"learning_rate": 2e-07,
"loss": 0.021530145406723024,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.26666667610406875,
"reward_std": 0.3066769391298294,
"rewards/MultiModalAccuracyORM": 0.26666667610406875,
"step": 1770,
"train_speed(iter/s)": 0.023528
},
{
"clip_ratio": 0.0,
"completion_length": 265.35,
"epoch": 0.7171717171717171,
"grad_norm": 1.7339756470338048,
"kl": 0.014569091796875,
"learning_rate": 2e-07,
"loss": -0.058446085453033446,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.2820172876119614,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 1775,
"train_speed(iter/s)": 0.023563
},
{
"clip_ratio": 0.0,
"completion_length": 525.0,
"epoch": 0.7191919191919192,
"grad_norm": 1.6384172396752068,
"kl": 0.0145233154296875,
"learning_rate": 2e-07,
"loss": -0.00234740674495697,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.33333334177732465,
"reward_std": 0.3890485167503357,
"rewards/MultiModalAccuracyORM": 0.33333334177732465,
"step": 1780,
"train_speed(iter/s)": 0.02359
},
{
"clip_ratio": 0.0,
"completion_length": 391.6,
"epoch": 0.7212121212121212,
"grad_norm": 2.6878660022854333,
"kl": 0.016748046875,
"learning_rate": 2e-07,
"loss": 0.03554516434669495,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333395421505,
"reward_std": 0.35974039435386657,
"rewards/MultiModalAccuracyORM": 0.2833333395421505,
"step": 1785,
"train_speed(iter/s)": 0.023622
},
{
"clip_ratio": 0.0,
"completion_length": 325.25,
"epoch": 0.7232323232323232,
"grad_norm": 2.4324428426946834,
"kl": 0.0128204345703125,
"learning_rate": 2e-07,
"loss": -0.047456872463226316,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833333730697633,
"reward_std": 0.2970361053943634,
"rewards/MultiModalAccuracyORM": 0.25833333730697633,
"step": 1790,
"train_speed(iter/s)": 0.023655
},
{
"clip_ratio": 0.0,
"completion_length": 343.65,
"epoch": 0.7252525252525253,
"grad_norm": 1.8618904482502028,
"kl": 0.0149169921875,
"learning_rate": 2e-07,
"loss": 0.009033694863319397,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2583333387970924,
"reward_std": 0.21750431060791015,
"rewards/MultiModalAccuracyORM": 0.2583333387970924,
"step": 1795,
"train_speed(iter/s)": 0.023686
},
{
"clip_ratio": 0.0,
"completion_length": 369.05,
"epoch": 0.7272727272727273,
"grad_norm": 3.36471551001556,
"kl": 0.02044677734375,
"learning_rate": 2e-07,
"loss": 0.010516098141670227,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666731238365,
"reward_std": 0.21218962371349334,
"rewards/MultiModalAccuracyORM": 0.3166666731238365,
"step": 1800,
"train_speed(iter/s)": 0.023721
},
{
"clip_ratio": 0.0,
"completion_length": 351.05,
"epoch": 0.7292929292929293,
"grad_norm": 3.723751882855137,
"kl": 0.023046875,
"learning_rate": 2e-07,
"loss": -0.02001919746398926,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333447575569,
"reward_std": 0.28128686249256135,
"rewards/MultiModalAccuracyORM": 0.4083333447575569,
"step": 1805,
"train_speed(iter/s)": 0.023755
},
{
"clip_ratio": 0.0,
"completion_length": 335.35,
"epoch": 0.7313131313131314,
"grad_norm": 54.701999328620005,
"kl": 0.02723388671875,
"learning_rate": 2e-07,
"loss": 0.03721327781677246,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2416666753590107,
"reward_std": 0.2910481750965118,
"rewards/MultiModalAccuracyORM": 0.2416666753590107,
"step": 1810,
"train_speed(iter/s)": 0.02379
},
{
"clip_ratio": 0.0,
"completion_length": 225.45,
"epoch": 0.7333333333333333,
"grad_norm": 3.0855092667576733,
"kl": 0.015704345703125,
"learning_rate": 2e-07,
"loss": -0.037659955024719236,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667759418486,
"reward_std": 0.36648276150226594,
"rewards/MultiModalAccuracyORM": 0.36666667759418486,
"step": 1815,
"train_speed(iter/s)": 0.023829
},
{
"clip_ratio": 0.0,
"completion_length": 306.3,
"epoch": 0.7353535353535353,
"grad_norm": 2.1896027058768217,
"kl": 0.01336669921875,
"learning_rate": 2e-07,
"loss": 0.02186403125524521,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4416666753590107,
"reward_std": 0.2956440091133118,
"rewards/MultiModalAccuracyORM": 0.4416666753590107,
"step": 1820,
"train_speed(iter/s)": 0.023865
},
{
"clip_ratio": 0.0,
"completion_length": 297.95,
"epoch": 0.7373737373737373,
"grad_norm": 1.540468825830471,
"kl": 0.010992431640625,
"learning_rate": 2e-07,
"loss": 0.03888830542564392,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1583333395421505,
"reward_std": 0.21368902921676636,
"rewards/MultiModalAccuracyORM": 0.1583333395421505,
"step": 1825,
"train_speed(iter/s)": 0.023899
},
{
"clip_ratio": 0.0,
"completion_length": 366.15,
"epoch": 0.7393939393939394,
"grad_norm": 49.26742721312377,
"kl": 0.0157135009765625,
"learning_rate": 2e-07,
"loss": -0.0031795650720596313,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1333333395421505,
"reward_std": 0.2736803233623505,
"rewards/MultiModalAccuracyORM": 0.1333333395421505,
"step": 1830,
"train_speed(iter/s)": 0.023929
},
{
"clip_ratio": 0.0,
"completion_length": 352.0,
"epoch": 0.7414141414141414,
"grad_norm": 1.2425141205561836,
"kl": 0.0211181640625,
"learning_rate": 2e-07,
"loss": -0.01690070778131485,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.33905747830867766,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 1835,
"train_speed(iter/s)": 0.023961
},
{
"clip_ratio": 0.0,
"completion_length": 377.15,
"epoch": 0.7434343434343434,
"grad_norm": 2.8910783603707144,
"kl": 0.0198638916015625,
"learning_rate": 2e-07,
"loss": 0.06207960844039917,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.40000001043081285,
"reward_std": 0.38306058645248414,
"rewards/MultiModalAccuracyORM": 0.40000001043081285,
"step": 1840,
"train_speed(iter/s)": 0.023987
},
{
"clip_ratio": 0.0,
"completion_length": 398.35,
"epoch": 0.7454545454545455,
"grad_norm": 14.235626745032626,
"kl": 0.019775390625,
"learning_rate": 2e-07,
"loss": 0.037658247351646426,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.16666666865348817,
"reward_std": 0.12708649039268494,
"rewards/MultiModalAccuracyORM": 0.16666666865348817,
"step": 1845,
"train_speed(iter/s)": 0.024018
},
{
"clip_ratio": 0.0,
"completion_length": 328.4,
"epoch": 0.7474747474747475,
"grad_norm": 1.833635434555557,
"kl": 0.018505859375,
"learning_rate": 2e-07,
"loss": -0.026553609967231752,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3750000111758709,
"reward_std": 0.34710129499435427,
"rewards/MultiModalAccuracyORM": 0.3750000111758709,
"step": 1850,
"train_speed(iter/s)": 0.024051
},
{
"clip_ratio": 0.0,
"completion_length": 394.6,
"epoch": 0.7494949494949495,
"grad_norm": 1.825594490175896,
"kl": 0.02091064453125,
"learning_rate": 2e-07,
"loss": 0.02868058383464813,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334550261495,
"reward_std": 0.3127244710922241,
"rewards/MultiModalAccuracyORM": 0.33333334550261495,
"step": 1855,
"train_speed(iter/s)": 0.024084
},
{
"clip_ratio": 0.0,
"completion_length": 387.45,
"epoch": 0.7515151515151515,
"grad_norm": 1.3722283938123239,
"kl": 0.023919677734375,
"learning_rate": 2e-07,
"loss": 0.017566892504692077,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.14166666865348815,
"reward_std": 0.32900004684925077,
"rewards/MultiModalAccuracyORM": 0.14166666865348815,
"step": 1860,
"train_speed(iter/s)": 0.024119
},
{
"clip_ratio": 0.0,
"completion_length": 370.25,
"epoch": 0.7535353535353535,
"grad_norm": 3.3603602877653964,
"kl": 0.023779296875,
"learning_rate": 2e-07,
"loss": 0.051629495620727536,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334177732465,
"reward_std": 0.4036242991685867,
"rewards/MultiModalAccuracyORM": 0.33333334177732465,
"step": 1865,
"train_speed(iter/s)": 0.02415
},
{
"clip_ratio": 0.0,
"completion_length": 306.4,
"epoch": 0.7555555555555555,
"grad_norm": 4.690429815238561,
"kl": 0.0260162353515625,
"learning_rate": 2e-07,
"loss": -0.004315692186355591,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000029802322,
"reward_std": 0.2940108567476273,
"rewards/MultiModalAccuracyORM": 0.2500000029802322,
"step": 1870,
"train_speed(iter/s)": 0.024182
},
{
"clip_ratio": 0.0,
"completion_length": 274.45,
"epoch": 0.7575757575757576,
"grad_norm": 2.7051519330762646,
"kl": 0.0303466796875,
"learning_rate": 2e-07,
"loss": -0.008211909234523774,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25833334103226663,
"reward_std": 0.3237069517374039,
"rewards/MultiModalAccuracyORM": 0.25833334103226663,
"step": 1875,
"train_speed(iter/s)": 0.024217
},
{
"clip_ratio": 0.0,
"completion_length": 409.5,
"epoch": 0.7595959595959596,
"grad_norm": 2.8417211154013895,
"kl": 0.02593994140625,
"learning_rate": 2e-07,
"loss": 0.061132901906967164,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.27500000670552255,
"reward_std": 0.40261563658714294,
"rewards/MultiModalAccuracyORM": 0.27500000670552255,
"step": 1880,
"train_speed(iter/s)": 0.024247
},
{
"clip_ratio": 0.0,
"completion_length": 396.45,
"epoch": 0.7616161616161616,
"grad_norm": 2.730755662335053,
"kl": 0.026220703125,
"learning_rate": 2e-07,
"loss": 0.036236304044723514,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1666666731238365,
"reward_std": 0.3101543754339218,
"rewards/MultiModalAccuracyORM": 0.1666666731238365,
"step": 1885,
"train_speed(iter/s)": 0.024279
},
{
"clip_ratio": 0.0,
"completion_length": 275.0,
"epoch": 0.7636363636363637,
"grad_norm": 1.777471986992103,
"kl": 0.025811767578125,
"learning_rate": 2e-07,
"loss": 0.010323920845985412,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666701436043,
"reward_std": 0.2506715327501297,
"rewards/MultiModalAccuracyORM": 0.3916666701436043,
"step": 1890,
"train_speed(iter/s)": 0.024315
},
{
"clip_ratio": 0.0,
"completion_length": 428.5,
"epoch": 0.7656565656565657,
"grad_norm": 0.13037300867268706,
"kl": 0.030450439453125,
"learning_rate": 2e-07,
"loss": 0.0042250391095876695,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35000001043081286,
"reward_std": 0.3182337760925293,
"rewards/MultiModalAccuracyORM": 0.35000001043081286,
"step": 1895,
"train_speed(iter/s)": 0.024345
},
{
"clip_ratio": 0.0,
"completion_length": 329.7,
"epoch": 0.7676767676767676,
"grad_norm": 1.7511437916198835,
"kl": 0.016363525390625,
"learning_rate": 2e-07,
"loss": 0.006176537275314331,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.44166667461395265,
"reward_std": 0.2988493382930756,
"rewards/MultiModalAccuracyORM": 0.44166667461395265,
"step": 1900,
"train_speed(iter/s)": 0.024374
},
{
"clip_ratio": 0.0,
"completion_length": 262.25,
"epoch": 0.7696969696969697,
"grad_norm": 2.6784748457723992,
"kl": 0.026043701171875,
"learning_rate": 2e-07,
"loss": -0.0650195300579071,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4416666753590107,
"reward_std": 0.4098664551973343,
"rewards/MultiModalAccuracyORM": 0.4416666753590107,
"step": 1905,
"train_speed(iter/s)": 0.024412
},
{
"clip_ratio": 0.0,
"completion_length": 374.75,
"epoch": 0.7717171717171717,
"grad_norm": 2.0646305839430648,
"kl": 0.027471923828125,
"learning_rate": 2e-07,
"loss": 0.023633481562137605,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30833334401249884,
"reward_std": 0.375223833322525,
"rewards/MultiModalAccuracyORM": 0.30833334401249884,
"step": 1910,
"train_speed(iter/s)": 0.024446
},
{
"clip_ratio": 0.0,
"completion_length": 312.0,
"epoch": 0.7737373737373737,
"grad_norm": 1.9430903927913294,
"kl": 0.018585205078125,
"learning_rate": 2e-07,
"loss": -0.023164969682693482,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.3330695480108261,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 1915,
"train_speed(iter/s)": 0.024483
},
{
"clip_ratio": 0.0,
"completion_length": 414.55,
"epoch": 0.7757575757575758,
"grad_norm": 1.2487710271189274,
"kl": 0.0145263671875,
"learning_rate": 2e-07,
"loss": 0.014984607696533203,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.38333334028720856,
"reward_std": 0.2784802496433258,
"rewards/MultiModalAccuracyORM": 0.38333334028720856,
"step": 1920,
"train_speed(iter/s)": 0.024514
},
{
"clip_ratio": 0.0,
"completion_length": 301.65,
"epoch": 0.7777777777777778,
"grad_norm": 3.397172729657377,
"kl": 0.025823974609375,
"learning_rate": 2e-07,
"loss": 0.010728538036346436,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.45000001341104506,
"reward_std": 0.36237767040729524,
"rewards/MultiModalAccuracyORM": 0.45000001341104506,
"step": 1925,
"train_speed(iter/s)": 0.024547
},
{
"clip_ratio": 0.0,
"completion_length": 347.9,
"epoch": 0.7797979797979798,
"grad_norm": 2.445242624274772,
"kl": 0.02085418701171875,
"learning_rate": 2e-07,
"loss": 0.0506191611289978,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.366666679084301,
"reward_std": 0.3425410449504852,
"rewards/MultiModalAccuracyORM": 0.366666679084301,
"step": 1930,
"train_speed(iter/s)": 0.024581
},
{
"clip_ratio": 0.0,
"completion_length": 414.95,
"epoch": 0.7818181818181819,
"grad_norm": 2.2267041732312953,
"kl": 0.0191650390625,
"learning_rate": 2e-07,
"loss": 0.07460187673568726,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1750000037252903,
"reward_std": 0.27998208105564115,
"rewards/MultiModalAccuracyORM": 0.1750000037252903,
"step": 1935,
"train_speed(iter/s)": 0.024609
},
{
"clip_ratio": 0.0,
"completion_length": 358.15,
"epoch": 0.7838383838383839,
"grad_norm": 0.08307319969608204,
"kl": 0.01834716796875,
"learning_rate": 2e-07,
"loss": 0.01801389306783676,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334028720856,
"reward_std": 0.2292436480522156,
"rewards/MultiModalAccuracyORM": 0.23333334028720856,
"step": 1940,
"train_speed(iter/s)": 0.024633
},
{
"clip_ratio": 0.0,
"completion_length": 351.25,
"epoch": 0.7858585858585858,
"grad_norm": 2.4956737169852876,
"kl": 0.0243896484375,
"learning_rate": 2e-07,
"loss": 0.02604297399520874,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4416666768491268,
"reward_std": 0.23860624432563782,
"rewards/MultiModalAccuracyORM": 0.4416666768491268,
"step": 1945,
"train_speed(iter/s)": 0.024666
},
{
"clip_ratio": 0.0,
"completion_length": 365.6,
"epoch": 0.7878787878787878,
"grad_norm": 1.412421381873315,
"kl": 0.03074951171875,
"learning_rate": 2e-07,
"loss": -0.008066686987876891,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.3619014710187912,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 1950,
"train_speed(iter/s)": 0.0247
},
{
"clip_ratio": 0.0,
"completion_length": 316.4,
"epoch": 0.7898989898989899,
"grad_norm": 2.581974028906461,
"kl": 0.0264404296875,
"learning_rate": 2e-07,
"loss": 0.0021781913936138155,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.6333333432674408,
"reward_std": 0.34636789858341216,
"rewards/MultiModalAccuracyORM": 0.6333333432674408,
"step": 1955,
"train_speed(iter/s)": 0.024735
},
{
"clip_ratio": 0.0,
"completion_length": 330.8,
"epoch": 0.7919191919191919,
"grad_norm": 2.7977079078012546,
"kl": 0.030804443359375,
"learning_rate": 2e-07,
"loss": 0.028843042254447938,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333333879709245,
"reward_std": 0.18488111793994905,
"rewards/MultiModalAccuracyORM": 0.23333333879709245,
"step": 1960,
"train_speed(iter/s)": 0.02477
},
{
"clip_ratio": 0.0,
"completion_length": 325.4,
"epoch": 0.793939393939394,
"grad_norm": 2.3766146998216606,
"kl": 0.029986572265625,
"learning_rate": 2e-07,
"loss": 0.01644158363342285,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5250000111758709,
"reward_std": 0.3782962501049042,
"rewards/MultiModalAccuracyORM": 0.5250000111758709,
"step": 1965,
"train_speed(iter/s)": 0.024803
},
{
"clip_ratio": 0.0,
"completion_length": 413.2,
"epoch": 0.795959595959596,
"grad_norm": 1.6454459000922825,
"kl": 0.03331298828125,
"learning_rate": 2e-07,
"loss": 0.04098441600799561,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666805744171,
"reward_std": 0.31651573479175565,
"rewards/MultiModalAccuracyORM": 0.3916666805744171,
"step": 1970,
"train_speed(iter/s)": 0.024833
},
{
"clip_ratio": 0.0,
"completion_length": 279.5,
"epoch": 0.797979797979798,
"grad_norm": 2.676941712540541,
"kl": 0.033160400390625,
"learning_rate": 2e-07,
"loss": -0.06822603344917297,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333395421505,
"reward_std": 0.26591232419013977,
"rewards/MultiModalAccuracyORM": 0.4083333395421505,
"step": 1975,
"train_speed(iter/s)": 0.024862
},
{
"clip_ratio": 0.0,
"completion_length": 338.05,
"epoch": 0.8,
"grad_norm": 2.6654647292288565,
"kl": 0.03338623046875,
"learning_rate": 2e-07,
"loss": 0.018979550898075105,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333410322666,
"reward_std": 0.2988493382930756,
"rewards/MultiModalAccuracyORM": 0.3083333410322666,
"step": 1980,
"train_speed(iter/s)": 0.024892
},
{
"clip_ratio": 0.0,
"completion_length": 408.9,
"epoch": 0.802020202020202,
"grad_norm": 1.2773941729876779,
"kl": 0.02757568359375,
"learning_rate": 2e-07,
"loss": 0.0032975614070892335,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3666666716337204,
"reward_std": 0.21999078691005708,
"rewards/MultiModalAccuracyORM": 0.3666666716337204,
"step": 1985,
"train_speed(iter/s)": 0.024918
},
{
"clip_ratio": 0.0,
"completion_length": 318.45,
"epoch": 0.804040404040404,
"grad_norm": 3.249804741680811,
"kl": 0.0233734130859375,
"learning_rate": 2e-07,
"loss": -0.0009274959564208984,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.25000000447034837,
"reward_std": 0.3111390322446823,
"rewards/MultiModalAccuracyORM": 0.25000000447034837,
"step": 1990,
"train_speed(iter/s)": 0.024952
},
{
"clip_ratio": 0.0,
"completion_length": 356.3,
"epoch": 0.806060606060606,
"grad_norm": 1.6358353140611315,
"kl": 0.02435302734375,
"learning_rate": 2e-07,
"loss": 0.01845797598361969,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3083333380520344,
"reward_std": 0.31345489621162415,
"rewards/MultiModalAccuracyORM": 0.3083333380520344,
"step": 1995,
"train_speed(iter/s)": 0.024983
},
{
"epoch": 0.8080808080808081,
"grad_norm": 2.5769756858186366,
"learning_rate": 2e-07,
"loss": -0.03718583881855011,
"memory(GiB)": 87.45,
"step": 2000,
"train_speed(iter/s)": 0.025016
},
{
"epoch": 0.8080808080808081,
"eval_clip_ratio": 0.0,
"eval_completion_length": 323.9533418273926,
"eval_kl": 0.0281341552734375,
"eval_loss": 0.006039996165782213,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.318333340883255,
"eval_reward_std": 0.32694393634796143,
"eval_rewards/MultiModalAccuracyORM": 0.318333340883255,
"eval_runtime": 462.0456,
"eval_samples_per_second": 0.108,
"eval_steps_per_second": 0.011,
"step": 2000
},
{
"clip_ratio": 0.0,
"completion_length": 325.125,
"epoch": 0.8101010101010101,
"grad_norm": 1.7033276169087128,
"kl": 0.02674102783203125,
"learning_rate": 2e-07,
"loss": 0.03609513640403748,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23750001043081284,
"reward_std": 0.24687736183404924,
"rewards/MultiModalAccuracyORM": 0.23750001043081284,
"step": 2005,
"train_speed(iter/s)": 0.024793
},
{
"clip_ratio": 0.0,
"completion_length": 325.55,
"epoch": 0.8121212121212121,
"grad_norm": 1.77522203951707,
"kl": 0.0292724609375,
"learning_rate": 2e-07,
"loss": 0.01515505015850067,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000059604645,
"reward_std": 0.38405978083610537,
"rewards/MultiModalAccuracyORM": 0.3500000059604645,
"step": 2010,
"train_speed(iter/s)": 0.024823
},
{
"clip_ratio": 0.0,
"completion_length": 288.5,
"epoch": 0.8141414141414142,
"grad_norm": 2.047124696336966,
"kl": 0.02886962890625,
"learning_rate": 2e-07,
"loss": -0.056891226768493654,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.350000012665987,
"reward_std": 0.3127244710922241,
"rewards/MultiModalAccuracyORM": 0.350000012665987,
"step": 2015,
"train_speed(iter/s)": 0.024857
},
{
"clip_ratio": 0.0,
"completion_length": 273.0,
"epoch": 0.8161616161616162,
"grad_norm": 2.933718360724764,
"kl": 0.0226837158203125,
"learning_rate": 2e-07,
"loss": 0.04815356135368347,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666753590107,
"reward_std": 0.3597048044204712,
"rewards/MultiModalAccuracyORM": 0.3916666753590107,
"step": 2020,
"train_speed(iter/s)": 0.024891
},
{
"clip_ratio": 0.0,
"completion_length": 334.85,
"epoch": 0.8181818181818182,
"grad_norm": 2.3099689560601595,
"kl": 0.015521240234375,
"learning_rate": 2e-07,
"loss": 0.00659940093755722,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4250000067055225,
"reward_std": 0.2574163258075714,
"rewards/MultiModalAccuracyORM": 0.4250000067055225,
"step": 2025,
"train_speed(iter/s)": 0.024922
},
{
"clip_ratio": 0.0,
"completion_length": 406.5,
"epoch": 0.8202020202020202,
"grad_norm": 2.5439305675732165,
"kl": 0.019085693359375,
"learning_rate": 2e-07,
"loss": 0.0326183021068573,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3750000074505806,
"reward_std": 0.31040860116481783,
"rewards/MultiModalAccuracyORM": 0.3750000074505806,
"step": 2030,
"train_speed(iter/s)": 0.024949
},
{
"clip_ratio": 0.0,
"completion_length": 431.5,
"epoch": 0.8222222222222222,
"grad_norm": 3.2829060035742557,
"kl": 0.023626708984375,
"learning_rate": 2e-07,
"loss": 0.015071746706962586,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666693985462,
"reward_std": 0.205923455953598,
"rewards/MultiModalAccuracyORM": 0.2666666693985462,
"step": 2035,
"train_speed(iter/s)": 0.024973
},
{
"clip_ratio": 0.0,
"completion_length": 525.55,
"epoch": 0.8242424242424242,
"grad_norm": 2.658698100364113,
"kl": 0.0230712890625,
"learning_rate": 2e-07,
"loss": 0.013616405427455902,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333333879709244,
"reward_std": 0.34936913549900056,
"rewards/MultiModalAccuracyORM": 0.28333333879709244,
"step": 2040,
"train_speed(iter/s)": 0.024998
},
{
"clip_ratio": 0.0,
"completion_length": 238.15,
"epoch": 0.8262626262626263,
"grad_norm": 2.342715529046246,
"kl": 0.029901123046875,
"learning_rate": 2e-07,
"loss": 0.037117105722427365,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.36670139729976653,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 2045,
"train_speed(iter/s)": 0.025033
},
{
"clip_ratio": 0.0,
"completion_length": 334.45,
"epoch": 0.8282828282828283,
"grad_norm": 0.9452733042514408,
"kl": 0.025860595703125,
"learning_rate": 2e-07,
"loss": 0.03209388256072998,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000074505805,
"reward_std": 0.26750934720039365,
"rewards/MultiModalAccuracyORM": 0.22500000074505805,
"step": 2050,
"train_speed(iter/s)": 0.025068
},
{
"clip_ratio": 0.0,
"completion_length": 361.15,
"epoch": 0.8303030303030303,
"grad_norm": 2.136815117405037,
"kl": 0.0298553466796875,
"learning_rate": 2e-07,
"loss": 0.04463410079479217,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3416666753590107,
"reward_std": 0.4307381808757782,
"rewards/MultiModalAccuracyORM": 0.3416666753590107,
"step": 2055,
"train_speed(iter/s)": 0.025094
},
{
"clip_ratio": 0.0,
"completion_length": 437.85,
"epoch": 0.8323232323232324,
"grad_norm": 1.7941689466428354,
"kl": 0.018414306640625,
"learning_rate": 2e-07,
"loss": -0.013085222244262696,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333333432674406,
"reward_std": 0.27756677865982055,
"rewards/MultiModalAccuracyORM": 0.33333333432674406,
"step": 2060,
"train_speed(iter/s)": 0.025121
},
{
"clip_ratio": 0.0,
"completion_length": 373.6,
"epoch": 0.8343434343434344,
"grad_norm": 2.741809894885581,
"kl": 0.0217803955078125,
"learning_rate": 2e-07,
"loss": 0.032400667667388916,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333395421505,
"reward_std": 0.3207202583551407,
"rewards/MultiModalAccuracyORM": 0.2083333395421505,
"step": 2065,
"train_speed(iter/s)": 0.025146
},
{
"clip_ratio": 0.0,
"completion_length": 380.7,
"epoch": 0.8363636363636363,
"grad_norm": 1.5317649365927353,
"kl": 0.02591552734375,
"learning_rate": 2e-07,
"loss": 0.026116135716438293,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2916666746139526,
"reward_std": 0.3315081149339676,
"rewards/MultiModalAccuracyORM": 0.2916666746139526,
"step": 2070,
"train_speed(iter/s)": 0.02517
},
{
"clip_ratio": 0.0,
"completion_length": 298.3,
"epoch": 0.8383838383838383,
"grad_norm": 2.2493040161672164,
"kl": 0.023297119140625,
"learning_rate": 2e-07,
"loss": 0.011263298988342284,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3666666723787785,
"reward_std": 0.27122943103313446,
"rewards/MultiModalAccuracyORM": 0.3666666723787785,
"step": 2075,
"train_speed(iter/s)": 0.025195
},
{
"clip_ratio": 0.0,
"completion_length": 327.35,
"epoch": 0.8404040404040404,
"grad_norm": 1.6803752878651963,
"kl": 0.05001220703125,
"learning_rate": 2e-07,
"loss": 0.021441753208637237,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3833333387970924,
"reward_std": 0.3531844109296799,
"rewards/MultiModalAccuracyORM": 0.3833333387970924,
"step": 2080,
"train_speed(iter/s)": 0.025225
},
{
"clip_ratio": 0.0,
"completion_length": 347.15,
"epoch": 0.8424242424242424,
"grad_norm": 1.980173450589181,
"kl": 0.0163818359375,
"learning_rate": 2e-07,
"loss": 0.013161852955818176,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667088866234,
"reward_std": 0.22625695466995238,
"rewards/MultiModalAccuracyORM": 0.21666667088866234,
"step": 2085,
"train_speed(iter/s)": 0.025254
},
{
"clip_ratio": 0.0,
"completion_length": 367.1,
"epoch": 0.8444444444444444,
"grad_norm": 1.0010632093343366,
"kl": 0.017938232421875,
"learning_rate": 2e-07,
"loss": -0.0012541890144348144,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15000000223517418,
"reward_std": 0.2916341096162796,
"rewards/MultiModalAccuracyORM": 0.15000000223517418,
"step": 2090,
"train_speed(iter/s)": 0.025273
},
{
"clip_ratio": 0.0,
"completion_length": 326.0,
"epoch": 0.8464646464646465,
"grad_norm": 1.8276205217385537,
"kl": 0.0211029052734375,
"learning_rate": 2e-07,
"loss": 0.018240103125572206,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2916666708886623,
"reward_std": 0.35748412609100344,
"rewards/MultiModalAccuracyORM": 0.2916666708886623,
"step": 2095,
"train_speed(iter/s)": 0.0253
},
{
"clip_ratio": 0.0,
"completion_length": 351.3,
"epoch": 0.8484848484848485,
"grad_norm": 2.25183174936328,
"kl": 0.0171142578125,
"learning_rate": 2e-07,
"loss": -0.0015764832496643066,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666775941849,
"reward_std": 0.2782260239124298,
"rewards/MultiModalAccuracyORM": 0.3916666775941849,
"step": 2100,
"train_speed(iter/s)": 0.02533
},
{
"clip_ratio": 0.0,
"completion_length": 411.65,
"epoch": 0.8505050505050505,
"grad_norm": 2.301476369720727,
"kl": 0.02381591796875,
"learning_rate": 2e-07,
"loss": 0.02723083198070526,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.2500000037252903,
"reward_std": 0.3780420243740082,
"rewards/MultiModalAccuracyORM": 0.2500000037252903,
"step": 2105,
"train_speed(iter/s)": 0.025351
},
{
"clip_ratio": 0.0,
"completion_length": 342.2,
"epoch": 0.8525252525252526,
"grad_norm": 2.2465362796243915,
"kl": 0.031561279296875,
"learning_rate": 2e-07,
"loss": -0.006004461646080017,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4500000111758709,
"reward_std": 0.386061829328537,
"rewards/MultiModalAccuracyORM": 0.4500000111758709,
"step": 2110,
"train_speed(iter/s)": 0.025381
},
{
"clip_ratio": 0.0,
"completion_length": 430.45,
"epoch": 0.8545454545454545,
"grad_norm": 0.034882262330713364,
"kl": 0.01632537841796875,
"learning_rate": 2e-07,
"loss": 0.07573002576828003,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2083333358168602,
"reward_std": 0.3058815211057663,
"rewards/MultiModalAccuracyORM": 0.2083333358168602,
"step": 2115,
"train_speed(iter/s)": 0.025404
},
{
"clip_ratio": 0.0,
"completion_length": 278.3,
"epoch": 0.8565656565656565,
"grad_norm": 1.8179385747560524,
"kl": 0.01519775390625,
"learning_rate": 2e-07,
"loss": 0.046589908003807065,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000596046447,
"reward_std": 0.28446818590164186,
"rewards/MultiModalAccuracyORM": 0.20000000596046447,
"step": 2120,
"train_speed(iter/s)": 0.025437
},
{
"clip_ratio": 0.0,
"completion_length": 303.8,
"epoch": 0.8585858585858586,
"grad_norm": 1.842386637827148,
"kl": 0.023931884765625,
"learning_rate": 2e-07,
"loss": 0.0047568708658218386,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2500000141561031,
"reward_std": 0.32924269437789916,
"rewards/MultiModalAccuracyORM": 0.2500000141561031,
"step": 2125,
"train_speed(iter/s)": 0.025467
},
{
"clip_ratio": 0.0,
"completion_length": 412.4,
"epoch": 0.8606060606060606,
"grad_norm": 3.12980971819249,
"kl": 0.0230224609375,
"learning_rate": 2e-07,
"loss": 0.012965646386146546,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.3833333387970924,
"reward_std": 0.3985911935567856,
"rewards/MultiModalAccuracyORM": 0.3833333387970924,
"step": 2130,
"train_speed(iter/s)": 0.025486
},
{
"clip_ratio": 0.0,
"completion_length": 322.65,
"epoch": 0.8626262626262626,
"grad_norm": 0.9262722343921138,
"kl": 0.018023681640625,
"learning_rate": 2e-07,
"loss": 0.0012422390282154083,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000149011613,
"reward_std": 0.1808116167783737,
"rewards/MultiModalAccuracyORM": 0.17500000149011613,
"step": 2135,
"train_speed(iter/s)": 0.025513
},
{
"clip_ratio": 0.0,
"completion_length": 335.7,
"epoch": 0.8646464646464647,
"grad_norm": 1.0357905764180717,
"kl": 0.01571044921875,
"learning_rate": 2e-07,
"loss": 0.0018387317657470703,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667014360426,
"reward_std": 0.25490583181381227,
"rewards/MultiModalAccuracyORM": 0.29166667014360426,
"step": 2140,
"train_speed(iter/s)": 0.025545
},
{
"clip_ratio": 0.0,
"completion_length": 285.1,
"epoch": 0.8666666666666667,
"grad_norm": 2.379354282182724,
"kl": 0.019244384765625,
"learning_rate": 2e-07,
"loss": 0.028354501724243163,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000111758709,
"reward_std": 0.2963056802749634,
"rewards/MultiModalAccuracyORM": 0.3000000111758709,
"step": 2145,
"train_speed(iter/s)": 0.025579
},
{
"clip_ratio": 0.0,
"completion_length": 366.1,
"epoch": 0.8686868686868687,
"grad_norm": 1.257926920186221,
"kl": 0.0236419677734375,
"learning_rate": 2e-07,
"loss": 0.05731485486030578,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4500000074505806,
"reward_std": 0.24860407412052155,
"rewards/MultiModalAccuracyORM": 0.4500000074505806,
"step": 2150,
"train_speed(iter/s)": 0.025607
},
{
"clip_ratio": 0.0,
"completion_length": 370.65,
"epoch": 0.8707070707070707,
"grad_norm": 0.4145211028011141,
"kl": 0.035430908203125,
"learning_rate": 2e-07,
"loss": -0.008838014304637909,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.0416666679084301,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.0416666679084301,
"step": 2155,
"train_speed(iter/s)": 0.025637
},
{
"clip_ratio": 0.0,
"completion_length": 481.6,
"epoch": 0.8727272727272727,
"grad_norm": 3.5679392309928852,
"kl": 0.020635986328125,
"learning_rate": 2e-07,
"loss": -0.04596620798110962,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.21666667237877846,
"reward_std": 0.3494287371635437,
"rewards/MultiModalAccuracyORM": 0.21666667237877846,
"step": 2160,
"train_speed(iter/s)": 0.02566
},
{
"clip_ratio": 0.0,
"completion_length": 312.8,
"epoch": 0.8747474747474747,
"grad_norm": 2.915431806582569,
"kl": 0.03173828125,
"learning_rate": 2e-07,
"loss": 0.03424719870090485,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.45000000670552254,
"reward_std": 0.3579271614551544,
"rewards/MultiModalAccuracyORM": 0.45000000670552254,
"step": 2165,
"train_speed(iter/s)": 0.025692
},
{
"clip_ratio": 0.0,
"completion_length": 347.9,
"epoch": 0.8767676767676768,
"grad_norm": 1.2438809288674397,
"kl": 0.02581787109375,
"learning_rate": 2e-07,
"loss": 0.022351789474487304,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2250000022351742,
"reward_std": 0.2556006669998169,
"rewards/MultiModalAccuracyORM": 0.2250000022351742,
"step": 2170,
"train_speed(iter/s)": 0.025718
},
{
"clip_ratio": 0.0,
"completion_length": 338.35,
"epoch": 0.8787878787878788,
"grad_norm": 0.08213166464110444,
"kl": 0.0291015625,
"learning_rate": 2e-07,
"loss": -0.04905802011489868,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334028720856,
"reward_std": 0.3343147337436676,
"rewards/MultiModalAccuracyORM": 0.23333334028720856,
"step": 2175,
"train_speed(iter/s)": 0.025744
},
{
"clip_ratio": 0.0,
"completion_length": 386.8,
"epoch": 0.8808080808080808,
"grad_norm": 1.2558474815848573,
"kl": 0.0392333984375,
"learning_rate": 2e-07,
"loss": 0.03639570772647858,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000052154064,
"reward_std": 0.40410049855709074,
"rewards/MultiModalAccuracyORM": 0.3500000052154064,
"step": 2180,
"train_speed(iter/s)": 0.025763
},
{
"clip_ratio": 0.0,
"completion_length": 295.25,
"epoch": 0.8828282828282829,
"grad_norm": 2.2083604873690255,
"kl": 0.02174072265625,
"learning_rate": 2e-07,
"loss": 0.04861523509025574,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.4242177873849869,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 2185,
"train_speed(iter/s)": 0.025791
},
{
"clip_ratio": 0.0,
"completion_length": 421.4,
"epoch": 0.8848484848484849,
"grad_norm": 1.9173115593509535,
"kl": 0.02357177734375,
"learning_rate": 2e-07,
"loss": 0.013380092382431031,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000074505806,
"reward_std": 0.311967608332634,
"rewards/MultiModalAccuracyORM": 0.3000000074505806,
"step": 2190,
"train_speed(iter/s)": 0.025813
},
{
"clip_ratio": 0.0,
"completion_length": 363.5,
"epoch": 0.8868686868686869,
"grad_norm": 1.3588226440942046,
"kl": 0.025439453125,
"learning_rate": 2e-07,
"loss": 0.011188817024230958,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.18326250910758973,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 2195,
"train_speed(iter/s)": 0.025832
},
{
"clip_ratio": 0.0,
"completion_length": 324.1,
"epoch": 0.8888888888888888,
"grad_norm": 1.8037621160022852,
"kl": 0.034747314453125,
"learning_rate": 2e-07,
"loss": 0.04917380511760712,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333333879709245,
"reward_std": 0.3719944924116135,
"rewards/MultiModalAccuracyORM": 0.23333333879709245,
"step": 2200,
"train_speed(iter/s)": 0.025862
},
{
"clip_ratio": 0.0,
"completion_length": 482.15,
"epoch": 0.8909090909090909,
"grad_norm": 2.141711868124079,
"kl": 0.0226776123046875,
"learning_rate": 2e-07,
"loss": -0.018071025609970093,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.11666666939854622,
"reward_std": 0.23030244410037995,
"rewards/MultiModalAccuracyORM": 0.11666666939854622,
"step": 2205,
"train_speed(iter/s)": 0.025882
},
{
"clip_ratio": 0.0,
"completion_length": 501.4,
"epoch": 0.8929292929292929,
"grad_norm": 1.4394465065225663,
"kl": 0.03128662109375,
"learning_rate": 2e-07,
"loss": 0.019231194257736207,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1833333380520344,
"reward_std": 0.31740519404411316,
"rewards/MultiModalAccuracyORM": 0.1833333380520344,
"step": 2210,
"train_speed(iter/s)": 0.025901
},
{
"clip_ratio": 0.0,
"completion_length": 268.9,
"epoch": 0.8949494949494949,
"grad_norm": 1.8778711843519251,
"kl": 0.03623046875,
"learning_rate": 2e-07,
"loss": 0.042392924427986145,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3000000104308128,
"reward_std": 0.24866368174552916,
"rewards/MultiModalAccuracyORM": 0.3000000104308128,
"step": 2215,
"train_speed(iter/s)": 0.025928
},
{
"clip_ratio": 0.0,
"completion_length": 330.6,
"epoch": 0.896969696969697,
"grad_norm": 2.783501622971831,
"kl": 0.02158203125,
"learning_rate": 2e-07,
"loss": -0.009627214074134827,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000074505806,
"reward_std": 0.30665292739868166,
"rewards/MultiModalAccuracyORM": 0.3250000074505806,
"step": 2220,
"train_speed(iter/s)": 0.025959
},
{
"clip_ratio": 0.0,
"completion_length": 304.35,
"epoch": 0.898989898989899,
"grad_norm": 64.84162647185127,
"kl": 0.042742919921875,
"learning_rate": 2e-07,
"loss": 0.027672123908996583,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1250000014901161,
"reward_std": 0.24265173375606536,
"rewards/MultiModalAccuracyORM": 0.1250000014901161,
"step": 2225,
"train_speed(iter/s)": 0.025989
},
{
"clip_ratio": 0.0,
"completion_length": 285.25,
"epoch": 0.901010101010101,
"grad_norm": 2.756817795935333,
"kl": 0.027203369140625,
"learning_rate": 2e-07,
"loss": -0.0488799124956131,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.20000000149011612,
"reward_std": 0.2922547996044159,
"rewards/MultiModalAccuracyORM": 0.20000000149011612,
"step": 2230,
"train_speed(iter/s)": 0.026019
},
{
"clip_ratio": 0.0,
"completion_length": 331.1,
"epoch": 0.9030303030303031,
"grad_norm": 3.484265646880912,
"kl": 0.0185455322265625,
"learning_rate": 2e-07,
"loss": -0.006375116109848022,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333447575569,
"reward_std": 0.2692514002323151,
"rewards/MultiModalAccuracyORM": 0.4083333447575569,
"step": 2235,
"train_speed(iter/s)": 0.026045
},
{
"clip_ratio": 0.0,
"completion_length": 310.6,
"epoch": 0.9050505050505051,
"grad_norm": 0.08112989718996635,
"kl": 0.026385498046875,
"learning_rate": 2e-07,
"loss": 0.07493855953216552,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.22500000670552253,
"reward_std": 0.2915985196828842,
"rewards/MultiModalAccuracyORM": 0.22500000670552253,
"step": 2240,
"train_speed(iter/s)": 0.026071
},
{
"clip_ratio": 0.0,
"completion_length": 383.5,
"epoch": 0.907070707070707,
"grad_norm": 2.1571772688182276,
"kl": 0.02109375,
"learning_rate": 2e-07,
"loss": -0.008470755815505982,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333656191825,
"reward_std": 0.1808116227388382,
"rewards/MultiModalAccuracyORM": 0.15833333656191825,
"step": 2245,
"train_speed(iter/s)": 0.026093
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.4521268907747036,
"learning_rate": 2e-07,
"loss": 0.02900133728981018,
"memory(GiB)": 87.45,
"step": 2250,
"train_speed(iter/s)": 0.026122
},
{
"epoch": 0.9090909090909091,
"eval_clip_ratio": 0.0,
"eval_completion_length": 326.39667755126953,
"eval_kl": 0.0267205810546875,
"eval_loss": 0.02248476631939411,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.3383333416283131,
"eval_reward_std": 0.30222029507160186,
"eval_rewards/MultiModalAccuracyORM": 0.3383333416283131,
"eval_runtime": 479.1069,
"eval_samples_per_second": 0.104,
"eval_steps_per_second": 0.01,
"step": 2250
},
{
"clip_ratio": 0.0,
"completion_length": 293.825,
"epoch": 0.9111111111111111,
"grad_norm": 2.997368813220566,
"kl": 0.02721710205078125,
"learning_rate": 2e-07,
"loss": 0.003950953483581543,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4541666753590107,
"reward_std": 0.3525440260767937,
"rewards/MultiModalAccuracyORM": 0.4541666753590107,
"step": 2255,
"train_speed(iter/s)": 0.025886
},
{
"clip_ratio": 0.0,
"completion_length": 221.3,
"epoch": 0.9131313131313131,
"grad_norm": 3.095107484502175,
"kl": 0.0549560546875,
"learning_rate": 2e-07,
"loss": 0.006377041339874268,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4750000052154064,
"reward_std": 0.30114119648933413,
"rewards/MultiModalAccuracyORM": 0.4750000052154064,
"step": 2260,
"train_speed(iter/s)": 0.025918
},
{
"clip_ratio": 0.0,
"completion_length": 326.8,
"epoch": 0.9151515151515152,
"grad_norm": 2.764452940040707,
"kl": 0.025128173828125,
"learning_rate": 2e-07,
"loss": -0.060949933528900144,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2833333402872086,
"reward_std": 0.3563301384449005,
"rewards/MultiModalAccuracyORM": 0.2833333402872086,
"step": 2265,
"train_speed(iter/s)": 0.025947
},
{
"clip_ratio": 0.0,
"completion_length": 308.85,
"epoch": 0.9171717171717172,
"grad_norm": 1.6613189303519411,
"kl": 0.0338897705078125,
"learning_rate": 2e-07,
"loss": 0.030397918820381165,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334177732468,
"reward_std": 0.21600489914417267,
"rewards/MultiModalAccuracyORM": 0.23333334177732468,
"step": 2270,
"train_speed(iter/s)": 0.025974
},
{
"clip_ratio": 0.0,
"completion_length": 268.75,
"epoch": 0.9191919191919192,
"grad_norm": 2.4104355223612903,
"kl": 0.043646240234375,
"learning_rate": 2e-07,
"loss": 0.02471620440483093,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667088866236,
"reward_std": 0.22880061268806456,
"rewards/MultiModalAccuracyORM": 0.39166667088866236,
"step": 2275,
"train_speed(iter/s)": 0.026005
},
{
"clip_ratio": 0.0,
"completion_length": 223.65,
"epoch": 0.9212121212121213,
"grad_norm": 0.9890862252945101,
"kl": 0.0232696533203125,
"learning_rate": 2e-07,
"loss": -0.019132834672927857,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.43333334773778914,
"reward_std": 0.28934226334095003,
"rewards/MultiModalAccuracyORM": 0.43333334773778914,
"step": 2280,
"train_speed(iter/s)": 0.026037
},
{
"clip_ratio": 0.0,
"completion_length": 290.95,
"epoch": 0.9232323232323232,
"grad_norm": 2.8529813646862565,
"kl": 0.016925048828125,
"learning_rate": 2e-07,
"loss": 0.02090049088001251,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.33333334401249887,
"reward_std": 0.25286819934844973,
"rewards/MultiModalAccuracyORM": 0.33333334401249887,
"step": 2285,
"train_speed(iter/s)": 0.026068
},
{
"clip_ratio": 0.0,
"completion_length": 351.05,
"epoch": 0.9252525252525252,
"grad_norm": 1.89117356154723,
"kl": 0.0194671630859375,
"learning_rate": 2e-07,
"loss": 0.006132407486438752,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.33449481427669525,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 2290,
"train_speed(iter/s)": 0.026093
},
{
"clip_ratio": 0.0,
"completion_length": 294.9,
"epoch": 0.9272727272727272,
"grad_norm": 1.5821722224404322,
"kl": 0.0285491943359375,
"learning_rate": 2e-07,
"loss": -0.055334615707397464,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3250000089406967,
"reward_std": 0.3450992465019226,
"rewards/MultiModalAccuracyORM": 0.3250000089406967,
"step": 2295,
"train_speed(iter/s)": 0.026121
},
{
"clip_ratio": 0.0,
"completion_length": 408.45,
"epoch": 0.9292929292929293,
"grad_norm": 1.0631048809606616,
"kl": 0.0221282958984375,
"learning_rate": 2e-07,
"loss": 0.04601133763790131,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.17500000298023224,
"reward_std": 0.3211964577436447,
"rewards/MultiModalAccuracyORM": 0.17500000298023224,
"step": 2300,
"train_speed(iter/s)": 0.026144
},
{
"clip_ratio": 0.0,
"completion_length": 383.7,
"epoch": 0.9313131313131313,
"grad_norm": 2.2872062972102016,
"kl": 0.013800048828125,
"learning_rate": 2e-07,
"loss": -0.06729268431663513,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.308333345502615,
"reward_std": 0.42669269144535066,
"rewards/MultiModalAccuracyORM": 0.308333345502615,
"step": 2305,
"train_speed(iter/s)": 0.026168
},
{
"clip_ratio": 0.0,
"completion_length": 301.05,
"epoch": 0.9333333333333333,
"grad_norm": 1.5571796305098269,
"kl": 0.015960693359375,
"learning_rate": 2e-07,
"loss": 0.019453226029872893,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2750000111758709,
"reward_std": 0.2812868684530258,
"rewards/MultiModalAccuracyORM": 0.2750000111758709,
"step": 2310,
"train_speed(iter/s)": 0.02619
},
{
"clip_ratio": 0.0,
"completion_length": 354.5,
"epoch": 0.9353535353535354,
"grad_norm": 1.2789781364913986,
"kl": 0.0262939453125,
"learning_rate": 2e-07,
"loss": -0.014371034502983094,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.39166667610406875,
"reward_std": 0.35789157152175904,
"rewards/MultiModalAccuracyORM": 0.39166667610406875,
"step": 2315,
"train_speed(iter/s)": 0.026212
},
{
"clip_ratio": 0.0,
"completion_length": 315.75,
"epoch": 0.9373737373737374,
"grad_norm": 2.0043648431803742,
"kl": 0.0160247802734375,
"learning_rate": 2e-07,
"loss": 0.004941976815462113,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.2666666753590107,
"reward_std": 0.3945602476596832,
"rewards/MultiModalAccuracyORM": 0.2666666753590107,
"step": 2320,
"train_speed(iter/s)": 0.026239
},
{
"clip_ratio": 0.0,
"completion_length": 397.45,
"epoch": 0.9393939393939394,
"grad_norm": 2.434275159571036,
"kl": 0.0218505859375,
"learning_rate": 2e-07,
"loss": 0.015783283114433288,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.30000001192092896,
"reward_std": 0.44407508671283724,
"rewards/MultiModalAccuracyORM": 0.30000001192092896,
"step": 2325,
"train_speed(iter/s)": 0.026266
},
{
"clip_ratio": 0.0,
"completion_length": 341.45,
"epoch": 0.9414141414141414,
"grad_norm": 3.3518880188766262,
"kl": 0.0180023193359375,
"learning_rate": 2e-07,
"loss": 0.004853534698486328,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.37500001266598704,
"reward_std": 0.3925822228193283,
"rewards/MultiModalAccuracyORM": 0.37500001266598704,
"step": 2330,
"train_speed(iter/s)": 0.026293
},
{
"clip_ratio": 0.0,
"completion_length": 434.3,
"epoch": 0.9434343434343434,
"grad_norm": 2.162505598086888,
"kl": 0.018927001953125,
"learning_rate": 2e-07,
"loss": 0.06589244604110718,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.05,
"reward": 0.3666666693985462,
"reward_std": 0.2581467509269714,
"rewards/MultiModalAccuracyORM": 0.3666666693985462,
"step": 2335,
"train_speed(iter/s)": 0.026311
},
{
"clip_ratio": 0.0,
"completion_length": 323.1,
"epoch": 0.9454545454545454,
"grad_norm": 2.6990455984773494,
"kl": 0.0258880615234375,
"learning_rate": 2e-07,
"loss": 0.007903063297271728,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.15833333656191825,
"reward_std": 0.3127004593610764,
"rewards/MultiModalAccuracyORM": 0.15833333656191825,
"step": 2340,
"train_speed(iter/s)": 0.026336
},
{
"clip_ratio": 0.0,
"completion_length": 189.5,
"epoch": 0.9474747474747475,
"grad_norm": 31.778104916563368,
"kl": 0.046075439453125,
"learning_rate": 2e-07,
"loss": -0.046237149834632875,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.28333334624767303,
"reward_std": 0.3485885769128799,
"rewards/MultiModalAccuracyORM": 0.28333334624767303,
"step": 2345,
"train_speed(iter/s)": 0.026363
},
{
"clip_ratio": 0.0,
"completion_length": 413.05,
"epoch": 0.9494949494949495,
"grad_norm": 1.8887972983852979,
"kl": 0.0284271240234375,
"learning_rate": 2e-07,
"loss": -0.044114714860916136,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.29166667610406877,
"reward_std": 0.3408351272344589,
"rewards/MultiModalAccuracyORM": 0.29166667610406877,
"step": 2350,
"train_speed(iter/s)": 0.026385
},
{
"clip_ratio": 0.0,
"completion_length": 403.95,
"epoch": 0.9515151515151515,
"grad_norm": 2.719100446764501,
"kl": 0.0343994140625,
"learning_rate": 2e-07,
"loss": 0.030634421110153198,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.38333334028720856,
"reward_std": 0.379781112074852,
"rewards/MultiModalAccuracyORM": 0.38333334028720856,
"step": 2355,
"train_speed(iter/s)": 0.026406
},
{
"clip_ratio": 0.0,
"completion_length": 338.7,
"epoch": 0.9535353535353536,
"grad_norm": 2.4658627482626816,
"kl": 0.033807373046875,
"learning_rate": 2e-07,
"loss": 0.026800933480262756,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4916666731238365,
"reward_std": 0.2393606811761856,
"rewards/MultiModalAccuracyORM": 0.4916666731238365,
"step": 2360,
"train_speed(iter/s)": 0.026436
},
{
"clip_ratio": 0.0,
"completion_length": 360.25,
"epoch": 0.9555555555555556,
"grad_norm": 2.851734873550529,
"kl": 0.027685546875,
"learning_rate": 2e-07,
"loss": 0.013045597076416015,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3500000052154064,
"reward_std": 0.43759028911590575,
"rewards/MultiModalAccuracyORM": 0.3500000052154064,
"step": 2365,
"train_speed(iter/s)": 0.026463
},
{
"clip_ratio": 0.0,
"completion_length": 315.05,
"epoch": 0.9575757575757575,
"grad_norm": 1.448742319519302,
"kl": 0.02066650390625,
"learning_rate": 2e-07,
"loss": -0.010880425572395325,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667237877847,
"reward_std": 0.3780420243740082,
"rewards/MultiModalAccuracyORM": 0.41666667237877847,
"step": 2370,
"train_speed(iter/s)": 0.026491
},
{
"clip_ratio": 0.0,
"completion_length": 284.3,
"epoch": 0.9595959595959596,
"grad_norm": 1.7573565404253169,
"kl": 0.05279541015625,
"learning_rate": 2e-07,
"loss": -0.009101217985153199,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3916666805744171,
"reward_std": 0.32049004435539247,
"rewards/MultiModalAccuracyORM": 0.3916666805744171,
"step": 2375,
"train_speed(iter/s)": 0.026519
},
{
"clip_ratio": 0.0,
"completion_length": 387.75,
"epoch": 0.9616161616161616,
"grad_norm": 1.3965100041612641,
"kl": 0.02640380859375,
"learning_rate": 2e-07,
"loss": 0.01602880358695984,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.41666667759418485,
"reward_std": 0.3471368789672852,
"rewards/MultiModalAccuracyORM": 0.41666667759418485,
"step": 2380,
"train_speed(iter/s)": 0.026544
},
{
"clip_ratio": 0.0,
"completion_length": 308.25,
"epoch": 0.9636363636363636,
"grad_norm": 2.2883768350459732,
"kl": 0.0283721923828125,
"learning_rate": 2e-07,
"loss": -0.02478056252002716,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.18333333730697632,
"reward_std": 0.3252063632011414,
"rewards/MultiModalAccuracyORM": 0.18333333730697632,
"step": 2385,
"train_speed(iter/s)": 0.02657
},
{
"clip_ratio": 0.0,
"completion_length": 269.4,
"epoch": 0.9656565656565657,
"grad_norm": 2.3698939133503027,
"kl": 0.027130126953125,
"learning_rate": 2e-07,
"loss": 0.0352479875087738,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.316666679084301,
"reward_std": 0.2815410941839218,
"rewards/MultiModalAccuracyORM": 0.316666679084301,
"step": 2390,
"train_speed(iter/s)": 0.0266
},
{
"clip_ratio": 0.0,
"completion_length": 409.3,
"epoch": 0.9676767676767677,
"grad_norm": 2.6455515972771577,
"kl": 0.0282379150390625,
"learning_rate": 2e-07,
"loss": 0.02145477384328842,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.3416666738688946,
"reward_std": 0.43726191222667693,
"rewards/MultiModalAccuracyORM": 0.3416666738688946,
"step": 2395,
"train_speed(iter/s)": 0.026624
},
{
"clip_ratio": 0.0,
"completion_length": 321.1,
"epoch": 0.9696969696969697,
"grad_norm": 1.3800009626988052,
"kl": 0.023291015625,
"learning_rate": 2e-07,
"loss": 0.009223046898841857,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.09166666939854622,
"reward_std": 0.1850757420063019,
"rewards/MultiModalAccuracyORM": 0.09166666939854622,
"step": 2400,
"train_speed(iter/s)": 0.02665
},
{
"clip_ratio": 0.0,
"completion_length": 264.95,
"epoch": 0.9717171717171718,
"grad_norm": 2.707313244667536,
"kl": 0.0386138916015625,
"learning_rate": 2e-07,
"loss": -0.016336160898208617,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.38333333730697633,
"reward_std": 0.24860407412052155,
"rewards/MultiModalAccuracyORM": 0.38333333730697633,
"step": 2405,
"train_speed(iter/s)": 0.026681
},
{
"clip_ratio": 0.0,
"completion_length": 403.65,
"epoch": 0.9737373737373738,
"grad_norm": 2.6298064760318223,
"kl": 0.031060791015625,
"learning_rate": 2e-07,
"loss": -0.026252752542495726,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.3385047078132629,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 2410,
"train_speed(iter/s)": 0.0267
},
{
"clip_ratio": 0.0,
"completion_length": 270.65,
"epoch": 0.9757575757575757,
"grad_norm": 2.0364458058384423,
"kl": 0.018072509765625,
"learning_rate": 2e-07,
"loss": -0.022683143615722656,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.483333345502615,
"reward_std": 0.2900991141796112,
"rewards/MultiModalAccuracyORM": 0.483333345502615,
"step": 2415,
"train_speed(iter/s)": 0.026729
},
{
"clip_ratio": 0.0,
"completion_length": 279.65,
"epoch": 0.9777777777777777,
"grad_norm": 3.0539530097221843,
"kl": 0.0236175537109375,
"learning_rate": 2e-07,
"loss": -0.025200226902961732,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5166666835546494,
"reward_std": 0.3579155892133713,
"rewards/MultiModalAccuracyORM": 0.5166666835546494,
"step": 2420,
"train_speed(iter/s)": 0.026757
},
{
"clip_ratio": 0.0,
"completion_length": 319.65,
"epoch": 0.9797979797979798,
"grad_norm": 2.837404902371068,
"kl": 0.0191802978515625,
"learning_rate": 2e-07,
"loss": -0.05283277034759522,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1916666679084301,
"reward_std": 0.24939410090446473,
"rewards/MultiModalAccuracyORM": 0.1916666679084301,
"step": 2425,
"train_speed(iter/s)": 0.026783
},
{
"clip_ratio": 0.0,
"completion_length": 387.05,
"epoch": 0.9818181818181818,
"grad_norm": 1.2637214917941955,
"kl": 0.0302001953125,
"learning_rate": 2e-07,
"loss": 0.013781133294105529,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.37500001266598704,
"reward_std": 0.4204265236854553,
"rewards/MultiModalAccuracyORM": 0.37500001266598704,
"step": 2430,
"train_speed(iter/s)": 0.026802
},
{
"clip_ratio": 0.0,
"completion_length": 297.8,
"epoch": 0.9838383838383838,
"grad_norm": 0.058208298350106734,
"kl": 0.0239227294921875,
"learning_rate": 2e-07,
"loss": 0.03573224246501923,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.32500000670552254,
"reward_std": 0.26597192585468293,
"rewards/MultiModalAccuracyORM": 0.32500000670552254,
"step": 2435,
"train_speed(iter/s)": 0.02683
},
{
"clip_ratio": 0.0,
"completion_length": 346.85,
"epoch": 0.9858585858585859,
"grad_norm": 1.6302602474729853,
"kl": 0.0171844482421875,
"learning_rate": 2e-07,
"loss": -0.012005738914012909,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.1916666679084301,
"reward_std": 0.19717081785202026,
"rewards/MultiModalAccuracyORM": 0.1916666679084301,
"step": 2440,
"train_speed(iter/s)": 0.026853
},
{
"clip_ratio": 0.0,
"completion_length": 369.0,
"epoch": 0.9878787878787879,
"grad_norm": 2.5433362450025765,
"kl": 0.0248321533203125,
"learning_rate": 2e-07,
"loss": -0.030718517303466798,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.35833333879709245,
"reward_std": 0.30894235968589784,
"rewards/MultiModalAccuracyORM": 0.35833333879709245,
"step": 2445,
"train_speed(iter/s)": 0.026875
},
{
"clip_ratio": 0.0,
"completion_length": 406.35,
"epoch": 0.98989898989899,
"grad_norm": 1.0906797325242925,
"kl": 0.024761962890625,
"learning_rate": 2e-07,
"loss": -0.007297384738922119,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.23333334028720856,
"reward_std": 0.3393357157707214,
"rewards/MultiModalAccuracyORM": 0.23333334028720856,
"step": 2450,
"train_speed(iter/s)": 0.026893
},
{
"clip_ratio": 0.0,
"completion_length": 355.7,
"epoch": 0.9919191919191919,
"grad_norm": 1.8168984918524227,
"kl": 0.0161956787109375,
"learning_rate": 2e-07,
"loss": 0.03163195252418518,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.450000011920929,
"reward_std": 0.37525942325592043,
"rewards/MultiModalAccuracyORM": 0.450000011920929,
"step": 2455,
"train_speed(iter/s)": 0.026915
},
{
"clip_ratio": 0.0,
"completion_length": 340.05,
"epoch": 0.9939393939393939,
"grad_norm": 1.171315154121709,
"kl": 0.02081298828125,
"learning_rate": 2e-07,
"loss": 0.014726841449737548,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.4083333402872086,
"reward_std": 0.29634127020835876,
"rewards/MultiModalAccuracyORM": 0.4083333402872086,
"step": 2460,
"train_speed(iter/s)": 0.026936
},
{
"clip_ratio": 0.0,
"completion_length": 302.6,
"epoch": 0.9959595959595959,
"grad_norm": 0.9872275853532635,
"kl": 0.01080322265625,
"learning_rate": 2e-07,
"loss": 0.01651265621185303,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.5666666753590107,
"reward_std": 0.2488823115825653,
"rewards/MultiModalAccuracyORM": 0.5666666753590107,
"step": 2465,
"train_speed(iter/s)": 0.026959
},
{
"clip_ratio": 0.0,
"completion_length": 438.75,
"epoch": 0.997979797979798,
"grad_norm": 1.8423007639906985,
"kl": 0.0185638427734375,
"learning_rate": 2e-07,
"loss": -0.006967762112617492,
"memory(GiB)": 87.45,
"response_clip_ratio": 0.0,
"reward": 0.31666667237877844,
"reward_std": 0.21999078691005708,
"rewards/MultiModalAccuracyORM": 0.31666667237877844,
"step": 2470,
"train_speed(iter/s)": 0.02698
},
{
"epoch": 1.0,
"grad_norm": 2.4251028884123285,
"learning_rate": 2e-07,
"loss": -0.04546417593955994,
"memory(GiB)": 87.45,
"step": 2475,
"train_speed(iter/s)": 0.026999
},
{
"epoch": 1.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 364.18834014892576,
"eval_kl": 0.0238104248046875,
"eval_loss": 0.01933932490646839,
"eval_response_clip_ratio": 0.00833333358168602,
"eval_reward": 0.34333334282040595,
"eval_reward_std": 0.295663959980011,
"eval_rewards/MultiModalAccuracyORM": 0.34333334282040595,
"eval_runtime": 580.8644,
"eval_samples_per_second": 0.086,
"eval_steps_per_second": 0.009,
"step": 2475
}
],
"logging_steps": 5,
"max_steps": 2475,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}