Safetensors
llava_onevision
Game-RL-LLaVA-OV-7B / trainer_state.json
lkdhy's picture
Upload 20 files
92603ee verified
raw
history blame
253 kB
{
"best_metric": 0.4650000059604645,
"best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/llava_ov-grpo_new_v20_5k/v8-20250330-101445/checkpoint-2475",
"epoch": 1.0,
"eval_steps": 250,
"global_step": 2475,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 12.833333730697632,
"epoch": 0.00040404040404040404,
"grad_norm": 4.95974063873291,
"kl": 0.0007257461547851562,
"learning_rate": 1.6129032258064515e-09,
"loss": 0.1313462257385254,
"memory(GiB)": 103.91,
"response_clip_ratio": 0.0,
"reward": 0.125,
"reward_std": 0.22613351047039032,
"rewards/MultiModalAccuracyORM": 0.125,
"step": 1,
"train_speed(iter/s)": 0.011139
},
{
"clip_ratio": 0.0,
"completion_length": 45.927083522081375,
"epoch": 0.00202020202020202,
"grad_norm": 0.026089413091540337,
"kl": 0.00024419277906417847,
"learning_rate": 8.064516129032257e-09,
"loss": -0.0017255048733204603,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.0416666679084301,
"reward_std": 0.09731236100196838,
"rewards/MultiModalAccuracyORM": 0.0416666679084301,
"step": 5,
"train_speed(iter/s)": 0.028079
},
{
"clip_ratio": 0.0,
"completion_length": 33.066667795181274,
"epoch": 0.00404040404040404,
"grad_norm": 4.474486827850342,
"kl": 4.897117614746094e-05,
"learning_rate": 1.6129032258064514e-08,
"loss": 0.005788012593984604,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1250000014901161,
"reward_std": 0.2712650209665298,
"rewards/MultiModalAccuracyORM": 0.1250000014901161,
"step": 10,
"train_speed(iter/s)": 0.034795
},
{
"clip_ratio": 0.0,
"completion_length": 39.85000114440918,
"epoch": 0.006060606060606061,
"grad_norm": 2.904900074005127,
"kl": 0.00015695095062255858,
"learning_rate": 2.4193548387096773e-08,
"loss": 0.036757296323776244,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2416666716337204,
"reward_std": 0.29389037787914274,
"rewards/MultiModalAccuracyORM": 0.2416666716337204,
"step": 15,
"train_speed(iter/s)": 0.0376
},
{
"clip_ratio": 0.0,
"completion_length": 69.10000429153442,
"epoch": 0.00808080808080808,
"grad_norm": 1.9090512990951538,
"kl": 0.00022979974746704102,
"learning_rate": 3.225806451612903e-08,
"loss": 0.00942036360502243,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.16666667088866233,
"reward_std": 0.31846399009227755,
"rewards/MultiModalAccuracyORM": 0.16666667088866233,
"step": 20,
"train_speed(iter/s)": 0.03857
},
{
"clip_ratio": 0.0,
"completion_length": 34.64166686534882,
"epoch": 0.010101010101010102,
"grad_norm": 23.398836135864258,
"kl": 0.00027928352355957033,
"learning_rate": 4.032258064516129e-08,
"loss": -0.005109664052724838,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333410322666,
"reward_std": 0.31046820282936094,
"rewards/MultiModalAccuracyORM": 0.2083333410322666,
"step": 25,
"train_speed(iter/s)": 0.039527
},
{
"clip_ratio": 0.0,
"completion_length": 27.183334088325502,
"epoch": 0.012121212121212121,
"grad_norm": 0.027309712022542953,
"kl": 0.0002372264862060547,
"learning_rate": 4.8387096774193546e-08,
"loss": -0.016541659832000732,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.05833333432674408,
"reward_std": 0.14188667237758637,
"rewards/MultiModalAccuracyORM": 0.05833333432674408,
"step": 30,
"train_speed(iter/s)": 0.040173
},
{
"clip_ratio": 0.0,
"completion_length": 48.750002241134645,
"epoch": 0.014141414141414142,
"grad_norm": 2.6486644744873047,
"kl": 0.00022208690643310547,
"learning_rate": 5.645161290322581e-08,
"loss": 0.03488517701625824,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1416666716337204,
"reward_std": 0.19962169826030732,
"rewards/MultiModalAccuracyORM": 0.1416666716337204,
"step": 35,
"train_speed(iter/s)": 0.040888
},
{
"clip_ratio": 0.0,
"completion_length": 7.7666668176651,
"epoch": 0.01616161616161616,
"grad_norm": 13.41940689086914,
"kl": 0.00021257400512695313,
"learning_rate": 6.451612903225806e-08,
"loss": -0.0012449542991816998,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000223517418,
"reward_std": 0.2652174890041351,
"rewards/MultiModalAccuracyORM": 0.12500000223517418,
"step": 40,
"train_speed(iter/s)": 0.041651
},
{
"clip_ratio": 0.0,
"completion_length": 65.25000057220458,
"epoch": 0.01818181818181818,
"grad_norm": 11.40164852142334,
"kl": 5.4210424423217773e-05,
"learning_rate": 7.258064516129032e-08,
"loss": 0.03769001364707947,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667386889457,
"reward_std": 0.325963220000267,
"rewards/MultiModalAccuracyORM": 0.21666667386889457,
"step": 45,
"train_speed(iter/s)": 0.041539
},
{
"clip_ratio": 0.0,
"completion_length": 32.84166791439056,
"epoch": 0.020202020202020204,
"grad_norm": 0.03606203943490982,
"kl": 0.00031108856201171874,
"learning_rate": 8.064516129032257e-08,
"loss": 1.2442469596862793e-05,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.07500000074505805,
"reward_std": 0.15824586153030396,
"rewards/MultiModalAccuracyORM": 0.07500000074505805,
"step": 50,
"train_speed(iter/s)": 0.041821
},
{
"clip_ratio": 0.0,
"completion_length": 20.025000762939452,
"epoch": 0.022222222222222223,
"grad_norm": 3.2404561042785645,
"kl": 0.0004961967468261718,
"learning_rate": 8.870967741935484e-08,
"loss": 0.016841122508049013,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166666865348815,
"reward_std": 0.3241831511259079,
"rewards/MultiModalAccuracyORM": 0.24166666865348815,
"step": 55,
"train_speed(iter/s)": 0.042244
},
{
"clip_ratio": 0.0,
"completion_length": 7.516666769981384,
"epoch": 0.024242424242424242,
"grad_norm": 3.8046255111694336,
"kl": 6.520748138427735e-06,
"learning_rate": 9.677419354838709e-08,
"loss": -0.001297527551651001,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.36666668206453323,
"reward_std": 0.330777695775032,
"rewards/MultiModalAccuracyORM": 0.36666668206453323,
"step": 60,
"train_speed(iter/s)": 0.042408
},
{
"clip_ratio": 0.0,
"completion_length": 9.5333336353302,
"epoch": 0.026262626262626262,
"grad_norm": 0.015074208378791809,
"kl": 0.00015583038330078126,
"learning_rate": 1.0483870967741934e-07,
"loss": -0.018772208690643312,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333395421505,
"reward_std": 0.3019101768732071,
"rewards/MultiModalAccuracyORM": 0.2083333395421505,
"step": 65,
"train_speed(iter/s)": 0.04265
},
{
"clip_ratio": 0.0,
"completion_length": 14.125000405311585,
"epoch": 0.028282828282828285,
"grad_norm": 1.4802911281585693,
"kl": 0.0001938343048095703,
"learning_rate": 1.1290322580645162e-07,
"loss": 0.04349477887153626,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15833333805203437,
"reward_std": 0.26123160123825073,
"rewards/MultiModalAccuracyORM": 0.15833333805203437,
"step": 70,
"train_speed(iter/s)": 0.042774
},
{
"clip_ratio": 0.0,
"completion_length": 9.00833351612091,
"epoch": 0.030303030303030304,
"grad_norm": 17.15009880065918,
"kl": 0.0005457401275634766,
"learning_rate": 1.2096774193548387e-07,
"loss": -0.03085809648036957,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000000298023224,
"reward_std": 0.2855865776538849,
"rewards/MultiModalAccuracyORM": 0.20000000298023224,
"step": 75,
"train_speed(iter/s)": 0.043032
},
{
"clip_ratio": 0.0,
"completion_length": 30.941667556762695,
"epoch": 0.03232323232323232,
"grad_norm": 0.15290312469005585,
"kl": 0.0005632162094116211,
"learning_rate": 1.2903225806451611e-07,
"loss": -0.019948795437812805,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.00833333358168602,
"reward": 0.15000000447034836,
"reward_std": 0.2066778928041458,
"rewards/MultiModalAccuracyORM": 0.15000000447034836,
"step": 80,
"train_speed(iter/s)": 0.042552
},
{
"clip_ratio": 0.0,
"completion_length": 13.350000309944154,
"epoch": 0.03434343434343434,
"grad_norm": 10.242753028869629,
"kl": 0.0002181917428970337,
"learning_rate": 1.3709677419354838e-07,
"loss": -0.0021827301010489465,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333805203438,
"reward_std": 0.36318225264549253,
"rewards/MultiModalAccuracyORM": 0.20833333805203438,
"step": 85,
"train_speed(iter/s)": 0.042776
},
{
"clip_ratio": 0.0,
"completion_length": 31.70833353996277,
"epoch": 0.03636363636363636,
"grad_norm": 18.3216552734375,
"kl": 0.00013442039489746093,
"learning_rate": 1.4516129032258064e-07,
"loss": -0.014865413308143616,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000521540643,
"reward_std": 0.19786564111709595,
"rewards/MultiModalAccuracyORM": 0.17500000521540643,
"step": 90,
"train_speed(iter/s)": 0.042668
},
{
"clip_ratio": 0.0,
"completion_length": 12.166666793823243,
"epoch": 0.03838383838383838,
"grad_norm": 2.986149311065674,
"kl": 0.00017652511596679687,
"learning_rate": 1.5322580645161288e-07,
"loss": -0.004295501857995987,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.0833333358168602,
"reward_std": 0.18482151627540588,
"rewards/MultiModalAccuracyORM": 0.0833333358168602,
"step": 95,
"train_speed(iter/s)": 0.042663
},
{
"clip_ratio": 0.0,
"completion_length": 37.32500224113464,
"epoch": 0.04040404040404041,
"grad_norm": 9.087557792663574,
"kl": 0.00025534629821777344,
"learning_rate": 1.6129032258064515e-07,
"loss": -0.042690178751945494,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.3192540168762207,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 100,
"train_speed(iter/s)": 0.042723
},
{
"clip_ratio": 0.0,
"completion_length": 42.64166672229767,
"epoch": 0.04242424242424243,
"grad_norm": 1.299012303352356,
"kl": 0.000713956356048584,
"learning_rate": 1.6935483870967741e-07,
"loss": -0.01074601411819458,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1500000014901161,
"reward_std": 0.2782616138458252,
"rewards/MultiModalAccuracyORM": 0.1500000014901161,
"step": 105,
"train_speed(iter/s)": 0.042694
},
{
"clip_ratio": 0.0,
"completion_length": 25.308334159851075,
"epoch": 0.044444444444444446,
"grad_norm": 20.200790405273438,
"kl": -2.079010009765625e-05,
"learning_rate": 1.7741935483870968e-07,
"loss": -0.0049890361726284025,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000447034836,
"reward_std": 0.34557787179946897,
"rewards/MultiModalAccuracyORM": 0.17500000447034836,
"step": 110,
"train_speed(iter/s)": 0.042795
},
{
"clip_ratio": 0.0,
"completion_length": 17.325000619888307,
"epoch": 0.046464646464646465,
"grad_norm": 2.473445177078247,
"kl": 0.0003504753112792969,
"learning_rate": 1.8548387096774192e-07,
"loss": 0.009455542266368865,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15833333805203437,
"reward_std": 0.2629852324724197,
"rewards/MultiModalAccuracyORM": 0.15833333805203437,
"step": 115,
"train_speed(iter/s)": 0.042806
},
{
"clip_ratio": 0.0,
"completion_length": 17.07500042915344,
"epoch": 0.048484848484848485,
"grad_norm": 18.782503128051758,
"kl": 0.00040736198425292967,
"learning_rate": 1.9354838709677418e-07,
"loss": -0.00938464030623436,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10000000149011612,
"reward_std": 0.17861495018005372,
"rewards/MultiModalAccuracyORM": 0.10000000149011612,
"step": 120,
"train_speed(iter/s)": 0.042915
},
{
"clip_ratio": 0.0,
"completion_length": 40.666668796539305,
"epoch": 0.050505050505050504,
"grad_norm": 10.809483528137207,
"kl": 0.00013909339904785156,
"learning_rate": 2e-07,
"loss": 0.015682700276374816,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333417773247,
"reward_std": 0.2325587123632431,
"rewards/MultiModalAccuracyORM": 0.3083333417773247,
"step": 125,
"train_speed(iter/s)": 0.042982
},
{
"clip_ratio": 0.0,
"completion_length": 21.79166784286499,
"epoch": 0.052525252525252523,
"grad_norm": 0.059968430548906326,
"kl": 0.0003565549850463867,
"learning_rate": 2e-07,
"loss": -0.012978824973106384,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833334028720857,
"reward_std": 0.2775311887264252,
"rewards/MultiModalAccuracyORM": 0.20833334028720857,
"step": 130,
"train_speed(iter/s)": 0.043138
},
{
"clip_ratio": 0.0,
"completion_length": 9.358333635330201,
"epoch": 0.05454545454545454,
"grad_norm": 16.368749618530273,
"kl": 0.0005423665046691894,
"learning_rate": 2e-07,
"loss": -0.018562111258506774,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333432674408,
"reward_std": 0.3227223068475723,
"rewards/MultiModalAccuracyORM": 0.2083333432674408,
"step": 135,
"train_speed(iter/s)": 0.043281
},
{
"clip_ratio": 0.0,
"completion_length": 46.30833601951599,
"epoch": 0.05656565656565657,
"grad_norm": 8.052789688110352,
"kl": 0.0008988380432128906,
"learning_rate": 2e-07,
"loss": 0.05945103764533997,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333879709243,
"reward_std": 0.32900004684925077,
"rewards/MultiModalAccuracyORM": 0.20833333879709243,
"step": 140,
"train_speed(iter/s)": 0.043225
},
{
"clip_ratio": 0.0,
"completion_length": 4.983333492279053,
"epoch": 0.05858585858585859,
"grad_norm": 5.5169525146484375,
"kl": 0.0008536338806152344,
"learning_rate": 2e-07,
"loss": -0.03663218915462494,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2000000074505806,
"reward_std": 0.29079394936561587,
"rewards/MultiModalAccuracyORM": 0.2000000074505806,
"step": 145,
"train_speed(iter/s)": 0.043361
},
{
"clip_ratio": 0.0,
"completion_length": 7.125000166893005,
"epoch": 0.06060606060606061,
"grad_norm": 0.07958526909351349,
"kl": 0.001511383056640625,
"learning_rate": 2e-07,
"loss": 0.05411055088043213,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2333333373069763,
"reward_std": 0.27122943103313446,
"rewards/MultiModalAccuracyORM": 0.2333333373069763,
"step": 150,
"train_speed(iter/s)": 0.043443
},
{
"clip_ratio": 0.0,
"completion_length": 7.666666889190674,
"epoch": 0.06262626262626263,
"grad_norm": 0.0961478129029274,
"kl": 0.0021147727966308594,
"learning_rate": 2e-07,
"loss": 0.0017779668793082236,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000000447034835,
"reward_std": 0.22052658796310426,
"rewards/MultiModalAccuracyORM": 0.20000000447034835,
"step": 155,
"train_speed(iter/s)": 0.043434
},
{
"clip_ratio": 0.0,
"completion_length": 56.125001430511475,
"epoch": 0.06464646464646465,
"grad_norm": 3.5018489360809326,
"kl": 0.0011393070220947266,
"learning_rate": 2e-07,
"loss": 0.003215038776397705,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1833333395421505,
"reward_std": 0.2687189429998398,
"rewards/MultiModalAccuracyORM": 0.1833333395421505,
"step": 160,
"train_speed(iter/s)": 0.043475
},
{
"clip_ratio": 0.0,
"completion_length": 33.183334255218504,
"epoch": 0.06666666666666667,
"grad_norm": 1.7839807271957397,
"kl": 0.001880502700805664,
"learning_rate": 2e-07,
"loss": 0.037510618567466736,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1250000037252903,
"reward_std": 0.2629256367683411,
"rewards/MultiModalAccuracyORM": 0.1250000037252903,
"step": 165,
"train_speed(iter/s)": 0.04338
},
{
"clip_ratio": 0.0,
"completion_length": 12.583333587646484,
"epoch": 0.06868686868686869,
"grad_norm": 2.9806480407714844,
"kl": 0.001198887825012207,
"learning_rate": 2e-07,
"loss": 0.007929786294698715,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.19166667461395265,
"reward_std": 0.21750431060791015,
"rewards/MultiModalAccuracyORM": 0.19166667461395265,
"step": 170,
"train_speed(iter/s)": 0.043348
},
{
"clip_ratio": 0.0,
"completion_length": 10.308333587646484,
"epoch": 0.0707070707070707,
"grad_norm": 0.006374528165906668,
"kl": 0.008016198873519897,
"learning_rate": 2e-07,
"loss": 0.0161195233464241,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.19166667014360428,
"reward_std": 0.2822715133428574,
"rewards/MultiModalAccuracyORM": 0.19166667014360428,
"step": 175,
"train_speed(iter/s)": 0.043522
},
{
"clip_ratio": 0.0,
"completion_length": 17.283334064483643,
"epoch": 0.07272727272727272,
"grad_norm": 13.373006820678711,
"kl": 0.005344104766845703,
"learning_rate": 2e-07,
"loss": 0.005642924830317498,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1250000037252903,
"reward_std": 0.2629256367683411,
"rewards/MultiModalAccuracyORM": 0.1250000037252903,
"step": 180,
"train_speed(iter/s)": 0.043562
},
{
"clip_ratio": 0.0,
"completion_length": 7.858333396911621,
"epoch": 0.07474747474747474,
"grad_norm": 20.940757751464844,
"kl": 0.004119682312011719,
"learning_rate": 2e-07,
"loss": -0.014204351603984833,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667312383653,
"reward_std": 0.24560283720493317,
"rewards/MultiModalAccuracyORM": 0.21666667312383653,
"step": 185,
"train_speed(iter/s)": 0.043604
},
{
"clip_ratio": 0.0,
"completion_length": 10.808333730697631,
"epoch": 0.07676767676767676,
"grad_norm": 1.9175783395767212,
"kl": 0.0015784263610839843,
"learning_rate": 2e-07,
"loss": 0.036653178930282596,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.27774982452392577,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 190,
"train_speed(iter/s)": 0.043708
},
{
"clip_ratio": 0.0,
"completion_length": 8.058333468437194,
"epoch": 0.07878787878787878,
"grad_norm": 20.731929779052734,
"kl": 0.002748870849609375,
"learning_rate": 2e-07,
"loss": -0.007462918758392334,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667461395264,
"reward_std": 0.26047474443912505,
"rewards/MultiModalAccuracyORM": 0.24166667461395264,
"step": 195,
"train_speed(iter/s)": 0.043797
},
{
"clip_ratio": 0.0,
"completion_length": 39.766668224334715,
"epoch": 0.08080808080808081,
"grad_norm": 32.81786346435547,
"kl": 0.012819027900695801,
"learning_rate": 2e-07,
"loss": -0.012741921842098236,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1916666731238365,
"reward_std": 0.3634008765220642,
"rewards/MultiModalAccuracyORM": 0.1916666731238365,
"step": 200,
"train_speed(iter/s)": 0.043791
},
{
"clip_ratio": 0.0,
"completion_length": 7.191666889190674,
"epoch": 0.08282828282828283,
"grad_norm": 10.631654739379883,
"kl": 0.007097434997558594,
"learning_rate": 2e-07,
"loss": -0.059709519147872925,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000223517418,
"reward_std": 0.26302082240581515,
"rewards/MultiModalAccuracyORM": 0.15000000223517418,
"step": 205,
"train_speed(iter/s)": 0.043862
},
{
"clip_ratio": 0.0,
"completion_length": 10.100000143051147,
"epoch": 0.08484848484848485,
"grad_norm": 15.135857582092285,
"kl": 0.016997623443603515,
"learning_rate": 2e-07,
"loss": 0.036284705996513365,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10833333730697632,
"reward_std": 0.24481281042098998,
"rewards/MultiModalAccuracyORM": 0.10833333730697632,
"step": 210,
"train_speed(iter/s)": 0.043845
},
{
"clip_ratio": 0.0,
"completion_length": 10.075000190734864,
"epoch": 0.08686868686868687,
"grad_norm": 15.046256065368652,
"kl": 0.013745307922363281,
"learning_rate": 2e-07,
"loss": -0.01842743158340454,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.3001325339078903,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 215,
"train_speed(iter/s)": 0.043911
},
{
"clip_ratio": 0.0,
"completion_length": 31.94166750907898,
"epoch": 0.08888888888888889,
"grad_norm": 14.397719383239746,
"kl": 0.01525421142578125,
"learning_rate": 2e-07,
"loss": -0.016506943106651305,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000894069672,
"reward_std": 0.24662604331970214,
"rewards/MultiModalAccuracyORM": 0.22500000894069672,
"step": 220,
"train_speed(iter/s)": 0.043992
},
{
"clip_ratio": 0.0,
"completion_length": 5.44166669845581,
"epoch": 0.09090909090909091,
"grad_norm": 12.164202690124512,
"kl": 0.025649261474609376,
"learning_rate": 2e-07,
"loss": 0.017044636607170104,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000074505806,
"reward_std": 0.28160068988800047,
"rewards/MultiModalAccuracyORM": 0.3000000074505806,
"step": 225,
"train_speed(iter/s)": 0.044113
},
{
"clip_ratio": 0.0,
"completion_length": 39.383334040641785,
"epoch": 0.09292929292929293,
"grad_norm": 21.127038955688477,
"kl": 0.024017763137817384,
"learning_rate": 2e-07,
"loss": 0.02930714190006256,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1666666693985462,
"reward_std": 0.26196202635765076,
"rewards/MultiModalAccuracyORM": 0.1666666693985462,
"step": 230,
"train_speed(iter/s)": 0.044142
},
{
"clip_ratio": 0.0,
"completion_length": 14.891666889190674,
"epoch": 0.09494949494949495,
"grad_norm": 6.2940568923950195,
"kl": 0.027823114395141603,
"learning_rate": 2e-07,
"loss": -0.009951599687337876,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000521540643,
"reward_std": 0.3036638140678406,
"rewards/MultiModalAccuracyORM": 0.17500000521540643,
"step": 235,
"train_speed(iter/s)": 0.044213
},
{
"clip_ratio": 0.0,
"completion_length": 6.750000047683716,
"epoch": 0.09696969696969697,
"grad_norm": 3.980544090270996,
"kl": 0.018259000778198243,
"learning_rate": 2e-07,
"loss": -0.020673815906047822,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.16666667014360428,
"reward_std": 0.21823472976684571,
"rewards/MultiModalAccuracyORM": 0.16666667014360428,
"step": 240,
"train_speed(iter/s)": 0.044233
},
{
"clip_ratio": 0.0,
"completion_length": 10.80000023841858,
"epoch": 0.09898989898989899,
"grad_norm": 1.3881502151489258,
"kl": 0.000605630874633789,
"learning_rate": 2e-07,
"loss": -0.01487920731306076,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333334177732467,
"reward_std": 0.30661733746528624,
"rewards/MultiModalAccuracyORM": 0.28333334177732467,
"step": 245,
"train_speed(iter/s)": 0.044235
},
{
"epoch": 0.10101010101010101,
"grad_norm": 11.512455940246582,
"learning_rate": 2e-07,
"loss": 0.033054867386817934,
"memory(GiB)": 104.49,
"step": 250,
"train_speed(iter/s)": 0.044081
},
{
"epoch": 0.10101010101010101,
"eval_clip_ratio": 0.0,
"eval_completion_length": 24.26333417892456,
"eval_kl": 0.022986836433410644,
"eval_loss": 0.027694934979081154,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.2150000040233135,
"eval_reward_std": 0.2852368396520615,
"eval_rewards/MultiModalAccuracyORM": 0.2150000040233135,
"eval_runtime": 262.2909,
"eval_samples_per_second": 0.191,
"eval_steps_per_second": 0.019,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 46.133334922790525,
"epoch": 0.10303030303030303,
"grad_norm": 4.130315780639648,
"kl": 0.018082523345947267,
"learning_rate": 2e-07,
"loss": 0.024475347995758057,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166666977107526,
"reward_std": 0.2766233593225479,
"rewards/MultiModalAccuracyORM": 0.24166666977107526,
"step": 255,
"train_speed(iter/s)": 0.041648
},
{
"clip_ratio": 0.0,
"completion_length": 6.6,
"epoch": 0.10505050505050505,
"grad_norm": 10.52556324005127,
"kl": 0.020127105712890624,
"learning_rate": 2e-07,
"loss": -0.008974193781614303,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000037252903,
"reward_std": 0.2567190647125244,
"rewards/MultiModalAccuracyORM": 0.2750000037252903,
"step": 260,
"train_speed(iter/s)": 0.041738
},
{
"clip_ratio": 0.0,
"completion_length": 6.45,
"epoch": 0.10707070707070707,
"grad_norm": 11.179485321044922,
"kl": 0.03880462646484375,
"learning_rate": 2e-07,
"loss": 0.0015405803918838502,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1666666716337204,
"reward_std": 0.2918527454137802,
"rewards/MultiModalAccuracyORM": 0.1666666716337204,
"step": 265,
"train_speed(iter/s)": 0.041756
},
{
"clip_ratio": 0.0,
"completion_length": 18.75,
"epoch": 0.10909090909090909,
"grad_norm": 4.639992713928223,
"kl": 0.018306541442871093,
"learning_rate": 2e-07,
"loss": -0.012826296687126159,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10833333656191826,
"reward_std": 0.174764084815979,
"rewards/MultiModalAccuracyORM": 0.10833333656191826,
"step": 270,
"train_speed(iter/s)": 0.041759
},
{
"clip_ratio": 0.0,
"completion_length": 117.6,
"epoch": 0.1111111111111111,
"grad_norm": 14.52376651763916,
"kl": 0.02277069091796875,
"learning_rate": 2e-07,
"loss": -0.03760814070701599,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.05,
"reward": 0.30833334028720855,
"reward_std": 0.3679845929145813,
"rewards/MultiModalAccuracyORM": 0.30833334028720855,
"step": 275,
"train_speed(iter/s)": 0.041762
},
{
"clip_ratio": 0.0,
"completion_length": 41.5,
"epoch": 0.11313131313131314,
"grad_norm": 7.044532775878906,
"kl": 0.04247570037841797,
"learning_rate": 2e-07,
"loss": 0.05246252417564392,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000670552253,
"reward_std": 0.30385262966156007,
"rewards/MultiModalAccuracyORM": 0.22500000670552253,
"step": 280,
"train_speed(iter/s)": 0.041745
},
{
"clip_ratio": 0.0,
"completion_length": 23.9,
"epoch": 0.11515151515151516,
"grad_norm": 3.5612969398498535,
"kl": 0.04666891098022461,
"learning_rate": 2e-07,
"loss": -0.03580006957054138,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.14166667312383652,
"reward_std": 0.20594746768474578,
"rewards/MultiModalAccuracyORM": 0.14166667312383652,
"step": 285,
"train_speed(iter/s)": 0.041805
},
{
"clip_ratio": 0.0,
"completion_length": 57.5,
"epoch": 0.11717171717171718,
"grad_norm": 22.66056251525879,
"kl": 0.0072917938232421875,
"learning_rate": 2e-07,
"loss": 0.030799278616905214,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667386889457,
"reward_std": 0.3523798406124115,
"rewards/MultiModalAccuracyORM": 0.24166667386889457,
"step": 290,
"train_speed(iter/s)": 0.041794
},
{
"clip_ratio": 0.0,
"completion_length": 64.1,
"epoch": 0.1191919191919192,
"grad_norm": 16.353897094726562,
"kl": 0.02278270721435547,
"learning_rate": 2e-07,
"loss": 0.0040659308433532715,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.07500000149011612,
"reward_std": 0.16200153529644012,
"rewards/MultiModalAccuracyORM": 0.07500000149011612,
"step": 295,
"train_speed(iter/s)": 0.041713
},
{
"clip_ratio": 0.0,
"completion_length": 26.0,
"epoch": 0.12121212121212122,
"grad_norm": 3.0584208965301514,
"kl": 0.021613693237304686,
"learning_rate": 2e-07,
"loss": 0.015577539801597595,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000447034837,
"reward_std": 0.2175043046474457,
"rewards/MultiModalAccuracyORM": 0.12500000447034837,
"step": 300,
"train_speed(iter/s)": 0.041708
},
{
"clip_ratio": 0.0,
"completion_length": 8.4,
"epoch": 0.12323232323232323,
"grad_norm": 2.683347225189209,
"kl": 0.05754499435424805,
"learning_rate": 2e-07,
"loss": 0.0014399250969290734,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.13333333805203437,
"reward_std": 0.24637182354927062,
"rewards/MultiModalAccuracyORM": 0.13333333805203437,
"step": 305,
"train_speed(iter/s)": 0.041731
},
{
"clip_ratio": 0.0,
"completion_length": 11.0,
"epoch": 0.12525252525252525,
"grad_norm": 4.011137008666992,
"kl": 0.003471851348876953,
"learning_rate": 2e-07,
"loss": -0.012657842040061951,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000223517418,
"reward_std": 0.17781037986278533,
"rewards/MultiModalAccuracyORM": 0.12500000223517418,
"step": 310,
"train_speed(iter/s)": 0.041745
},
{
"clip_ratio": 0.0,
"completion_length": 14.4,
"epoch": 0.12727272727272726,
"grad_norm": 2.4296364784240723,
"kl": 0.01938905715942383,
"learning_rate": 2e-07,
"loss": 0.023499640822410583,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000149011613,
"reward_std": 0.1808116167783737,
"rewards/MultiModalAccuracyORM": 0.17500000149011613,
"step": 315,
"train_speed(iter/s)": 0.041811
},
{
"clip_ratio": 0.0,
"completion_length": 9.35,
"epoch": 0.1292929292929293,
"grad_norm": 1.5319490432739258,
"kl": 0.023272895812988283,
"learning_rate": 2e-07,
"loss": -0.0005661348812282085,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000298023223,
"reward_std": 0.23860624432563782,
"rewards/MultiModalAccuracyORM": 0.22500000298023223,
"step": 320,
"train_speed(iter/s)": 0.041846
},
{
"clip_ratio": 0.0,
"completion_length": 14.8,
"epoch": 0.13131313131313133,
"grad_norm": 28.09259605407715,
"kl": 0.055776214599609374,
"learning_rate": 2e-07,
"loss": -0.00978400707244873,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15833333507180214,
"reward_std": 0.2785158395767212,
"rewards/MultiModalAccuracyORM": 0.15833333507180214,
"step": 325,
"train_speed(iter/s)": 0.041894
},
{
"clip_ratio": 0.0,
"completion_length": 9.8,
"epoch": 0.13333333333333333,
"grad_norm": 5.655847072601318,
"kl": 0.01194305419921875,
"learning_rate": 2e-07,
"loss": -0.023021923005580903,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2250000037252903,
"reward_std": 0.242361918091774,
"rewards/MultiModalAccuracyORM": 0.2250000037252903,
"step": 330,
"train_speed(iter/s)": 0.041922
},
{
"clip_ratio": 0.0,
"completion_length": 10.0,
"epoch": 0.13535353535353536,
"grad_norm": 16.269479751586914,
"kl": 0.012023067474365235,
"learning_rate": 2e-07,
"loss": 0.009542696177959442,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333380520344,
"reward_std": 0.4074155628681183,
"rewards/MultiModalAccuracyORM": 0.2583333380520344,
"step": 335,
"train_speed(iter/s)": 0.041926
},
{
"clip_ratio": 0.0,
"completion_length": 9.1,
"epoch": 0.13737373737373737,
"grad_norm": 19.7489013671875,
"kl": 0.041985511779785156,
"learning_rate": 2e-07,
"loss": -0.009631294012069701,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000111758709,
"reward_std": 0.38227055966854095,
"rewards/MultiModalAccuracyORM": 0.3250000111758709,
"step": 340,
"train_speed(iter/s)": 0.042003
},
{
"clip_ratio": 0.0,
"completion_length": 17.25,
"epoch": 0.1393939393939394,
"grad_norm": 25.704818725585938,
"kl": 0.02933082580566406,
"learning_rate": 2e-07,
"loss": 0.005663518235087395,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000596046447,
"reward_std": 0.287842845916748,
"rewards/MultiModalAccuracyORM": 0.17500000596046447,
"step": 345,
"train_speed(iter/s)": 0.042012
},
{
"clip_ratio": 0.0,
"completion_length": 25.0,
"epoch": 0.1414141414141414,
"grad_norm": 30.1114559173584,
"kl": 0.010479164123535157,
"learning_rate": 2e-07,
"loss": 0.018732863664627075,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000447034837,
"reward_std": 0.2077010989189148,
"rewards/MultiModalAccuracyORM": 0.12500000447034837,
"step": 350,
"train_speed(iter/s)": 0.041986
},
{
"clip_ratio": 0.0,
"completion_length": 18.65,
"epoch": 0.14343434343434344,
"grad_norm": 4.131731033325195,
"kl": 0.03218498229980469,
"learning_rate": 2e-07,
"loss": 0.05048830509185791,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1833333380520344,
"reward_std": 0.22854881286621093,
"rewards/MultiModalAccuracyORM": 0.1833333380520344,
"step": 355,
"train_speed(iter/s)": 0.041992
},
{
"clip_ratio": 0.0,
"completion_length": 5.9,
"epoch": 0.14545454545454545,
"grad_norm": 2.5443966388702393,
"kl": 0.028252887725830077,
"learning_rate": 2e-07,
"loss": 0.011212460696697235,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666813194752,
"reward_std": 0.3104326128959656,
"rewards/MultiModalAccuracyORM": 0.3166666813194752,
"step": 360,
"train_speed(iter/s)": 0.042049
},
{
"clip_ratio": 0.0,
"completion_length": 52.05,
"epoch": 0.14747474747474748,
"grad_norm": 4.374809265136719,
"kl": 0.024268913269042968,
"learning_rate": 2e-07,
"loss": -0.0001811852096579969,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000447034836,
"reward_std": 0.20544483065605162,
"rewards/MultiModalAccuracyORM": 0.15000000447034836,
"step": 365,
"train_speed(iter/s)": 0.042035
},
{
"clip_ratio": 0.0,
"completion_length": 9.75,
"epoch": 0.1494949494949495,
"grad_norm": 16.779956817626953,
"kl": 0.015867042541503906,
"learning_rate": 2e-07,
"loss": 0.022855284810066222,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000447034836,
"reward_std": 0.17529989182949066,
"rewards/MultiModalAccuracyORM": 0.17500000447034836,
"step": 370,
"train_speed(iter/s)": 0.042043
},
{
"clip_ratio": 0.0,
"completion_length": 32.05,
"epoch": 0.15151515151515152,
"grad_norm": 1.799055576324463,
"kl": 0.02576103210449219,
"learning_rate": 2e-07,
"loss": 0.03886341452598572,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000670552255,
"reward_std": 0.23481498062610626,
"rewards/MultiModalAccuracyORM": 0.15000000670552255,
"step": 375,
"train_speed(iter/s)": 0.041993
},
{
"clip_ratio": 0.0,
"completion_length": 11.3,
"epoch": 0.15353535353535352,
"grad_norm": 14.809004783630371,
"kl": 0.06607561111450196,
"learning_rate": 2e-07,
"loss": 0.02258915901184082,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.27759079039096834,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 380,
"train_speed(iter/s)": 0.042034
},
{
"clip_ratio": 0.0,
"completion_length": 11.6,
"epoch": 0.15555555555555556,
"grad_norm": 4.855790138244629,
"kl": 0.044758033752441403,
"learning_rate": 2e-07,
"loss": 0.006666116416454315,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1916666731238365,
"reward_std": 0.2877832442522049,
"rewards/MultiModalAccuracyORM": 0.1916666731238365,
"step": 385,
"train_speed(iter/s)": 0.042053
},
{
"clip_ratio": 0.0,
"completion_length": 5.45,
"epoch": 0.15757575757575756,
"grad_norm": 3.650961399078369,
"kl": 0.09126663208007812,
"learning_rate": 2e-07,
"loss": -0.006338779628276825,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000014901161,
"reward_std": 0.22384165227413177,
"rewards/MultiModalAccuracyORM": 0.2750000014901161,
"step": 390,
"train_speed(iter/s)": 0.04209
},
{
"clip_ratio": 0.0,
"completion_length": 21.7,
"epoch": 0.1595959595959596,
"grad_norm": 22.398860931396484,
"kl": 0.05564393997192383,
"learning_rate": 2e-07,
"loss": 0.011527793109416961,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.3385071337223053,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 395,
"train_speed(iter/s)": 0.04213
},
{
"clip_ratio": 0.0,
"completion_length": 32.45,
"epoch": 0.16161616161616163,
"grad_norm": 3.777151346206665,
"kl": 0.08077354431152343,
"learning_rate": 2e-07,
"loss": 0.02410067617893219,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.11666666939854622,
"reward_std": 0.2687189429998398,
"rewards/MultiModalAccuracyORM": 0.11666666939854622,
"step": 400,
"train_speed(iter/s)": 0.04213
},
{
"clip_ratio": 0.0,
"completion_length": 5.7,
"epoch": 0.16363636363636364,
"grad_norm": 6.114872455596924,
"kl": 0.09431419372558594,
"learning_rate": 2e-07,
"loss": 0.02062232345342636,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2833333410322666,
"reward_std": 0.384308198094368,
"rewards/MultiModalAccuracyORM": 0.2833333410322666,
"step": 405,
"train_speed(iter/s)": 0.042217
},
{
"clip_ratio": 0.0,
"completion_length": 11.1,
"epoch": 0.16565656565656567,
"grad_norm": 2.8733115196228027,
"kl": 0.07746734619140624,
"learning_rate": 2e-07,
"loss": 0.014683787524700165,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3500000074505806,
"reward_std": 0.28160068988800047,
"rewards/MultiModalAccuracyORM": 0.3500000074505806,
"step": 410,
"train_speed(iter/s)": 0.042237
},
{
"clip_ratio": 0.0,
"completion_length": 29.55,
"epoch": 0.16767676767676767,
"grad_norm": 1.103491187095642,
"kl": 0.013630294799804687,
"learning_rate": 2e-07,
"loss": 0.031570857763290404,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333333805203438,
"reward_std": 0.3222051203250885,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 415,
"train_speed(iter/s)": 0.042253
},
{
"clip_ratio": 0.0,
"completion_length": 11.65,
"epoch": 0.1696969696969697,
"grad_norm": 19.609107971191406,
"kl": 0.006585693359375,
"learning_rate": 2e-07,
"loss": 0.029933744668960573,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166666865348817,
"reward_std": 0.2815766781568527,
"rewards/MultiModalAccuracyORM": 0.29166666865348817,
"step": 420,
"train_speed(iter/s)": 0.042267
},
{
"clip_ratio": 0.0,
"completion_length": 33.85,
"epoch": 0.1717171717171717,
"grad_norm": 3.5567312240600586,
"kl": 0.027184486389160156,
"learning_rate": 2e-07,
"loss": -0.008297288417816162,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000067055225,
"reward_std": 0.3423224091529846,
"rewards/MultiModalAccuracyORM": 0.3000000067055225,
"step": 425,
"train_speed(iter/s)": 0.042268
},
{
"clip_ratio": 0.0,
"completion_length": 40.6,
"epoch": 0.17373737373737375,
"grad_norm": 4.005617141723633,
"kl": 0.037563323974609375,
"learning_rate": 2e-07,
"loss": -0.008759691566228866,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000298023223,
"reward_std": 0.2403598755598068,
"rewards/MultiModalAccuracyORM": 0.22500000298023223,
"step": 430,
"train_speed(iter/s)": 0.042273
},
{
"clip_ratio": 0.0,
"completion_length": 29.65,
"epoch": 0.17575757575757575,
"grad_norm": 1.1876083612442017,
"kl": 0.04276580810546875,
"learning_rate": 2e-07,
"loss": 0.009293363988399505,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333333805203438,
"reward_std": 0.25639069378376006,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 435,
"train_speed(iter/s)": 0.042306
},
{
"clip_ratio": 0.0,
"completion_length": 24.1,
"epoch": 0.17777777777777778,
"grad_norm": 1.259384274482727,
"kl": 0.09014434814453125,
"learning_rate": 2e-07,
"loss": 0.07308403849601745,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333373069763,
"reward_std": 0.2925831705331802,
"rewards/MultiModalAccuracyORM": 0.2083333373069763,
"step": 440,
"train_speed(iter/s)": 0.042352
},
{
"clip_ratio": 0.0,
"completion_length": 7.1,
"epoch": 0.1797979797979798,
"grad_norm": 1.2361171245574951,
"kl": 0.0314971923828125,
"learning_rate": 2e-07,
"loss": -0.04375269114971161,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30833333656191825,
"reward_std": 0.29863070249557494,
"rewards/MultiModalAccuracyORM": 0.30833333656191825,
"step": 445,
"train_speed(iter/s)": 0.042392
},
{
"clip_ratio": 0.0,
"completion_length": 6.65,
"epoch": 0.18181818181818182,
"grad_norm": 2.4363491535186768,
"kl": 0.07178993225097656,
"learning_rate": 2e-07,
"loss": 0.0028454601764678956,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000596046447,
"reward_std": 0.41791602969169617,
"rewards/MultiModalAccuracyORM": 0.32500000596046447,
"step": 450,
"train_speed(iter/s)": 0.042441
},
{
"clip_ratio": 0.0,
"completion_length": 6.6,
"epoch": 0.18383838383838383,
"grad_norm": 12.971217155456543,
"kl": 0.05601959228515625,
"learning_rate": 2e-07,
"loss": 0.012572245299816131,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.22704698145389557,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 455,
"train_speed(iter/s)": 0.042477
},
{
"clip_ratio": 0.0,
"completion_length": 32.85,
"epoch": 0.18585858585858586,
"grad_norm": 11.262785911560059,
"kl": 0.014653778076171875,
"learning_rate": 2e-07,
"loss": 0.005643188953399658,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000022351742,
"reward_std": 0.26040059328079224,
"rewards/MultiModalAccuracyORM": 0.2750000022351742,
"step": 460,
"train_speed(iter/s)": 0.042456
},
{
"clip_ratio": 0.0,
"completion_length": 17.2,
"epoch": 0.18787878787878787,
"grad_norm": 9.14407730102539,
"kl": 0.03995361328125,
"learning_rate": 2e-07,
"loss": 0.0012056897394359112,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000521540642,
"reward_std": 0.3923635810613632,
"rewards/MultiModalAccuracyORM": 0.22500000521540642,
"step": 465,
"train_speed(iter/s)": 0.042452
},
{
"clip_ratio": 0.0,
"completion_length": 5.2,
"epoch": 0.1898989898989899,
"grad_norm": 2.3540585041046143,
"kl": 0.041180419921875,
"learning_rate": 2e-07,
"loss": 0.018683533370494842,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10000000149011612,
"reward_std": 0.20722824335098267,
"rewards/MultiModalAccuracyORM": 0.10000000149011612,
"step": 470,
"train_speed(iter/s)": 0.042503
},
{
"clip_ratio": 0.0,
"completion_length": 27.25,
"epoch": 0.1919191919191919,
"grad_norm": 6.397303581237793,
"kl": 0.02938995361328125,
"learning_rate": 2e-07,
"loss": 0.005294787883758545,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.308333333581686,
"reward_std": 0.31422091126441953,
"rewards/MultiModalAccuracyORM": 0.308333333581686,
"step": 475,
"train_speed(iter/s)": 0.042517
},
{
"clip_ratio": 0.0,
"completion_length": 11.45,
"epoch": 0.19393939393939394,
"grad_norm": 15.569790840148926,
"kl": 0.07780342102050782,
"learning_rate": 2e-07,
"loss": 0.012630045413970947,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.36667739152908324,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 480,
"train_speed(iter/s)": 0.042512
},
{
"clip_ratio": 0.0,
"completion_length": 9.95,
"epoch": 0.19595959595959597,
"grad_norm": 12.205713272094727,
"kl": 0.02214508056640625,
"learning_rate": 2e-07,
"loss": 0.012730973958969116,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333656191824,
"reward_std": 0.25566026866436004,
"rewards/MultiModalAccuracyORM": 0.35833333656191824,
"step": 485,
"train_speed(iter/s)": 0.042552
},
{
"clip_ratio": 0.0,
"completion_length": 16.55,
"epoch": 0.19797979797979798,
"grad_norm": 0.97981858253479,
"kl": 0.05444526672363281,
"learning_rate": 2e-07,
"loss": 0.006719142198562622,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10000000074505806,
"reward_std": 0.203472563624382,
"rewards/MultiModalAccuracyORM": 0.10000000074505806,
"step": 490,
"train_speed(iter/s)": 0.04257
},
{
"clip_ratio": 0.0,
"completion_length": 31.85,
"epoch": 0.2,
"grad_norm": 2.1149213314056396,
"kl": 0.06137847900390625,
"learning_rate": 2e-07,
"loss": 0.04113571047782898,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.38333334401249886,
"reward_std": 0.3259632259607315,
"rewards/MultiModalAccuracyORM": 0.38333334401249886,
"step": 495,
"train_speed(iter/s)": 0.042559
},
{
"epoch": 0.20202020202020202,
"grad_norm": 18.28374671936035,
"learning_rate": 2e-07,
"loss": 0.0038329623639583588,
"memory(GiB)": 104.49,
"step": 500,
"train_speed(iter/s)": 0.042571
},
{
"epoch": 0.20202020202020202,
"eval_clip_ratio": 0.0,
"eval_completion_length": 26.648334164619445,
"eval_kl": 0.08782589912414551,
"eval_loss": 7.593631835334236e-06,
"eval_response_clip_ratio": 0.001666666716337204,
"eval_reward": 0.2816666740179062,
"eval_reward_std": 0.3331107318401337,
"eval_rewards/MultiModalAccuracyORM": 0.2816666740179062,
"eval_runtime": 274.2098,
"eval_samples_per_second": 0.182,
"eval_steps_per_second": 0.018,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 11.25,
"epoch": 0.20404040404040405,
"grad_norm": 6.910037517547607,
"kl": 0.07545309066772461,
"learning_rate": 2e-07,
"loss": 0.02395549863576889,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30416667498648164,
"reward_std": 0.2502841353416443,
"rewards/MultiModalAccuracyORM": 0.30416667498648164,
"step": 505,
"train_speed(iter/s)": 0.041389
},
{
"clip_ratio": 0.0,
"completion_length": 5.3,
"epoch": 0.20606060606060606,
"grad_norm": 7.303215503692627,
"kl": 0.03816680908203125,
"learning_rate": 2e-07,
"loss": 0.012394474446773529,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.20363159775733947,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 510,
"train_speed(iter/s)": 0.041415
},
{
"clip_ratio": 0.0,
"completion_length": 37.0,
"epoch": 0.2080808080808081,
"grad_norm": 2.0224409103393555,
"kl": 0.038478851318359375,
"learning_rate": 2e-07,
"loss": -0.017507487535476686,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333387970924,
"reward_std": 0.37155145704746245,
"rewards/MultiModalAccuracyORM": 0.2583333387970924,
"step": 515,
"train_speed(iter/s)": 0.041435
},
{
"clip_ratio": 0.0,
"completion_length": 22.9,
"epoch": 0.2101010101010101,
"grad_norm": 9.651928901672363,
"kl": 0.00984039306640625,
"learning_rate": 2e-07,
"loss": -0.002422221563756466,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3500000014901161,
"reward_std": 0.30187162160873415,
"rewards/MultiModalAccuracyORM": 0.3500000014901161,
"step": 520,
"train_speed(iter/s)": 0.041478
},
{
"clip_ratio": 0.0,
"completion_length": 9.75,
"epoch": 0.21212121212121213,
"grad_norm": 5.6520562171936035,
"kl": 0.031005859375,
"learning_rate": 2e-07,
"loss": 0.00025533935986459254,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.2338038921356201,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 525,
"train_speed(iter/s)": 0.041505
},
{
"clip_ratio": 0.0,
"completion_length": 6.5,
"epoch": 0.21414141414141413,
"grad_norm": 20.748729705810547,
"kl": 0.0915985107421875,
"learning_rate": 2e-07,
"loss": -0.01767445057630539,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000096857548,
"reward_std": 0.38835368156433103,
"rewards/MultiModalAccuracyORM": 0.3000000096857548,
"step": 530,
"train_speed(iter/s)": 0.041524
},
{
"clip_ratio": 0.0,
"completion_length": 11.45,
"epoch": 0.21616161616161617,
"grad_norm": 0.023180894553661346,
"kl": 0.07088775634765625,
"learning_rate": 2e-07,
"loss": 0.029787826538085937,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000521540644,
"reward_std": 0.2526735752820969,
"rewards/MultiModalAccuracyORM": 0.37500000521540644,
"step": 535,
"train_speed(iter/s)": 0.041552
},
{
"clip_ratio": 0.0,
"completion_length": 9.2,
"epoch": 0.21818181818181817,
"grad_norm": 16.621583938598633,
"kl": 0.05093994140625,
"learning_rate": 2e-07,
"loss": -0.009274721145629883,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166667312383654,
"reward_std": 0.28561058938503264,
"rewards/MultiModalAccuracyORM": 0.29166667312383654,
"step": 540,
"train_speed(iter/s)": 0.041581
},
{
"clip_ratio": 0.0,
"completion_length": 8.45,
"epoch": 0.2202020202020202,
"grad_norm": 17.103206634521484,
"kl": 0.0737823486328125,
"learning_rate": 2e-07,
"loss": 0.021037888526916505,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3583333432674408,
"reward_std": 0.28561058938503264,
"rewards/MultiModalAccuracyORM": 0.3583333432674408,
"step": 545,
"train_speed(iter/s)": 0.041645
},
{
"clip_ratio": 0.0,
"completion_length": 28.85,
"epoch": 0.2222222222222222,
"grad_norm": 1.5227787494659424,
"kl": 0.07874641418457032,
"learning_rate": 2e-07,
"loss": 0.00487855076789856,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000223517417,
"reward_std": 0.21779412031173706,
"rewards/MultiModalAccuracyORM": 0.17500000223517417,
"step": 550,
"train_speed(iter/s)": 0.041506
},
{
"clip_ratio": 0.0,
"completion_length": 17.6,
"epoch": 0.22424242424242424,
"grad_norm": 13.277663230895996,
"kl": 0.039247894287109376,
"learning_rate": 2e-07,
"loss": 0.008411864936351775,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.13333333656191826,
"reward_std": 0.29784068167209626,
"rewards/MultiModalAccuracyORM": 0.13333333656191826,
"step": 555,
"train_speed(iter/s)": 0.041504
},
{
"clip_ratio": 0.0,
"completion_length": 34.2,
"epoch": 0.22626262626262628,
"grad_norm": 0.10883937031030655,
"kl": 0.06273307800292968,
"learning_rate": 2e-07,
"loss": 0.012170317023992539,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666679084301,
"reward_std": 0.12333081662654877,
"rewards/MultiModalAccuracyORM": 0.2666666679084301,
"step": 560,
"train_speed(iter/s)": 0.041519
},
{
"clip_ratio": 0.0,
"completion_length": 12.5,
"epoch": 0.22828282828282828,
"grad_norm": 12.209307670593262,
"kl": 0.04704780578613281,
"learning_rate": 2e-07,
"loss": 0.032337296009063723,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000074505806,
"reward_std": 0.2626924514770508,
"rewards/MultiModalAccuracyORM": 0.17500000074505806,
"step": 565,
"train_speed(iter/s)": 0.04155
},
{
"clip_ratio": 0.0,
"completion_length": 7.85,
"epoch": 0.23030303030303031,
"grad_norm": 4.45810079574585,
"kl": 0.05213623046875,
"learning_rate": 2e-07,
"loss": 0.001686885952949524,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000447034836,
"reward_std": 0.25891573131084444,
"rewards/MultiModalAccuracyORM": 0.30000000447034836,
"step": 570,
"train_speed(iter/s)": 0.041578
},
{
"clip_ratio": 0.0,
"completion_length": 65.55,
"epoch": 0.23232323232323232,
"grad_norm": 0.6504287719726562,
"kl": 0.08351707458496094,
"learning_rate": 2e-07,
"loss": 0.016631042957305907,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1500000014901161,
"reward_std": 0.26906835436820986,
"rewards/MultiModalAccuracyORM": 0.1500000014901161,
"step": 575,
"train_speed(iter/s)": 0.041538
},
{
"clip_ratio": 0.0,
"completion_length": 9.85,
"epoch": 0.23434343434343435,
"grad_norm": 27.585575103759766,
"kl": 0.1207763671875,
"learning_rate": 2e-07,
"loss": -0.036790531873703,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000067055225,
"reward_std": 0.3860618233680725,
"rewards/MultiModalAccuracyORM": 0.3000000067055225,
"step": 580,
"train_speed(iter/s)": 0.041563
},
{
"clip_ratio": 0.0,
"completion_length": 10.3,
"epoch": 0.23636363636363636,
"grad_norm": 10.094830513000488,
"kl": 0.04735574722290039,
"learning_rate": 2e-07,
"loss": 0.008206900209188461,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.19166667461395265,
"reward_std": 0.23631438612937927,
"rewards/MultiModalAccuracyORM": 0.19166667461395265,
"step": 585,
"train_speed(iter/s)": 0.041593
},
{
"clip_ratio": 0.0,
"completion_length": 127.6,
"epoch": 0.2383838383838384,
"grad_norm": 3.5195720195770264,
"kl": 0.03963155746459961,
"learning_rate": 2e-07,
"loss": 0.027892309427261352,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.05,
"reward": 0.22500000521540642,
"reward_std": 0.22224706113338472,
"rewards/MultiModalAccuracyORM": 0.22500000521540642,
"step": 590,
"train_speed(iter/s)": 0.041543
},
{
"clip_ratio": 0.0,
"completion_length": 8.75,
"epoch": 0.2404040404040404,
"grad_norm": 12.612972259521484,
"kl": 0.0610992431640625,
"learning_rate": 2e-07,
"loss": -0.022297632694244385,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000089406967,
"reward_std": 0.33376438319683077,
"rewards/MultiModalAccuracyORM": 0.4000000089406967,
"step": 595,
"train_speed(iter/s)": 0.041563
},
{
"clip_ratio": 0.0,
"completion_length": 21.75,
"epoch": 0.24242424242424243,
"grad_norm": 1.1488845348358154,
"kl": 0.06821136474609375,
"learning_rate": 2e-07,
"loss": 0.03176195621490478,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000074505805,
"reward_std": 0.3149157464504242,
"rewards/MultiModalAccuracyORM": 0.22500000074505805,
"step": 600,
"train_speed(iter/s)": 0.041549
},
{
"clip_ratio": 0.0,
"completion_length": 8.5,
"epoch": 0.24444444444444444,
"grad_norm": 4.132078170776367,
"kl": 0.07441596984863282,
"learning_rate": 2e-07,
"loss": 0.004773074015974999,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.13333333879709244,
"reward_std": 0.2323044866323471,
"rewards/MultiModalAccuracyORM": 0.13333333879709244,
"step": 605,
"train_speed(iter/s)": 0.041583
},
{
"clip_ratio": 0.0,
"completion_length": 16.85,
"epoch": 0.24646464646464647,
"grad_norm": 3.0928878784179688,
"kl": 0.050506591796875,
"learning_rate": 2e-07,
"loss": 0.0011304418556392192,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500000670552255,
"reward_std": 0.31520852744579314,
"rewards/MultiModalAccuracyORM": 0.27500000670552255,
"step": 610,
"train_speed(iter/s)": 0.041586
},
{
"clip_ratio": 0.0,
"completion_length": 8.45,
"epoch": 0.24848484848484848,
"grad_norm": 13.133064270019531,
"kl": 0.05210723876953125,
"learning_rate": 2e-07,
"loss": -0.009364684671163559,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833334028720857,
"reward_std": 0.28003925681114195,
"rewards/MultiModalAccuracyORM": 0.20833334028720857,
"step": 615,
"train_speed(iter/s)": 0.041615
},
{
"clip_ratio": 0.0,
"completion_length": 14.05,
"epoch": 0.2505050505050505,
"grad_norm": 21.168598175048828,
"kl": 0.06778411865234375,
"learning_rate": 2e-07,
"loss": -0.006833799928426742,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000096857548,
"reward_std": 0.330559054017067,
"rewards/MultiModalAccuracyORM": 0.3000000096857548,
"step": 620,
"train_speed(iter/s)": 0.041639
},
{
"clip_ratio": 0.0,
"completion_length": 7.35,
"epoch": 0.25252525252525254,
"grad_norm": 16.575620651245117,
"kl": 0.05116090774536133,
"learning_rate": 2e-07,
"loss": -0.016651205718517303,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2250000074505806,
"reward_std": 0.28859728276729585,
"rewards/MultiModalAccuracyORM": 0.2250000074505806,
"step": 625,
"train_speed(iter/s)": 0.041672
},
{
"clip_ratio": 0.0,
"completion_length": 9.7,
"epoch": 0.2545454545454545,
"grad_norm": 3.503321886062622,
"kl": 0.0628082275390625,
"learning_rate": 2e-07,
"loss": -0.008116110414266586,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.2330589234828949,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 630,
"train_speed(iter/s)": 0.041685
},
{
"clip_ratio": 0.0,
"completion_length": 6.0,
"epoch": 0.25656565656565655,
"grad_norm": 15.203675270080566,
"kl": 0.06846466064453124,
"learning_rate": 2e-07,
"loss": 0.011408740282058715,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.38333334028720856,
"reward_std": 0.3008869707584381,
"rewards/MultiModalAccuracyORM": 0.38333334028720856,
"step": 635,
"train_speed(iter/s)": 0.04173
},
{
"clip_ratio": 0.0,
"completion_length": 15.05,
"epoch": 0.2585858585858586,
"grad_norm": 32.77607727050781,
"kl": 0.12814788818359374,
"learning_rate": 2e-07,
"loss": -0.0371063232421875,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000081956387,
"reward_std": 0.36673698723316195,
"rewards/MultiModalAccuracyORM": 0.3250000081956387,
"step": 640,
"train_speed(iter/s)": 0.041758
},
{
"clip_ratio": 0.0,
"completion_length": 19.0,
"epoch": 0.2606060606060606,
"grad_norm": 15.344500541687012,
"kl": 0.105792236328125,
"learning_rate": 2e-07,
"loss": -0.006553761661052704,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333507180213,
"reward_std": 0.2597057640552521,
"rewards/MultiModalAccuracyORM": 0.20833333507180213,
"step": 645,
"train_speed(iter/s)": 0.041814
},
{
"clip_ratio": 0.0,
"completion_length": 46.8,
"epoch": 0.26262626262626265,
"grad_norm": 16.03054428100586,
"kl": 0.04459686279296875,
"learning_rate": 2e-07,
"loss": 0.036105594038963316,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000298023223,
"reward_std": 0.2403598755598068,
"rewards/MultiModalAccuracyORM": 0.22500000298023223,
"step": 650,
"train_speed(iter/s)": 0.041793
},
{
"clip_ratio": 0.0,
"completion_length": 16.05,
"epoch": 0.26464646464646463,
"grad_norm": 17.309656143188477,
"kl": 0.11004905700683594,
"learning_rate": 2e-07,
"loss": 0.017519061267375947,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333395421505,
"reward_std": 0.3682032287120819,
"rewards/MultiModalAccuracyORM": 0.2083333395421505,
"step": 655,
"train_speed(iter/s)": 0.04181
},
{
"clip_ratio": 0.0,
"completion_length": 13.3,
"epoch": 0.26666666666666666,
"grad_norm": 4.0642170906066895,
"kl": 0.054970169067382814,
"learning_rate": 2e-07,
"loss": -0.008081305027008056,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333333879709244,
"reward_std": 0.23230449259281158,
"rewards/MultiModalAccuracyORM": 0.28333333879709244,
"step": 660,
"train_speed(iter/s)": 0.041838
},
{
"clip_ratio": 0.0,
"completion_length": 45.25,
"epoch": 0.2686868686868687,
"grad_norm": 7.022747993469238,
"kl": 0.10093574523925782,
"learning_rate": 2e-07,
"loss": 0.027714025974273682,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.14166666865348815,
"reward_std": 0.2531497746706009,
"rewards/MultiModalAccuracyORM": 0.14166666865348815,
"step": 665,
"train_speed(iter/s)": 0.041812
},
{
"clip_ratio": 0.0,
"completion_length": 33.75,
"epoch": 0.27070707070707073,
"grad_norm": 9.984959602355957,
"kl": 0.023084259033203124,
"learning_rate": 2e-07,
"loss": 0.026220232248306274,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10000000149011612,
"reward_std": 0.20661829113960267,
"rewards/MultiModalAccuracyORM": 0.10000000149011612,
"step": 670,
"train_speed(iter/s)": 0.041806
},
{
"clip_ratio": 0.0,
"completion_length": 38.05,
"epoch": 0.2727272727272727,
"grad_norm": 7.702730178833008,
"kl": 0.16024627685546874,
"learning_rate": 2e-07,
"loss": -0.049201831221580505,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000670552254,
"reward_std": 0.20817729830741882,
"rewards/MultiModalAccuracyORM": 0.32500000670552254,
"step": 675,
"train_speed(iter/s)": 0.041821
},
{
"clip_ratio": 0.0,
"completion_length": 33.9,
"epoch": 0.27474747474747474,
"grad_norm": 0.16480083763599396,
"kl": 0.03549041748046875,
"learning_rate": 2e-07,
"loss": 0.006150122731924057,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333432674409,
"reward_std": 0.2323400765657425,
"rewards/MultiModalAccuracyORM": 0.20833333432674409,
"step": 680,
"train_speed(iter/s)": 0.041846
},
{
"clip_ratio": 0.0,
"completion_length": 10.8,
"epoch": 0.2767676767676768,
"grad_norm": 0.027387158945202827,
"kl": 0.10235595703125,
"learning_rate": 2e-07,
"loss": 0.02902156114578247,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15833333507180214,
"reward_std": 0.13583914041519166,
"rewards/MultiModalAccuracyORM": 0.15833333507180214,
"step": 685,
"train_speed(iter/s)": 0.041856
},
{
"clip_ratio": 0.0,
"completion_length": 13.5,
"epoch": 0.2787878787878788,
"grad_norm": 6.602695465087891,
"kl": 0.0608123779296875,
"learning_rate": 2e-07,
"loss": 0.012946502864360809,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667312383653,
"reward_std": 0.33300994634628295,
"rewards/MultiModalAccuracyORM": 0.21666667312383653,
"step": 690,
"train_speed(iter/s)": 0.04186
},
{
"clip_ratio": 0.0,
"completion_length": 5.4,
"epoch": 0.2808080808080808,
"grad_norm": 3.4819886684417725,
"kl": 0.12022647857666016,
"learning_rate": 2e-07,
"loss": 0.02661624550819397,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000000298023224,
"reward_std": 0.1981794685125351,
"rewards/MultiModalAccuracyORM": 0.20000000298023224,
"step": 695,
"train_speed(iter/s)": 0.041875
},
{
"clip_ratio": 0.0,
"completion_length": 8.45,
"epoch": 0.2828282828282828,
"grad_norm": 9.789923667907715,
"kl": 0.06219940185546875,
"learning_rate": 2e-07,
"loss": -0.0169070765376091,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.31451369225978854,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 700,
"train_speed(iter/s)": 0.041904
},
{
"clip_ratio": 0.0,
"completion_length": 26.7,
"epoch": 0.28484848484848485,
"grad_norm": 4.8883514404296875,
"kl": 0.0865386962890625,
"learning_rate": 2e-07,
"loss": -0.01697884649038315,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667312383654,
"reward_std": 0.37380772531032563,
"rewards/MultiModalAccuracyORM": 0.41666667312383654,
"step": 705,
"train_speed(iter/s)": 0.041918
},
{
"clip_ratio": 0.0,
"completion_length": 6.65,
"epoch": 0.2868686868686869,
"grad_norm": 0.24715355038642883,
"kl": 0.1329193115234375,
"learning_rate": 2e-07,
"loss": 0.030154657363891602,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666746139526,
"reward_std": 0.1888910174369812,
"rewards/MultiModalAccuracyORM": 0.2916666746139526,
"step": 710,
"train_speed(iter/s)": 0.041945
},
{
"clip_ratio": 0.0,
"completion_length": 15.05,
"epoch": 0.28888888888888886,
"grad_norm": 20.6412296295166,
"kl": 0.0775299072265625,
"learning_rate": 2e-07,
"loss": 0.010814273357391357,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3666666731238365,
"reward_std": 0.23236408829689026,
"rewards/MultiModalAccuracyORM": 0.3666666731238365,
"step": 715,
"train_speed(iter/s)": 0.041844
},
{
"clip_ratio": 0.0,
"completion_length": 67.7,
"epoch": 0.2909090909090909,
"grad_norm": 19.74690055847168,
"kl": 0.0287322998046875,
"learning_rate": 2e-07,
"loss": 0.011786083877086639,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666753590107,
"reward_std": 0.3597048044204712,
"rewards/MultiModalAccuracyORM": 0.3916666753590107,
"step": 720,
"train_speed(iter/s)": 0.041856
},
{
"clip_ratio": 0.0,
"completion_length": 7.95,
"epoch": 0.29292929292929293,
"grad_norm": 12.01062297821045,
"kl": 0.0283416748046875,
"learning_rate": 2e-07,
"loss": 0.030677640438079835,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333447575569,
"reward_std": 0.37494559586048126,
"rewards/MultiModalAccuracyORM": 0.3083333447575569,
"step": 725,
"train_speed(iter/s)": 0.041837
},
{
"clip_ratio": 0.0,
"completion_length": 7.1,
"epoch": 0.29494949494949496,
"grad_norm": 18.26583480834961,
"kl": 0.048813819885253906,
"learning_rate": 2e-07,
"loss": 0.00018847386818379163,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000298023225,
"reward_std": 0.2855865776538849,
"rewards/MultiModalAccuracyORM": 0.15000000298023225,
"step": 730,
"train_speed(iter/s)": 0.041864
},
{
"clip_ratio": 0.0,
"completion_length": 8.2,
"epoch": 0.296969696969697,
"grad_norm": 23.585920333862305,
"kl": 0.10856704711914063,
"learning_rate": 2e-07,
"loss": -0.010623668134212495,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2833333402872086,
"reward_std": 0.2486636757850647,
"rewards/MultiModalAccuracyORM": 0.2833333402872086,
"step": 735,
"train_speed(iter/s)": 0.041867
},
{
"clip_ratio": 0.0,
"completion_length": 36.85,
"epoch": 0.298989898989899,
"grad_norm": 13.779229164123535,
"kl": 0.16164474487304686,
"learning_rate": 2e-07,
"loss": 0.09003554582595825,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3500000096857548,
"reward_std": 0.3144781023263931,
"rewards/MultiModalAccuracyORM": 0.3500000096857548,
"step": 740,
"train_speed(iter/s)": 0.041866
},
{
"clip_ratio": 0.0,
"completion_length": 8.1,
"epoch": 0.301010101010101,
"grad_norm": 5.112743377685547,
"kl": 0.06104888916015625,
"learning_rate": 2e-07,
"loss": 0.006612183898687363,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.13333333730697633,
"reward_std": 0.24261614382267,
"rewards/MultiModalAccuracyORM": 0.13333333730697633,
"step": 745,
"train_speed(iter/s)": 0.041874
},
{
"epoch": 0.30303030303030304,
"grad_norm": 3.3870651721954346,
"learning_rate": 2e-07,
"loss": 0.007025846093893051,
"memory(GiB)": 104.49,
"step": 750,
"train_speed(iter/s)": 0.041879
},
{
"epoch": 0.30303030303030304,
"eval_clip_ratio": 0.0,
"eval_completion_length": 26.371667375564574,
"eval_kl": 0.08423469543457031,
"eval_loss": 0.020288411527872086,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.3050000049173832,
"eval_reward_std": 0.28924588978290555,
"eval_rewards/MultiModalAccuracyORM": 0.3050000049173832,
"eval_runtime": 257.2173,
"eval_samples_per_second": 0.194,
"eval_steps_per_second": 0.019,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 25.575,
"epoch": 0.30505050505050507,
"grad_norm": 3.0410096645355225,
"kl": 0.09359779357910156,
"learning_rate": 2e-07,
"loss": 0.01778276413679123,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20416667126119137,
"reward_std": 0.21572377979755403,
"rewards/MultiModalAccuracyORM": 0.20416667126119137,
"step": 755,
"train_speed(iter/s)": 0.041122
},
{
"clip_ratio": 0.0,
"completion_length": 10.25,
"epoch": 0.30707070707070705,
"grad_norm": 13.25398063659668,
"kl": 0.0601959228515625,
"learning_rate": 2e-07,
"loss": -0.023943953216075897,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.11666666939854622,
"reward_std": 0.24010565578937532,
"rewards/MultiModalAccuracyORM": 0.11666666939854622,
"step": 760,
"train_speed(iter/s)": 0.041142
},
{
"clip_ratio": 0.0,
"completion_length": 5.05,
"epoch": 0.3090909090909091,
"grad_norm": 0.06504862755537033,
"kl": 0.0304901123046875,
"learning_rate": 2e-07,
"loss": -0.007498346269130707,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333387970924,
"reward_std": 0.3021644026041031,
"rewards/MultiModalAccuracyORM": 0.3833333387970924,
"step": 765,
"train_speed(iter/s)": 0.041185
},
{
"clip_ratio": 0.0,
"completion_length": 50.95,
"epoch": 0.3111111111111111,
"grad_norm": 18.189159393310547,
"kl": 0.0461090087890625,
"learning_rate": 2e-07,
"loss": -0.0027750393375754355,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.29709570705890653,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 770,
"train_speed(iter/s)": 0.041169
},
{
"clip_ratio": 0.0,
"completion_length": 19.85,
"epoch": 0.31313131313131315,
"grad_norm": 0.3038291931152344,
"kl": 0.03930206298828125,
"learning_rate": 2e-07,
"loss": -0.0053185861557722095,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1666666731238365,
"reward_std": 0.2386302560567856,
"rewards/MultiModalAccuracyORM": 0.1666666731238365,
"step": 775,
"train_speed(iter/s)": 0.041176
},
{
"clip_ratio": 0.0,
"completion_length": 21.35,
"epoch": 0.3151515151515151,
"grad_norm": 10.563432693481445,
"kl": 0.02420806884765625,
"learning_rate": 2e-07,
"loss": -0.005909685418009758,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10833333432674408,
"reward_std": 0.2135300010442734,
"rewards/MultiModalAccuracyORM": 0.10833333432674408,
"step": 780,
"train_speed(iter/s)": 0.041208
},
{
"clip_ratio": 0.0,
"completion_length": 17.1,
"epoch": 0.31717171717171716,
"grad_norm": 5.078320503234863,
"kl": 0.026453018188476562,
"learning_rate": 2e-07,
"loss": 0.0009352466091513634,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666738688946,
"reward_std": 0.256683474779129,
"rewards/MultiModalAccuracyORM": 0.2666666738688946,
"step": 785,
"train_speed(iter/s)": 0.041229
},
{
"clip_ratio": 0.0,
"completion_length": 12.35,
"epoch": 0.3191919191919192,
"grad_norm": 10.143798828125,
"kl": 0.03321533203125,
"learning_rate": 2e-07,
"loss": 0.012424397468566894,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000037252903,
"reward_std": 0.26597192585468293,
"rewards/MultiModalAccuracyORM": 0.2750000037252903,
"step": 790,
"train_speed(iter/s)": 0.041249
},
{
"clip_ratio": 0.0,
"completion_length": 163.15,
"epoch": 0.3212121212121212,
"grad_norm": 0.5449197888374329,
"kl": 0.019189453125,
"learning_rate": 2e-07,
"loss": 0.030487871170043944,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333447575569,
"reward_std": 0.39207376539707184,
"rewards/MultiModalAccuracyORM": 0.2583333447575569,
"step": 795,
"train_speed(iter/s)": 0.041183
},
{
"clip_ratio": 0.0,
"completion_length": 22.25,
"epoch": 0.32323232323232326,
"grad_norm": 1.004371166229248,
"kl": 0.037581253051757815,
"learning_rate": 2e-07,
"loss": 0.0017656445503234862,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666693985462,
"reward_std": 0.25270916521549225,
"rewards/MultiModalAccuracyORM": 0.2666666693985462,
"step": 800,
"train_speed(iter/s)": 0.041199
},
{
"clip_ratio": 0.0,
"completion_length": 13.95,
"epoch": 0.32525252525252524,
"grad_norm": 19.628896713256836,
"kl": 0.053558349609375,
"learning_rate": 2e-07,
"loss": -0.021615955233573913,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1666666693985462,
"reward_std": 0.32094223201274874,
"rewards/MultiModalAccuracyORM": 0.1666666693985462,
"step": 805,
"train_speed(iter/s)": 0.041213
},
{
"clip_ratio": 0.0,
"completion_length": 12.7,
"epoch": 0.32727272727272727,
"grad_norm": 6.42383337020874,
"kl": 0.18563766479492189,
"learning_rate": 2e-07,
"loss": 0.033368897438049314,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.2744703501462936,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 810,
"train_speed(iter/s)": 0.041234
},
{
"clip_ratio": 0.0,
"completion_length": 12.8,
"epoch": 0.3292929292929293,
"grad_norm": 3.2321925163269043,
"kl": 0.08846683502197265,
"learning_rate": 2e-07,
"loss": 0.003480428457260132,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000298023225,
"reward_std": 0.2953897833824158,
"rewards/MultiModalAccuracyORM": 0.15000000298023225,
"step": 815,
"train_speed(iter/s)": 0.041242
},
{
"clip_ratio": 0.0,
"completion_length": 18.85,
"epoch": 0.33131313131313134,
"grad_norm": 5.854945659637451,
"kl": 0.011492156982421875,
"learning_rate": 2e-07,
"loss": -0.008568185567855834,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.3172461599111557,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 820,
"train_speed(iter/s)": 0.041263
},
{
"clip_ratio": 0.0,
"completion_length": 7.25,
"epoch": 0.3333333333333333,
"grad_norm": 17.020723342895508,
"kl": 0.029691314697265624,
"learning_rate": 2e-07,
"loss": -0.010567378997802735,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.34933354556560514,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 825,
"train_speed(iter/s)": 0.041286
},
{
"clip_ratio": 0.0,
"completion_length": 9.2,
"epoch": 0.33535353535353535,
"grad_norm": 12.575139999389648,
"kl": 0.0603668212890625,
"learning_rate": 2e-07,
"loss": -0.0004529397003352642,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333432674408,
"reward_std": 0.30291883945465087,
"rewards/MultiModalAccuracyORM": 0.3833333432674408,
"step": 830,
"train_speed(iter/s)": 0.041299
},
{
"clip_ratio": 0.0,
"completion_length": 7.7,
"epoch": 0.3373737373737374,
"grad_norm": 2.0305564403533936,
"kl": 0.08530197143554688,
"learning_rate": 2e-07,
"loss": -0.0174559086561203,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333380520344,
"reward_std": 0.36567819118499756,
"rewards/MultiModalAccuracyORM": 0.3083333380520344,
"step": 835,
"train_speed(iter/s)": 0.041331
},
{
"clip_ratio": 0.0,
"completion_length": 59.2,
"epoch": 0.3393939393939394,
"grad_norm": 6.523157119750977,
"kl": 0.098590087890625,
"learning_rate": 2e-07,
"loss": -0.014323845505714417,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.31820976436138154,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 840,
"train_speed(iter/s)": 0.04132
},
{
"clip_ratio": 0.0,
"completion_length": 123.2,
"epoch": 0.3414141414141414,
"grad_norm": 4.560072422027588,
"kl": 0.010162353515625,
"learning_rate": 2e-07,
"loss": 0.02465280294418335,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000447034835,
"reward_std": 0.25512446761131286,
"rewards/MultiModalAccuracyORM": 0.22500000447034835,
"step": 845,
"train_speed(iter/s)": 0.041291
},
{
"clip_ratio": 0.0,
"completion_length": 52.65,
"epoch": 0.3434343434343434,
"grad_norm": 0.2115914523601532,
"kl": 0.1222564697265625,
"learning_rate": 2e-07,
"loss": 0.01849503219127655,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1416666679084301,
"reward_std": 0.18255070447921753,
"rewards/MultiModalAccuracyORM": 0.1416666679084301,
"step": 850,
"train_speed(iter/s)": 0.041263
},
{
"clip_ratio": 0.0,
"completion_length": 17.6,
"epoch": 0.34545454545454546,
"grad_norm": 8.007162094116211,
"kl": 0.06471099853515624,
"learning_rate": 2e-07,
"loss": -0.027201026678085327,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667386889455,
"reward_std": 0.27853985130786896,
"rewards/MultiModalAccuracyORM": 0.41666667386889455,
"step": 855,
"train_speed(iter/s)": 0.041287
},
{
"clip_ratio": 0.0,
"completion_length": 14.55,
"epoch": 0.3474747474747475,
"grad_norm": 14.470208168029785,
"kl": 0.07525177001953125,
"learning_rate": 2e-07,
"loss": -0.01188465803861618,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333334028720857,
"reward_std": 0.3921093553304672,
"rewards/MultiModalAccuracyORM": 0.33333334028720857,
"step": 860,
"train_speed(iter/s)": 0.041297
},
{
"clip_ratio": 0.0,
"completion_length": 16.45,
"epoch": 0.34949494949494947,
"grad_norm": 11.233606338500977,
"kl": 0.10040740966796875,
"learning_rate": 2e-07,
"loss": 0.02309779226779938,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000037252903,
"reward_std": 0.23634997606277466,
"rewards/MultiModalAccuracyORM": 0.3000000037252903,
"step": 865,
"train_speed(iter/s)": 0.041313
},
{
"clip_ratio": 0.0,
"completion_length": 91.9,
"epoch": 0.3515151515151515,
"grad_norm": 22.588499069213867,
"kl": 0.11612701416015625,
"learning_rate": 2e-07,
"loss": 0.020076577365398408,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000298023225,
"reward_std": 0.26677650213241577,
"rewards/MultiModalAccuracyORM": 0.15000000298023225,
"step": 870,
"train_speed(iter/s)": 0.041279
},
{
"clip_ratio": 0.0,
"completion_length": 15.75,
"epoch": 0.35353535353535354,
"grad_norm": 8.226666450500488,
"kl": 0.05343475341796875,
"learning_rate": 2e-07,
"loss": -0.014575448632240296,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3333333395421505,
"reward_std": 0.37199449837207793,
"rewards/MultiModalAccuracyORM": 0.3333333395421505,
"step": 875,
"train_speed(iter/s)": 0.041283
},
{
"clip_ratio": 0.0,
"completion_length": 81.1,
"epoch": 0.35555555555555557,
"grad_norm": 0.5393237471580505,
"kl": 0.0892120361328125,
"learning_rate": 2e-07,
"loss": 0.006313225626945496,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2833333365619183,
"reward_std": 0.20661829113960267,
"rewards/MultiModalAccuracyORM": 0.2833333365619183,
"step": 880,
"train_speed(iter/s)": 0.041271
},
{
"clip_ratio": 0.0,
"completion_length": 26.1,
"epoch": 0.3575757575757576,
"grad_norm": 0.05410289764404297,
"kl": 0.015875244140625,
"learning_rate": 2e-07,
"loss": 0.0006564079783856868,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333333432674407,
"reward_std": 0.14888326525688172,
"rewards/MultiModalAccuracyORM": 0.28333333432674407,
"step": 885,
"train_speed(iter/s)": 0.04124
},
{
"clip_ratio": 0.0,
"completion_length": 51.8,
"epoch": 0.3595959595959596,
"grad_norm": 16.54722785949707,
"kl": 0.0899993896484375,
"learning_rate": 2e-07,
"loss": 0.010663460195064544,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333410322666,
"reward_std": 0.2636228919029236,
"rewards/MultiModalAccuracyORM": 0.3083333410322666,
"step": 890,
"train_speed(iter/s)": 0.041246
},
{
"clip_ratio": 0.0,
"completion_length": 19.9,
"epoch": 0.3616161616161616,
"grad_norm": 10.844444274902344,
"kl": 0.194525146484375,
"learning_rate": 2e-07,
"loss": -0.04198589324951172,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4833333432674408,
"reward_std": 0.36594696044921876,
"rewards/MultiModalAccuracyORM": 0.4833333432674408,
"step": 895,
"train_speed(iter/s)": 0.04126
},
{
"clip_ratio": 0.0,
"completion_length": 30.25,
"epoch": 0.36363636363636365,
"grad_norm": 5.428062915802002,
"kl": 0.0639495849609375,
"learning_rate": 2e-07,
"loss": -0.029673090577125548,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667237877846,
"reward_std": 0.2916341096162796,
"rewards/MultiModalAccuracyORM": 0.21666667237877846,
"step": 900,
"train_speed(iter/s)": 0.041259
},
{
"clip_ratio": 0.0,
"completion_length": 6.9,
"epoch": 0.3656565656565657,
"grad_norm": 0.12221446633338928,
"kl": 0.07857627868652343,
"learning_rate": 2e-07,
"loss": -0.016133570671081544,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000447034837,
"reward_std": 0.2135299950838089,
"rewards/MultiModalAccuracyORM": 0.37500000447034837,
"step": 905,
"train_speed(iter/s)": 0.041282
},
{
"clip_ratio": 0.0,
"completion_length": 4.65,
"epoch": 0.36767676767676766,
"grad_norm": 28.893342971801758,
"kl": 0.09071540832519531,
"learning_rate": 2e-07,
"loss": 0.006183768063783646,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3583333358168602,
"reward_std": 0.23309451341629028,
"rewards/MultiModalAccuracyORM": 0.3583333358168602,
"step": 910,
"train_speed(iter/s)": 0.041302
},
{
"clip_ratio": 0.0,
"completion_length": 8.4,
"epoch": 0.3696969696969697,
"grad_norm": 0.04850845783948898,
"kl": 0.03702239990234375,
"learning_rate": 2e-07,
"loss": 0.031198829412460327,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333380520344,
"reward_std": 0.2325587123632431,
"rewards/MultiModalAccuracyORM": 0.3083333380520344,
"step": 915,
"train_speed(iter/s)": 0.041316
},
{
"clip_ratio": 0.0,
"completion_length": 88.5,
"epoch": 0.3717171717171717,
"grad_norm": 6.006438732147217,
"kl": 0.09772415161132812,
"learning_rate": 2e-07,
"loss": 0.02726798951625824,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333417773247,
"reward_std": 0.3043610692024231,
"rewards/MultiModalAccuracyORM": 0.3083333417773247,
"step": 920,
"train_speed(iter/s)": 0.041277
},
{
"clip_ratio": 0.0,
"completion_length": 35.3,
"epoch": 0.37373737373737376,
"grad_norm": 1.342499852180481,
"kl": 0.011273193359375,
"learning_rate": 2e-07,
"loss": 0.00134199857711792,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.15000000074505807,
"reward_std": 0.13182924091815948,
"rewards/MultiModalAccuracyORM": 0.15000000074505807,
"step": 925,
"train_speed(iter/s)": 0.041269
},
{
"clip_ratio": 0.0,
"completion_length": 12.45,
"epoch": 0.37575757575757573,
"grad_norm": 3.2022011280059814,
"kl": 0.165765380859375,
"learning_rate": 2e-07,
"loss": -0.004855489730834961,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333879709245,
"reward_std": 0.33156771659851075,
"rewards/MultiModalAccuracyORM": 0.35833333879709245,
"step": 930,
"train_speed(iter/s)": 0.041287
},
{
"clip_ratio": 0.0,
"completion_length": 50.05,
"epoch": 0.37777777777777777,
"grad_norm": 0.07847103476524353,
"kl": 0.05077667236328125,
"learning_rate": 2e-07,
"loss": -0.023166632652282713,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4166666716337204,
"reward_std": 0.32526595890522003,
"rewards/MultiModalAccuracyORM": 0.4166666716337204,
"step": 935,
"train_speed(iter/s)": 0.041275
},
{
"clip_ratio": 0.0,
"completion_length": 24.15,
"epoch": 0.3797979797979798,
"grad_norm": 18.610437393188477,
"kl": 0.03169517517089844,
"learning_rate": 2e-07,
"loss": 0.024595724046230318,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.18333334028720855,
"reward_std": 0.3267322063446045,
"rewards/MultiModalAccuracyORM": 0.18333334028720855,
"step": 940,
"train_speed(iter/s)": 0.041277
},
{
"clip_ratio": 0.0,
"completion_length": 13.45,
"epoch": 0.38181818181818183,
"grad_norm": 5.941343784332275,
"kl": 0.0661346435546875,
"learning_rate": 2e-07,
"loss": 0.024455997347831725,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333387970924,
"reward_std": 0.21973656117916107,
"rewards/MultiModalAccuracyORM": 0.2583333387970924,
"step": 945,
"train_speed(iter/s)": 0.04129
},
{
"clip_ratio": 0.0,
"completion_length": 37.05,
"epoch": 0.3838383838383838,
"grad_norm": 24.896520614624023,
"kl": 0.150213623046875,
"learning_rate": 2e-07,
"loss": 0.017214223742485046,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667312383654,
"reward_std": 0.33557761609554293,
"rewards/MultiModalAccuracyORM": 0.41666667312383654,
"step": 950,
"train_speed(iter/s)": 0.041288
},
{
"clip_ratio": 0.0,
"completion_length": 15.5,
"epoch": 0.38585858585858585,
"grad_norm": 8.904081344604492,
"kl": 0.12316970825195313,
"learning_rate": 2e-07,
"loss": -0.0002661585807800293,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000022351742,
"reward_std": 0.21999078691005708,
"rewards/MultiModalAccuracyORM": 0.3000000022351742,
"step": 955,
"train_speed(iter/s)": 0.041304
},
{
"clip_ratio": 0.0,
"completion_length": 31.05,
"epoch": 0.3878787878787879,
"grad_norm": 0.30000391602516174,
"kl": 0.193408203125,
"learning_rate": 2e-07,
"loss": -0.016391244530677796,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5916666716337204,
"reward_std": 0.15219832956790924,
"rewards/MultiModalAccuracyORM": 0.5916666716337204,
"step": 960,
"train_speed(iter/s)": 0.041312
},
{
"clip_ratio": 0.0,
"completion_length": 9.6,
"epoch": 0.3898989898989899,
"grad_norm": 1.9883811473846436,
"kl": 0.046465301513671876,
"learning_rate": 2e-07,
"loss": -0.0011612892150878907,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667014360427,
"reward_std": 0.23930107951164245,
"rewards/MultiModalAccuracyORM": 0.24166667014360427,
"step": 965,
"train_speed(iter/s)": 0.041313
},
{
"clip_ratio": 0.0,
"completion_length": 25.4,
"epoch": 0.39191919191919194,
"grad_norm": 17.314956665039062,
"kl": 0.10909576416015625,
"learning_rate": 2e-07,
"loss": 0.003603992611169815,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.26666666865348815,
"reward_std": 0.26976318955421447,
"rewards/MultiModalAccuracyORM": 0.26666666865348815,
"step": 970,
"train_speed(iter/s)": 0.04131
},
{
"clip_ratio": 0.0,
"completion_length": 26.45,
"epoch": 0.3939393939393939,
"grad_norm": 2.700242042541504,
"kl": 0.0446197509765625,
"learning_rate": 2e-07,
"loss": -0.024584516882896423,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.09166666865348816,
"reward_std": 0.23854664266109465,
"rewards/MultiModalAccuracyORM": 0.09166666865348816,
"step": 975,
"train_speed(iter/s)": 0.041305
},
{
"clip_ratio": 0.0,
"completion_length": 19.85,
"epoch": 0.39595959595959596,
"grad_norm": 1.759245753288269,
"kl": 0.0932861328125,
"learning_rate": 2e-07,
"loss": 0.03299914002418518,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333334028720856,
"reward_std": 0.22785155177116395,
"rewards/MultiModalAccuracyORM": 0.23333334028720856,
"step": 980,
"train_speed(iter/s)": 0.041315
},
{
"clip_ratio": 0.0,
"completion_length": 32.7,
"epoch": 0.397979797979798,
"grad_norm": 12.485607147216797,
"kl": 0.061135292053222656,
"learning_rate": 2e-07,
"loss": 0.022333118319511413,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.26666667088866236,
"reward_std": 0.27903059422969817,
"rewards/MultiModalAccuracyORM": 0.26666667088866236,
"step": 985,
"train_speed(iter/s)": 0.041318
},
{
"clip_ratio": 0.0,
"completion_length": 48.65,
"epoch": 0.4,
"grad_norm": 4.170945644378662,
"kl": 0.0902923583984375,
"learning_rate": 2e-07,
"loss": -0.00014310678234323858,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.2511145681142807,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 990,
"train_speed(iter/s)": 0.041309
},
{
"clip_ratio": 0.0,
"completion_length": 8.25,
"epoch": 0.402020202020202,
"grad_norm": 2.5125696659088135,
"kl": 0.12824859619140624,
"learning_rate": 2e-07,
"loss": 0.0361581027507782,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5250000052154065,
"reward_std": 0.2526735752820969,
"rewards/MultiModalAccuracyORM": 0.5250000052154065,
"step": 995,
"train_speed(iter/s)": 0.041331
},
{
"epoch": 0.40404040404040403,
"grad_norm": 24.84500503540039,
"learning_rate": 2e-07,
"loss": -0.03532302379608154,
"memory(GiB)": 104.49,
"step": 1000,
"train_speed(iter/s)": 0.041234
},
{
"epoch": 0.40404040404040403,
"eval_clip_ratio": 0.0,
"eval_completion_length": 40.71333456993103,
"eval_kl": 0.09849456787109374,
"eval_loss": 0.019675862044095993,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.36166667461395263,
"eval_reward_std": 0.2775319296121597,
"eval_rewards/MultiModalAccuracyORM": 0.36166667461395263,
"eval_runtime": 294.4392,
"eval_samples_per_second": 0.17,
"eval_steps_per_second": 0.017,
"step": 1000
},
{
"clip_ratio": 0.0,
"completion_length": 63.525,
"epoch": 0.40606060606060607,
"grad_norm": 2.5043818950653076,
"kl": 0.041501617431640624,
"learning_rate": 2e-07,
"loss": -0.008308599889278411,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.21963488459587097,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 1005,
"train_speed(iter/s)": 0.040624
},
{
"clip_ratio": 0.0,
"completion_length": 6.05,
"epoch": 0.4080808080808081,
"grad_norm": 19.067171096801758,
"kl": 0.07535552978515625,
"learning_rate": 2e-07,
"loss": 0.017892301082611084,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333334028720857,
"reward_std": 0.22005038857460021,
"rewards/MultiModalAccuracyORM": 0.33333334028720857,
"step": 1010,
"train_speed(iter/s)": 0.040645
},
{
"clip_ratio": 0.0,
"completion_length": 70.6,
"epoch": 0.4101010101010101,
"grad_norm": 2.5989065170288086,
"kl": 0.0229217529296875,
"learning_rate": 2e-07,
"loss": 0.040188026428222653,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.25591449439525604,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 1015,
"train_speed(iter/s)": 0.040633
},
{
"clip_ratio": 0.0,
"completion_length": 58.65,
"epoch": 0.4121212121212121,
"grad_norm": 11.748002052307129,
"kl": 0.06688776016235351,
"learning_rate": 2e-07,
"loss": -0.0008021335117518902,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.11666666939854622,
"reward_std": 0.17150862216949464,
"rewards/MultiModalAccuracyORM": 0.11666666939854622,
"step": 1020,
"train_speed(iter/s)": 0.040631
},
{
"clip_ratio": 0.0,
"completion_length": 25.15,
"epoch": 0.41414141414141414,
"grad_norm": 0.12045960873365402,
"kl": 0.03296966552734375,
"learning_rate": 2e-07,
"loss": -0.010370378196239472,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.11666667088866234,
"reward_std": 0.17081378698348998,
"rewards/MultiModalAccuracyORM": 0.11666667088866234,
"step": 1025,
"train_speed(iter/s)": 0.040649
},
{
"clip_ratio": 0.0,
"completion_length": 47.85,
"epoch": 0.4161616161616162,
"grad_norm": 1.6403871774673462,
"kl": 0.06666259765625,
"learning_rate": 2e-07,
"loss": 0.00585133358836174,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000074505806,
"reward_std": 0.17705594301223754,
"rewards/MultiModalAccuracyORM": 0.17500000074505806,
"step": 1030,
"train_speed(iter/s)": 0.040624
},
{
"clip_ratio": 0.0,
"completion_length": 34.3,
"epoch": 0.41818181818181815,
"grad_norm": 0.014441369101405144,
"kl": 0.07417640686035157,
"learning_rate": 2e-07,
"loss": -0.010604190826416015,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000298023223,
"reward_std": 0.2003761351108551,
"rewards/MultiModalAccuracyORM": 0.22500000298023223,
"step": 1035,
"train_speed(iter/s)": 0.040636
},
{
"clip_ratio": 0.0,
"completion_length": 47.6,
"epoch": 0.4202020202020202,
"grad_norm": 6.607668399810791,
"kl": 0.1425227165222168,
"learning_rate": 2e-07,
"loss": 0.02794753313064575,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3583333395421505,
"reward_std": 0.2567190647125244,
"rewards/MultiModalAccuracyORM": 0.3583333395421505,
"step": 1040,
"train_speed(iter/s)": 0.040638
},
{
"clip_ratio": 0.0,
"completion_length": 6.6,
"epoch": 0.4222222222222222,
"grad_norm": 0.8122760057449341,
"kl": 0.1528533935546875,
"learning_rate": 2e-07,
"loss": 0.019382116198539735,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000670552254,
"reward_std": 0.2034369796514511,
"rewards/MultiModalAccuracyORM": 0.17500000670552254,
"step": 1045,
"train_speed(iter/s)": 0.040661
},
{
"clip_ratio": 0.0,
"completion_length": 29.85,
"epoch": 0.42424242424242425,
"grad_norm": 0.18659576773643494,
"kl": 0.010857391357421874,
"learning_rate": 2e-07,
"loss": 0.015965181589126586,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.1660114347934723,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 1050,
"train_speed(iter/s)": 0.040661
},
{
"clip_ratio": 0.0,
"completion_length": 21.25,
"epoch": 0.4262626262626263,
"grad_norm": 0.4390380382537842,
"kl": 0.07591552734375,
"learning_rate": 2e-07,
"loss": 0.011004485189914703,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4250000111758709,
"reward_std": 0.24862808585166932,
"rewards/MultiModalAccuracyORM": 0.4250000111758709,
"step": 1055,
"train_speed(iter/s)": 0.040671
},
{
"clip_ratio": 0.0,
"completion_length": 52.95,
"epoch": 0.42828282828282827,
"grad_norm": 0.3618135452270508,
"kl": 0.109490966796875,
"learning_rate": 2e-07,
"loss": 0.011407237499952316,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3333333387970924,
"reward_std": 0.14589657187461852,
"rewards/MultiModalAccuracyORM": 0.3333333387970924,
"step": 1060,
"train_speed(iter/s)": 0.04069
},
{
"clip_ratio": 0.0,
"completion_length": 47.1,
"epoch": 0.4303030303030303,
"grad_norm": 13.074536323547363,
"kl": 0.18959503173828124,
"learning_rate": 2e-07,
"loss": 0.04986717700958252,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000029802322,
"reward_std": 0.14589657187461852,
"rewards/MultiModalAccuracyORM": 0.3000000029802322,
"step": 1065,
"train_speed(iter/s)": 0.040694
},
{
"clip_ratio": 0.0,
"completion_length": 28.9,
"epoch": 0.43232323232323233,
"grad_norm": 6.16197395324707,
"kl": 0.15793075561523437,
"learning_rate": 2e-07,
"loss": 0.06019207835197449,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333334177732467,
"reward_std": 0.2669951319694519,
"rewards/MultiModalAccuracyORM": 0.28333334177732467,
"step": 1070,
"train_speed(iter/s)": 0.040701
},
{
"clip_ratio": 0.0,
"completion_length": 27.05,
"epoch": 0.43434343434343436,
"grad_norm": 25.265649795532227,
"kl": 0.08460769653320313,
"learning_rate": 2e-07,
"loss": -0.04109536409378052,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3750000037252903,
"reward_std": 0.2325587123632431,
"rewards/MultiModalAccuracyORM": 0.3750000037252903,
"step": 1075,
"train_speed(iter/s)": 0.040703
},
{
"clip_ratio": 0.0,
"completion_length": 33.0,
"epoch": 0.43636363636363634,
"grad_norm": 2.5213825702667236,
"kl": 0.089501953125,
"learning_rate": 2e-07,
"loss": 0.011518492549657821,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3333333387970924,
"reward_std": 0.35792474150657655,
"rewards/MultiModalAccuracyORM": 0.3333333387970924,
"step": 1080,
"train_speed(iter/s)": 0.040705
},
{
"clip_ratio": 0.0,
"completion_length": 65.85,
"epoch": 0.4383838383838384,
"grad_norm": 2.2053442001342773,
"kl": 0.014685440063476562,
"learning_rate": 2e-07,
"loss": -0.03693766593933105,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666701436043,
"reward_std": 0.2370448112487793,
"rewards/MultiModalAccuracyORM": 0.2666666701436043,
"step": 1085,
"train_speed(iter/s)": 0.040693
},
{
"clip_ratio": 0.0,
"completion_length": 10.2,
"epoch": 0.4404040404040404,
"grad_norm": 12.156472206115723,
"kl": 0.17877197265625,
"learning_rate": 2e-07,
"loss": 0.032665693759918214,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666746139526,
"reward_std": 0.27148365676403047,
"rewards/MultiModalAccuracyORM": 0.3416666746139526,
"step": 1090,
"train_speed(iter/s)": 0.040713
},
{
"clip_ratio": 0.0,
"completion_length": 8.85,
"epoch": 0.44242424242424244,
"grad_norm": 1.4023343324661255,
"kl": 0.098193359375,
"learning_rate": 2e-07,
"loss": -0.007838453352451324,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000022351742,
"reward_std": 0.172567418217659,
"rewards/MultiModalAccuracyORM": 0.4000000022351742,
"step": 1095,
"train_speed(iter/s)": 0.040737
},
{
"clip_ratio": 0.0,
"completion_length": 67.6,
"epoch": 0.4444444444444444,
"grad_norm": 10.351971626281738,
"kl": 0.02147979736328125,
"learning_rate": 2e-07,
"loss": 0.03331095576286316,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666775941849,
"reward_std": 0.3504018098115921,
"rewards/MultiModalAccuracyORM": 0.3166666775941849,
"step": 1100,
"train_speed(iter/s)": 0.04073
},
{
"clip_ratio": 0.0,
"completion_length": 12.95,
"epoch": 0.44646464646464645,
"grad_norm": 13.833907127380371,
"kl": 0.019232177734375,
"learning_rate": 2e-07,
"loss": -0.005460131168365479,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000089406967,
"reward_std": 0.2667409062385559,
"rewards/MultiModalAccuracyORM": 0.3250000089406967,
"step": 1105,
"train_speed(iter/s)": 0.040741
},
{
"clip_ratio": 0.0,
"completion_length": 26.65,
"epoch": 0.4484848484848485,
"grad_norm": 2.0316038131713867,
"kl": 0.018201828002929688,
"learning_rate": 2e-07,
"loss": -0.0024514278396964074,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333879709245,
"reward_std": 0.2793444275856018,
"rewards/MultiModalAccuracyORM": 0.35833333879709245,
"step": 1110,
"train_speed(iter/s)": 0.04076
},
{
"clip_ratio": 0.0,
"completion_length": 7.0,
"epoch": 0.4505050505050505,
"grad_norm": 15.886459350585938,
"kl": 0.21325912475585937,
"learning_rate": 2e-07,
"loss": 0.0038191914558410645,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166666865348815,
"reward_std": 0.2245364874601364,
"rewards/MultiModalAccuracyORM": 0.24166666865348815,
"step": 1115,
"train_speed(iter/s)": 0.040791
},
{
"clip_ratio": 0.0,
"completion_length": 12.4,
"epoch": 0.45252525252525255,
"grad_norm": 0.03295298293232918,
"kl": 0.1110443115234375,
"learning_rate": 2e-07,
"loss": 0.013870391249656677,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667088866234,
"reward_std": 0.27402731478214265,
"rewards/MultiModalAccuracyORM": 0.21666667088866234,
"step": 1120,
"train_speed(iter/s)": 0.040796
},
{
"clip_ratio": 0.0,
"completion_length": 27.9,
"epoch": 0.45454545454545453,
"grad_norm": 2.8173696994781494,
"kl": 0.0269622802734375,
"learning_rate": 2e-07,
"loss": 0.03692147135734558,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333387970924,
"reward_std": 0.2159452974796295,
"rewards/MultiModalAccuracyORM": 0.3833333387970924,
"step": 1125,
"train_speed(iter/s)": 0.04082
},
{
"clip_ratio": 0.0,
"completion_length": 16.45,
"epoch": 0.45656565656565656,
"grad_norm": 0.10465247184038162,
"kl": 0.04431991577148438,
"learning_rate": 2e-07,
"loss": 0.003530232235789299,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666701436043,
"reward_std": 0.2323044866323471,
"rewards/MultiModalAccuracyORM": 0.3166666701436043,
"step": 1130,
"train_speed(iter/s)": 0.040817
},
{
"clip_ratio": 0.0,
"completion_length": 7.6,
"epoch": 0.4585858585858586,
"grad_norm": 0.32010194659233093,
"kl": 0.094537353515625,
"learning_rate": 2e-07,
"loss": 0.012909208238124848,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.0916666716337204,
"reward_std": 0.1293427586555481,
"rewards/MultiModalAccuracyORM": 0.0916666716337204,
"step": 1135,
"train_speed(iter/s)": 0.040832
},
{
"clip_ratio": 0.0,
"completion_length": 67.8,
"epoch": 0.46060606060606063,
"grad_norm": 15.148902893066406,
"kl": 0.07255020141601562,
"learning_rate": 2e-07,
"loss": 0.016760605573654174,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000000149011612,
"reward_std": 0.2260383188724518,
"rewards/MultiModalAccuracyORM": 0.20000000149011612,
"step": 1140,
"train_speed(iter/s)": 0.040831
},
{
"clip_ratio": 0.0,
"completion_length": 54.75,
"epoch": 0.4626262626262626,
"grad_norm": 4.259115219116211,
"kl": 0.012025833129882812,
"learning_rate": 2e-07,
"loss": -0.004991362616419792,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000298023223,
"reward_std": 0.2003761351108551,
"rewards/MultiModalAccuracyORM": 0.12500000298023223,
"step": 1145,
"train_speed(iter/s)": 0.040832
},
{
"clip_ratio": 0.0,
"completion_length": 8.7,
"epoch": 0.46464646464646464,
"grad_norm": 4.517999649047852,
"kl": 0.0364471435546875,
"learning_rate": 2e-07,
"loss": 0.0014625540003180503,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000149011614,
"reward_std": 0.18561154305934907,
"rewards/MultiModalAccuracyORM": 0.37500000149011614,
"step": 1150,
"train_speed(iter/s)": 0.040853
},
{
"clip_ratio": 0.0,
"completion_length": 10.1,
"epoch": 0.4666666666666667,
"grad_norm": 9.037857055664062,
"kl": 0.066754150390625,
"learning_rate": 2e-07,
"loss": 0.023162148892879486,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000029802322,
"reward_std": 0.14589657187461852,
"rewards/MultiModalAccuracyORM": 0.3000000029802322,
"step": 1155,
"train_speed(iter/s)": 0.040895
},
{
"clip_ratio": 0.0,
"completion_length": 5.7,
"epoch": 0.4686868686868687,
"grad_norm": 0.35684671998023987,
"kl": 0.1403411865234375,
"learning_rate": 2e-07,
"loss": 0.011607617139816284,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.47500001043081286,
"reward_std": 0.19340355396270753,
"rewards/MultiModalAccuracyORM": 0.47500001043081286,
"step": 1160,
"train_speed(iter/s)": 0.040919
},
{
"clip_ratio": 0.0,
"completion_length": 12.4,
"epoch": 0.4707070707070707,
"grad_norm": 0.18109376728534698,
"kl": 0.0370758056640625,
"learning_rate": 2e-07,
"loss": -0.0030417680740356446,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.1848811239004135,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 1165,
"train_speed(iter/s)": 0.040938
},
{
"clip_ratio": 0.0,
"completion_length": 8.8,
"epoch": 0.4727272727272727,
"grad_norm": 17.05179786682129,
"kl": 0.027799224853515624,
"learning_rate": 2e-07,
"loss": -0.01608174741268158,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333656191824,
"reward_std": 0.25566026866436004,
"rewards/MultiModalAccuracyORM": 0.35833333656191824,
"step": 1170,
"train_speed(iter/s)": 0.040961
},
{
"clip_ratio": 0.0,
"completion_length": 35.55,
"epoch": 0.47474747474747475,
"grad_norm": 2.053295850753784,
"kl": 0.0653228759765625,
"learning_rate": 2e-07,
"loss": 0.0025410931557416916,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000149011613,
"reward_std": 0.18780820965766906,
"rewards/MultiModalAccuracyORM": 0.30000000149011613,
"step": 1175,
"train_speed(iter/s)": 0.040966
},
{
"clip_ratio": 0.0,
"completion_length": 3.65,
"epoch": 0.4767676767676768,
"grad_norm": 12.327520370483398,
"kl": 0.1503997802734375,
"learning_rate": 2e-07,
"loss": 0.00606456995010376,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.47500000298023226,
"reward_std": 0.16696292161941528,
"rewards/MultiModalAccuracyORM": 0.47500000298023226,
"step": 1180,
"train_speed(iter/s)": 0.040998
},
{
"clip_ratio": 0.0,
"completion_length": 9.1,
"epoch": 0.47878787878787876,
"grad_norm": 0.1990954726934433,
"kl": 0.26718597412109374,
"learning_rate": 2e-07,
"loss": 0.011653450131416321,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.38333333656191826,
"reward_std": 0.27402731478214265,
"rewards/MultiModalAccuracyORM": 0.38333333656191826,
"step": 1185,
"train_speed(iter/s)": 0.041009
},
{
"clip_ratio": 0.0,
"completion_length": 7.5,
"epoch": 0.4808080808080808,
"grad_norm": 5.806619644165039,
"kl": 0.059732818603515626,
"learning_rate": 2e-07,
"loss": -0.013705405592918395,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.09166667088866234,
"reward_std": 0.12558708488941192,
"rewards/MultiModalAccuracyORM": 0.09166667088866234,
"step": 1190,
"train_speed(iter/s)": 0.041032
},
{
"clip_ratio": 0.0,
"completion_length": 8.2,
"epoch": 0.48282828282828283,
"grad_norm": 12.781750679016113,
"kl": 0.04134521484375,
"learning_rate": 2e-07,
"loss": -0.008668276667594909,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4416666693985462,
"reward_std": 0.2597057580947876,
"rewards/MultiModalAccuracyORM": 0.4416666693985462,
"step": 1195,
"train_speed(iter/s)": 0.041043
},
{
"clip_ratio": 0.0,
"completion_length": 6.45,
"epoch": 0.48484848484848486,
"grad_norm": 3.4121592044830322,
"kl": 0.073028564453125,
"learning_rate": 2e-07,
"loss": -0.0033960781991481783,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2833333447575569,
"reward_std": 0.38452682793140414,
"rewards/MultiModalAccuracyORM": 0.2833333447575569,
"step": 1200,
"train_speed(iter/s)": 0.041068
},
{
"clip_ratio": 0.0,
"completion_length": 19.65,
"epoch": 0.4868686868686869,
"grad_norm": 2.179175615310669,
"kl": 0.1186309814453125,
"learning_rate": 2e-07,
"loss": 0.0020799320191144943,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5500000081956387,
"reward_std": 0.383985635638237,
"rewards/MultiModalAccuracyORM": 0.5500000081956387,
"step": 1205,
"train_speed(iter/s)": 0.041076
},
{
"clip_ratio": 0.0,
"completion_length": 18.0,
"epoch": 0.4888888888888889,
"grad_norm": 16.699316024780273,
"kl": 0.19964828491210937,
"learning_rate": 2e-07,
"loss": 0.07210339307785034,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667014360426,
"reward_std": 0.27151924669742583,
"rewards/MultiModalAccuracyORM": 0.41666667014360426,
"step": 1210,
"train_speed(iter/s)": 0.041084
},
{
"clip_ratio": 0.0,
"completion_length": 12.2,
"epoch": 0.4909090909090909,
"grad_norm": 11.2245512008667,
"kl": 0.02044839859008789,
"learning_rate": 2e-07,
"loss": 0.0006846427917480469,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666738688946,
"reward_std": 0.22074522376060485,
"rewards/MultiModalAccuracyORM": 0.3166666738688946,
"step": 1215,
"train_speed(iter/s)": 0.041094
},
{
"clip_ratio": 0.0,
"completion_length": 10.95,
"epoch": 0.49292929292929294,
"grad_norm": 23.733837127685547,
"kl": 0.0533355712890625,
"learning_rate": 2e-07,
"loss": -0.03312296569347382,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666666939854623,
"reward_std": 0.3827823489904404,
"rewards/MultiModalAccuracyORM": 0.21666666939854623,
"step": 1220,
"train_speed(iter/s)": 0.041103
},
{
"clip_ratio": 0.0,
"completion_length": 7.4,
"epoch": 0.494949494949495,
"grad_norm": 5.569579124450684,
"kl": 0.12704048156738282,
"learning_rate": 2e-07,
"loss": -0.030297344923019408,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667386889455,
"reward_std": 0.3534030467271805,
"rewards/MultiModalAccuracyORM": 0.41666667386889455,
"step": 1225,
"train_speed(iter/s)": 0.041105
},
{
"clip_ratio": 0.0,
"completion_length": 6.15,
"epoch": 0.49696969696969695,
"grad_norm": 13.687773704528809,
"kl": 0.054621124267578126,
"learning_rate": 2e-07,
"loss": 0.020814248919487,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166667312383654,
"reward_std": 0.2981545031070709,
"rewards/MultiModalAccuracyORM": 0.29166667312383654,
"step": 1230,
"train_speed(iter/s)": 0.041117
},
{
"clip_ratio": 0.0,
"completion_length": 15.45,
"epoch": 0.498989898989899,
"grad_norm": 4.014401912689209,
"kl": 0.11805038452148438,
"learning_rate": 2e-07,
"loss": -0.014261078834533692,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.31666666865348814,
"reward_std": 0.24615318179130555,
"rewards/MultiModalAccuracyORM": 0.31666666865348814,
"step": 1235,
"train_speed(iter/s)": 0.041139
},
{
"clip_ratio": 0.0,
"completion_length": 57.15,
"epoch": 0.501010101010101,
"grad_norm": 7.063708782196045,
"kl": 0.04602813720703125,
"learning_rate": 2e-07,
"loss": -0.0014480194076895714,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3500000052154064,
"reward_std": 0.2486636757850647,
"rewards/MultiModalAccuracyORM": 0.3500000052154064,
"step": 1240,
"train_speed(iter/s)": 0.041135
},
{
"clip_ratio": 0.0,
"completion_length": 15.65,
"epoch": 0.503030303030303,
"grad_norm": 0.07285178452730179,
"kl": 0.06838836669921874,
"learning_rate": 2e-07,
"loss": 0.007464568316936493,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.26703072190284727,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 1245,
"train_speed(iter/s)": 0.04113
},
{
"epoch": 0.5050505050505051,
"grad_norm": 16.691085815429688,
"learning_rate": 2e-07,
"loss": 0.027106884121894836,
"memory(GiB)": 104.49,
"step": 1250,
"train_speed(iter/s)": 0.041132
},
{
"epoch": 0.5050505050505051,
"eval_clip_ratio": 0.0,
"eval_completion_length": 24.193333625793457,
"eval_kl": 0.0990032958984375,
"eval_loss": 0.013061273843050003,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.3783333380520344,
"eval_reward_std": 0.21932941377162934,
"eval_rewards/MultiModalAccuracyORM": 0.3783333380520344,
"eval_runtime": 254.2733,
"eval_samples_per_second": 0.197,
"eval_steps_per_second": 0.02,
"step": 1250
},
{
"clip_ratio": 0.0,
"completion_length": 13.45,
"epoch": 0.5070707070707071,
"grad_norm": 1.7288111448287964,
"kl": 0.14322261810302733,
"learning_rate": 2e-07,
"loss": -0.0040175896137952805,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32916667200624944,
"reward_std": 0.21599168032407762,
"rewards/MultiModalAccuracyORM": 0.32916667200624944,
"step": 1255,
"train_speed(iter/s)": 0.040698
},
{
"clip_ratio": 0.0,
"completion_length": 7.8,
"epoch": 0.509090909090909,
"grad_norm": 30.862096786499023,
"kl": 0.065618896484375,
"learning_rate": 2e-07,
"loss": 0.03462098240852356,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666753590107,
"reward_std": 0.3471368789672852,
"rewards/MultiModalAccuracyORM": 0.3166666753590107,
"step": 1260,
"train_speed(iter/s)": 0.040722
},
{
"clip_ratio": 0.0,
"completion_length": 56.65,
"epoch": 0.5111111111111111,
"grad_norm": 18.206647872924805,
"kl": 0.050946044921875,
"learning_rate": 2e-07,
"loss": -0.018359455466270446,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500001192092897,
"reward_std": 0.285042542219162,
"rewards/MultiModalAccuracyORM": 0.37500001192092897,
"step": 1265,
"train_speed(iter/s)": 0.040714
},
{
"clip_ratio": 0.0,
"completion_length": 32.8,
"epoch": 0.5131313131313131,
"grad_norm": 21.11511993408203,
"kl": 0.08178558349609374,
"learning_rate": 2e-07,
"loss": 0.019801269471645355,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5000000059604645,
"reward_std": 0.24666163325309753,
"rewards/MultiModalAccuracyORM": 0.5000000059604645,
"step": 1270,
"train_speed(iter/s)": 0.040716
},
{
"clip_ratio": 0.0,
"completion_length": 33.7,
"epoch": 0.5151515151515151,
"grad_norm": 2.3435275554656982,
"kl": 0.037060546875,
"learning_rate": 2e-07,
"loss": -0.044399937987327574,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000149011613,
"reward_std": 0.18780821561813354,
"rewards/MultiModalAccuracyORM": 0.30000000149011613,
"step": 1275,
"train_speed(iter/s)": 0.040729
},
{
"clip_ratio": 0.0,
"completion_length": 32.3,
"epoch": 0.5171717171717172,
"grad_norm": 6.154475688934326,
"kl": 0.06382598876953124,
"learning_rate": 2e-07,
"loss": 0.024791686236858367,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.38333334103226663,
"reward_std": 0.3026406019926071,
"rewards/MultiModalAccuracyORM": 0.38333334103226663,
"step": 1280,
"train_speed(iter/s)": 0.040736
},
{
"clip_ratio": 0.0,
"completion_length": 72.85,
"epoch": 0.5191919191919192,
"grad_norm": 0.17857688665390015,
"kl": 0.05196533203125,
"learning_rate": 2e-07,
"loss": -0.01656932532787323,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1916666716337204,
"reward_std": 0.32905964851379393,
"rewards/MultiModalAccuracyORM": 0.1916666716337204,
"step": 1285,
"train_speed(iter/s)": 0.040739
},
{
"clip_ratio": 0.0,
"completion_length": 18.1,
"epoch": 0.5212121212121212,
"grad_norm": 5.7444353103637695,
"kl": 0.032296371459960935,
"learning_rate": 2e-07,
"loss": -0.04405757784843445,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.19166667237877846,
"reward_std": 0.3292782843112946,
"rewards/MultiModalAccuracyORM": 0.19166667237877846,
"step": 1290,
"train_speed(iter/s)": 0.04075
},
{
"clip_ratio": 0.0,
"completion_length": 9.45,
"epoch": 0.5232323232323233,
"grad_norm": 1.938860297203064,
"kl": 0.04727783203125,
"learning_rate": 2e-07,
"loss": 0.001994212530553341,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4666666701436043,
"reward_std": 0.2953156381845474,
"rewards/MultiModalAccuracyORM": 0.4666666701436043,
"step": 1295,
"train_speed(iter/s)": 0.040768
},
{
"clip_ratio": 0.0,
"completion_length": 8.9,
"epoch": 0.5252525252525253,
"grad_norm": 23.327890396118164,
"kl": 0.118865966796875,
"learning_rate": 2e-07,
"loss": 0.020175328850746153,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333492279053,
"reward_std": 0.3596546709537506,
"rewards/MultiModalAccuracyORM": 0.3833333492279053,
"step": 1300,
"train_speed(iter/s)": 0.04078
},
{
"clip_ratio": 0.0,
"completion_length": 19.4,
"epoch": 0.5272727272727272,
"grad_norm": 1.2604830265045166,
"kl": 0.082135009765625,
"learning_rate": 2e-07,
"loss": -0.006745982170104981,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000089406967,
"reward_std": 0.23631438612937927,
"rewards/MultiModalAccuracyORM": 0.3250000089406967,
"step": 1305,
"train_speed(iter/s)": 0.040788
},
{
"clip_ratio": 0.0,
"completion_length": 15.1,
"epoch": 0.5292929292929293,
"grad_norm": 19.63453483581543,
"kl": 0.093505859375,
"learning_rate": 2e-07,
"loss": -0.01361556351184845,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333334177732465,
"reward_std": 0.24337058067321776,
"rewards/MultiModalAccuracyORM": 0.33333334177732465,
"step": 1310,
"train_speed(iter/s)": 0.0408
},
{
"clip_ratio": 0.0,
"completion_length": 64.75,
"epoch": 0.5313131313131313,
"grad_norm": 5.953737735748291,
"kl": 0.115643310546875,
"learning_rate": 2e-07,
"loss": 0.004205666109919548,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30833334028720855,
"reward_std": 0.21123813688755036,
"rewards/MultiModalAccuracyORM": 0.30833334028720855,
"step": 1315,
"train_speed(iter/s)": 0.040801
},
{
"clip_ratio": 0.0,
"completion_length": 6.55,
"epoch": 0.5333333333333333,
"grad_norm": 24.937227249145508,
"kl": 0.1268402099609375,
"learning_rate": 2e-07,
"loss": 0.0015925129875540734,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.26666667237877845,
"reward_std": 0.31119862794876096,
"rewards/MultiModalAccuracyORM": 0.26666667237877845,
"step": 1320,
"train_speed(iter/s)": 0.040816
},
{
"clip_ratio": 0.0,
"completion_length": 11.35,
"epoch": 0.5353535353535354,
"grad_norm": 0.8153337240219116,
"kl": 0.150848388671875,
"learning_rate": 2e-07,
"loss": -0.021095672249794008,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667088866234,
"reward_std": 0.27402731478214265,
"rewards/MultiModalAccuracyORM": 0.21666667088866234,
"step": 1325,
"train_speed(iter/s)": 0.040834
},
{
"clip_ratio": 0.0,
"completion_length": 10.35,
"epoch": 0.5373737373737374,
"grad_norm": 18.53838539123535,
"kl": 0.046075439453125,
"learning_rate": 2e-07,
"loss": 0.017172405123710634,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4500000089406967,
"reward_std": 0.2159452974796295,
"rewards/MultiModalAccuracyORM": 0.4500000089406967,
"step": 1330,
"train_speed(iter/s)": 0.040851
},
{
"clip_ratio": 0.0,
"completion_length": 33.05,
"epoch": 0.5393939393939394,
"grad_norm": 7.678282737731934,
"kl": 0.0884857177734375,
"learning_rate": 2e-07,
"loss": 0.0011547883972525597,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5750000141561031,
"reward_std": 0.3044206708669662,
"rewards/MultiModalAccuracyORM": 0.5750000141561031,
"step": 1335,
"train_speed(iter/s)": 0.04087
},
{
"clip_ratio": 0.0,
"completion_length": 8.7,
"epoch": 0.5414141414141415,
"grad_norm": 10.90495777130127,
"kl": 0.0806304931640625,
"learning_rate": 2e-07,
"loss": -0.017473408579826356,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000022351742,
"reward_std": 0.15518502295017242,
"rewards/MultiModalAccuracyORM": 0.2750000022351742,
"step": 1340,
"train_speed(iter/s)": 0.040877
},
{
"clip_ratio": 0.0,
"completion_length": 10.55,
"epoch": 0.5434343434343434,
"grad_norm": 0.10261930525302887,
"kl": 0.060321044921875,
"learning_rate": 2e-07,
"loss": 0.0017479043453931808,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333334028720857,
"reward_std": 0.152222341299057,
"rewards/MultiModalAccuracyORM": 0.33333334028720857,
"step": 1345,
"train_speed(iter/s)": 0.040892
},
{
"clip_ratio": 0.0,
"completion_length": 10.75,
"epoch": 0.5454545454545454,
"grad_norm": 2.2841360569000244,
"kl": 0.024788665771484374,
"learning_rate": 2e-07,
"loss": -0.02739916443824768,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333656191827,
"reward_std": 0.2652174890041351,
"rewards/MultiModalAccuracyORM": 0.20833333656191827,
"step": 1350,
"train_speed(iter/s)": 0.040901
},
{
"clip_ratio": 0.0,
"completion_length": 11.85,
"epoch": 0.5474747474747474,
"grad_norm": 13.731690406799316,
"kl": 0.0828125,
"learning_rate": 2e-07,
"loss": -0.0664910078048706,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333417773247,
"reward_std": 0.3362748771905899,
"rewards/MultiModalAccuracyORM": 0.4333333417773247,
"step": 1355,
"train_speed(iter/s)": 0.04092
},
{
"clip_ratio": 0.0,
"completion_length": 11.95,
"epoch": 0.5494949494949495,
"grad_norm": 25.35189437866211,
"kl": 0.100750732421875,
"learning_rate": 2e-07,
"loss": -0.00892333835363388,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333432674408,
"reward_std": 0.2915389180183411,
"rewards/MultiModalAccuracyORM": 0.3083333432674408,
"step": 1360,
"train_speed(iter/s)": 0.04093
},
{
"clip_ratio": 0.0,
"completion_length": 18.1,
"epoch": 0.5515151515151515,
"grad_norm": 9.685708999633789,
"kl": 0.061480712890625,
"learning_rate": 2e-07,
"loss": 0.012898986041545869,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.21447905600070954,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 1365,
"train_speed(iter/s)": 0.040933
},
{
"clip_ratio": 0.0,
"completion_length": 8.45,
"epoch": 0.5535353535353535,
"grad_norm": 0.28964653611183167,
"kl": 0.1938751220703125,
"learning_rate": 2e-07,
"loss": 0.01745934933423996,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666701436043,
"reward_std": 0.2489179015159607,
"rewards/MultiModalAccuracyORM": 0.3416666701436043,
"step": 1370,
"train_speed(iter/s)": 0.040944
},
{
"clip_ratio": 0.0,
"completion_length": 17.25,
"epoch": 0.5555555555555556,
"grad_norm": 8.731843948364258,
"kl": 0.06651153564453124,
"learning_rate": 2e-07,
"loss": 0.03409457206726074,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333656191824,
"reward_std": 0.25566026866436004,
"rewards/MultiModalAccuracyORM": 0.35833333656191824,
"step": 1375,
"train_speed(iter/s)": 0.040953
},
{
"clip_ratio": 0.0,
"completion_length": 10.2,
"epoch": 0.5575757575757576,
"grad_norm": 35.31602096557617,
"kl": 0.100604248046875,
"learning_rate": 2e-07,
"loss": -0.010587018728256226,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2500000111758709,
"reward_std": 0.25487024188041685,
"rewards/MultiModalAccuracyORM": 0.2500000111758709,
"step": 1380,
"train_speed(iter/s)": 0.040972
},
{
"clip_ratio": 0.0,
"completion_length": 12.55,
"epoch": 0.5595959595959596,
"grad_norm": 1.9312275648117065,
"kl": 0.09021759033203125,
"learning_rate": 2e-07,
"loss": -0.012255148589611053,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667610406875,
"reward_std": 0.255184069275856,
"rewards/MultiModalAccuracyORM": 0.24166667610406875,
"step": 1385,
"train_speed(iter/s)": 0.040973
},
{
"clip_ratio": 0.0,
"completion_length": 15.25,
"epoch": 0.5616161616161616,
"grad_norm": 30.091777801513672,
"kl": 0.08451480865478515,
"learning_rate": 2e-07,
"loss": -0.004190707206726074,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2500000074505806,
"reward_std": 0.2875886201858521,
"rewards/MultiModalAccuracyORM": 0.2500000074505806,
"step": 1390,
"train_speed(iter/s)": 0.040981
},
{
"clip_ratio": 0.0,
"completion_length": 5.25,
"epoch": 0.5636363636363636,
"grad_norm": 5.909719467163086,
"kl": 0.16330108642578126,
"learning_rate": 2e-07,
"loss": -0.01449722945690155,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35000000670552256,
"reward_std": 0.15821027159690856,
"rewards/MultiModalAccuracyORM": 0.35000000670552256,
"step": 1395,
"train_speed(iter/s)": 0.040992
},
{
"clip_ratio": 0.0,
"completion_length": 11.9,
"epoch": 0.5656565656565656,
"grad_norm": 4.40855598449707,
"kl": 0.0266082763671875,
"learning_rate": 2e-07,
"loss": 0.026001608371734618,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667088866234,
"reward_std": 0.3048968702554703,
"rewards/MultiModalAccuracyORM": 0.24166667088866234,
"step": 1400,
"train_speed(iter/s)": 0.041006
},
{
"clip_ratio": 0.0,
"completion_length": 8.85,
"epoch": 0.5676767676767677,
"grad_norm": 0.061144277453422546,
"kl": 0.07353515625,
"learning_rate": 2e-07,
"loss": -0.010889561474323272,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667312383652,
"reward_std": 0.14815284609794616,
"rewards/MultiModalAccuracyORM": 0.24166667312383652,
"step": 1405,
"train_speed(iter/s)": 0.041018
},
{
"clip_ratio": 0.0,
"completion_length": 12.6,
"epoch": 0.5696969696969697,
"grad_norm": 0.037721507251262665,
"kl": 0.087078857421875,
"learning_rate": 2e-07,
"loss": 0.004135938733816147,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45833333358168604,
"reward_std": 0.08109080791473389,
"rewards/MultiModalAccuracyORM": 0.45833333358168604,
"step": 1410,
"train_speed(iter/s)": 0.041023
},
{
"clip_ratio": 0.0,
"completion_length": 14.2,
"epoch": 0.5717171717171717,
"grad_norm": 4.825331211090088,
"kl": 0.18311767578125,
"learning_rate": 2e-07,
"loss": 0.02725890576839447,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.24166667684912682,
"reward_std": 0.2338038921356201,
"rewards/MultiModalAccuracyORM": 0.24166667684912682,
"step": 1415,
"train_speed(iter/s)": 0.04103
},
{
"clip_ratio": 0.0,
"completion_length": 32.65,
"epoch": 0.5737373737373738,
"grad_norm": 1.8680031299591064,
"kl": 0.0274566650390625,
"learning_rate": 2e-07,
"loss": 0.0017455607652664185,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000074505806,
"reward_std": 0.12001575231552124,
"rewards/MultiModalAccuracyORM": 0.17500000074505806,
"step": 1420,
"train_speed(iter/s)": 0.041024
},
{
"clip_ratio": 0.0,
"completion_length": 10.9,
"epoch": 0.5757575757575758,
"grad_norm": 3.193700075149536,
"kl": 0.305999755859375,
"learning_rate": 2e-07,
"loss": 0.046308600902557374,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000074505806,
"reward_std": 0.29177860021591184,
"rewards/MultiModalAccuracyORM": 0.4000000074505806,
"step": 1425,
"train_speed(iter/s)": 0.041037
},
{
"clip_ratio": 0.0,
"completion_length": 12.95,
"epoch": 0.5777777777777777,
"grad_norm": 2.843719244003296,
"kl": 0.0330230712890625,
"learning_rate": 2e-07,
"loss": -0.04594253897666931,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.3008869707584381,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 1430,
"train_speed(iter/s)": 0.041054
},
{
"clip_ratio": 0.0,
"completion_length": 20.0,
"epoch": 0.5797979797979798,
"grad_norm": 23.12917137145996,
"kl": 0.0993408203125,
"learning_rate": 2e-07,
"loss": 0.021137547492980958,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2166666716337204,
"reward_std": 0.2790306001901627,
"rewards/MultiModalAccuracyORM": 0.2166666716337204,
"step": 1435,
"train_speed(iter/s)": 0.041061
},
{
"clip_ratio": 0.0,
"completion_length": 7.25,
"epoch": 0.5818181818181818,
"grad_norm": 17.79547882080078,
"kl": 0.1023834228515625,
"learning_rate": 2e-07,
"loss": 0.00415017232298851,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.20369119942188263,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 1440,
"train_speed(iter/s)": 0.041072
},
{
"clip_ratio": 0.0,
"completion_length": 7.2,
"epoch": 0.5838383838383838,
"grad_norm": 15.119973182678223,
"kl": 0.11974754333496093,
"learning_rate": 2e-07,
"loss": -0.008057641983032226,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000089406967,
"reward_std": 0.2770525634288788,
"rewards/MultiModalAccuracyORM": 0.3250000089406967,
"step": 1445,
"train_speed(iter/s)": 0.041081
},
{
"clip_ratio": 0.0,
"completion_length": 6.6,
"epoch": 0.5858585858585859,
"grad_norm": 0.13666389882564545,
"kl": 0.0672607421875,
"learning_rate": 2e-07,
"loss": -0.0010352015495300293,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333373069763,
"reward_std": 0.14996607303619386,
"rewards/MultiModalAccuracyORM": 0.2083333373069763,
"step": 1450,
"train_speed(iter/s)": 0.041098
},
{
"clip_ratio": 0.0,
"completion_length": 10.3,
"epoch": 0.5878787878787879,
"grad_norm": 11.365659713745117,
"kl": 0.0847259521484375,
"learning_rate": 2e-07,
"loss": 0.014445498585700989,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1500000037252903,
"reward_std": 0.23955530524253846,
"rewards/MultiModalAccuracyORM": 0.1500000037252903,
"step": 1455,
"train_speed(iter/s)": 0.041108
},
{
"clip_ratio": 0.0,
"completion_length": 5.5,
"epoch": 0.5898989898989899,
"grad_norm": 25.425418853759766,
"kl": 0.070880126953125,
"learning_rate": 2e-07,
"loss": 0.00023016731720417737,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666753590107,
"reward_std": 0.3265491545200348,
"rewards/MultiModalAccuracyORM": 0.3916666753590107,
"step": 1460,
"train_speed(iter/s)": 0.041127
},
{
"clip_ratio": 0.0,
"completion_length": 12.35,
"epoch": 0.591919191919192,
"grad_norm": 11.779102325439453,
"kl": 0.07333221435546874,
"learning_rate": 2e-07,
"loss": 0.0254564106464386,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.16626566052436828,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 1465,
"train_speed(iter/s)": 0.041143
},
{
"clip_ratio": 0.0,
"completion_length": 6.55,
"epoch": 0.593939393939394,
"grad_norm": 1.78038489818573,
"kl": 0.1328155517578125,
"learning_rate": 2e-07,
"loss": 0.008091837167739868,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500000298023225,
"reward_std": 0.1293427586555481,
"rewards/MultiModalAccuracyORM": 0.27500000298023225,
"step": 1470,
"train_speed(iter/s)": 0.041154
},
{
"clip_ratio": 0.0,
"completion_length": 35.25,
"epoch": 0.5959595959595959,
"grad_norm": 2.518378734588623,
"kl": 0.1015869140625,
"learning_rate": 2e-07,
"loss": -0.03122214078903198,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333358168602,
"reward_std": 0.3322981417179108,
"rewards/MultiModalAccuracyORM": 0.4333333358168602,
"step": 1475,
"train_speed(iter/s)": 0.041146
},
{
"clip_ratio": 0.0,
"completion_length": 5.5,
"epoch": 0.597979797979798,
"grad_norm": 20.898664474487305,
"kl": 0.1433135986328125,
"learning_rate": 2e-07,
"loss": -0.02608821392059326,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2250000059604645,
"reward_std": 0.287842845916748,
"rewards/MultiModalAccuracyORM": 0.2250000059604645,
"step": 1480,
"train_speed(iter/s)": 0.041162
},
{
"clip_ratio": 0.0,
"completion_length": 10.8,
"epoch": 0.6,
"grad_norm": 0.11180847883224487,
"kl": 0.13046875,
"learning_rate": 2e-07,
"loss": 0.003093409538269043,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666666716337205,
"reward_std": 0.12937834858894348,
"rewards/MultiModalAccuracyORM": 0.21666666716337205,
"step": 1485,
"train_speed(iter/s)": 0.041171
},
{
"clip_ratio": 0.0,
"completion_length": 7.85,
"epoch": 0.602020202020202,
"grad_norm": 12.01523494720459,
"kl": 0.187371826171875,
"learning_rate": 2e-07,
"loss": -0.008616887032985687,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166667237877847,
"reward_std": 0.21550226211547852,
"rewards/MultiModalAccuracyORM": 0.29166667237877847,
"step": 1490,
"train_speed(iter/s)": 0.041188
},
{
"clip_ratio": 0.0,
"completion_length": 6.75,
"epoch": 0.604040404040404,
"grad_norm": 14.021830558776855,
"kl": 0.16456298828125,
"learning_rate": 2e-07,
"loss": 0.010373742878437042,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5166666701436042,
"reward_std": 0.21447905600070954,
"rewards/MultiModalAccuracyORM": 0.5166666701436042,
"step": 1495,
"train_speed(iter/s)": 0.041207
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.3669841289520264,
"learning_rate": 2e-07,
"loss": -0.011987817287445069,
"memory(GiB)": 104.49,
"step": 1500,
"train_speed(iter/s)": 0.041216
},
{
"epoch": 0.6060606060606061,
"eval_clip_ratio": 0.0,
"eval_completion_length": 23.09000030040741,
"eval_kl": 0.12807769775390626,
"eval_loss": 0.0023684909101575613,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.42833334028720854,
"eval_reward_std": 0.21841024577617646,
"eval_rewards/MultiModalAccuracyORM": 0.42833334028720854,
"eval_runtime": 243.0786,
"eval_samples_per_second": 0.206,
"eval_steps_per_second": 0.021,
"step": 1500
},
{
"clip_ratio": 0.0,
"completion_length": 13.275,
"epoch": 0.6080808080808081,
"grad_norm": 10.859317779541016,
"kl": 0.1035552978515625,
"learning_rate": 2e-07,
"loss": -0.011110000312328339,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35000000447034835,
"reward_std": 0.2066851645708084,
"rewards/MultiModalAccuracyORM": 0.35000000447034835,
"step": 1505,
"train_speed(iter/s)": 0.040874
},
{
"clip_ratio": 0.0,
"completion_length": 61.45,
"epoch": 0.6101010101010101,
"grad_norm": 0.055811017751693726,
"kl": 0.03581314086914063,
"learning_rate": 2e-07,
"loss": 0.04541417956352234,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666731238365,
"reward_std": 0.33700530230998993,
"rewards/MultiModalAccuracyORM": 0.3916666731238365,
"step": 1510,
"train_speed(iter/s)": 0.040868
},
{
"clip_ratio": 0.0,
"completion_length": 6.1,
"epoch": 0.6121212121212121,
"grad_norm": 2.9291131496429443,
"kl": 0.076611328125,
"learning_rate": 2e-07,
"loss": 0.0033688426017761232,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.40000000298023225,
"reward_std": 0.22297748029232026,
"rewards/MultiModalAccuracyORM": 0.40000000298023225,
"step": 1515,
"train_speed(iter/s)": 0.040893
},
{
"clip_ratio": 0.0,
"completion_length": 39.8,
"epoch": 0.6141414141414141,
"grad_norm": 10.698760032653809,
"kl": 0.024103546142578126,
"learning_rate": 2e-07,
"loss": 0.033906325697898865,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1416666716337204,
"reward_std": 0.24487241208553315,
"rewards/MultiModalAccuracyORM": 0.1416666716337204,
"step": 1520,
"train_speed(iter/s)": 0.040902
},
{
"clip_ratio": 0.0,
"completion_length": 9.95,
"epoch": 0.6161616161616161,
"grad_norm": 5.847660541534424,
"kl": 0.141815185546875,
"learning_rate": 2e-07,
"loss": -0.014752772450447083,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.40000000447034834,
"reward_std": 0.27756677865982055,
"rewards/MultiModalAccuracyORM": 0.40000000447034834,
"step": 1525,
"train_speed(iter/s)": 0.04091
},
{
"clip_ratio": 0.0,
"completion_length": 6.85,
"epoch": 0.6181818181818182,
"grad_norm": 2.933770179748535,
"kl": 0.1540740966796875,
"learning_rate": 2e-07,
"loss": 0.021346482634544372,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500001043081284,
"reward_std": 0.3767612546682358,
"rewards/MultiModalAccuracyORM": 0.32500001043081284,
"step": 1530,
"train_speed(iter/s)": 0.040919
},
{
"clip_ratio": 0.0,
"completion_length": 20.35,
"epoch": 0.6202020202020202,
"grad_norm": 6.487882614135742,
"kl": 0.08126373291015625,
"learning_rate": 2e-07,
"loss": -0.02819029986858368,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000037252903,
"reward_std": 0.2875886201858521,
"rewards/MultiModalAccuracyORM": 0.4000000037252903,
"step": 1535,
"train_speed(iter/s)": 0.040926
},
{
"clip_ratio": 0.0,
"completion_length": 13.7,
"epoch": 0.6222222222222222,
"grad_norm": 0.1822008639574051,
"kl": 0.244976806640625,
"learning_rate": 2e-07,
"loss": 0.02670127749443054,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.43333333879709246,
"reward_std": 0.14589657187461852,
"rewards/MultiModalAccuracyORM": 0.43333333879709246,
"step": 1540,
"train_speed(iter/s)": 0.040936
},
{
"clip_ratio": 0.0,
"completion_length": 54.35,
"epoch": 0.6242424242424243,
"grad_norm": 5.22224235534668,
"kl": 0.087286376953125,
"learning_rate": 2e-07,
"loss": 0.011146068572998047,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.39166667237877845,
"reward_std": 0.39465543925762175,
"rewards/MultiModalAccuracyORM": 0.39166667237877845,
"step": 1545,
"train_speed(iter/s)": 0.040941
},
{
"clip_ratio": 0.0,
"completion_length": 46.65,
"epoch": 0.6262626262626263,
"grad_norm": 12.465606689453125,
"kl": 0.11739501953125,
"learning_rate": 2e-07,
"loss": 0.01348254531621933,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000447034837,
"reward_std": 0.18332211077213287,
"rewards/MultiModalAccuracyORM": 0.37500000447034837,
"step": 1550,
"train_speed(iter/s)": 0.040941
},
{
"clip_ratio": 0.0,
"completion_length": 12.05,
"epoch": 0.6282828282828283,
"grad_norm": 0.03528100252151489,
"kl": 0.059906005859375,
"learning_rate": 2e-07,
"loss": 0.002536106109619141,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000223517417,
"reward_std": 0.12558708488941192,
"rewards/MultiModalAccuracyORM": 0.32500000223517417,
"step": 1555,
"train_speed(iter/s)": 0.040957
},
{
"clip_ratio": 0.0,
"completion_length": 7.5,
"epoch": 0.6303030303030303,
"grad_norm": 15.021883010864258,
"kl": 0.11079330444335937,
"learning_rate": 2e-07,
"loss": 0.0029231052845716476,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333380520344,
"reward_std": 0.20973873138427734,
"rewards/MultiModalAccuracyORM": 0.4333333380520344,
"step": 1560,
"train_speed(iter/s)": 0.040974
},
{
"clip_ratio": 0.0,
"completion_length": 15.1,
"epoch": 0.6323232323232323,
"grad_norm": 2.5578255653381348,
"kl": 0.04172821044921875,
"learning_rate": 2e-07,
"loss": 0.004573901742696762,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.12500000447034837,
"reward_std": 0.18087121844291687,
"rewards/MultiModalAccuracyORM": 0.12500000447034837,
"step": 1565,
"train_speed(iter/s)": 0.040988
},
{
"clip_ratio": 0.0,
"completion_length": 6.8,
"epoch": 0.6343434343434343,
"grad_norm": 22.243240356445312,
"kl": 0.158673095703125,
"learning_rate": 2e-07,
"loss": -0.008480211347341537,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666667759418485,
"reward_std": 0.28480601906776426,
"rewards/MultiModalAccuracyORM": 0.41666667759418485,
"step": 1570,
"train_speed(iter/s)": 0.040998
},
{
"clip_ratio": 0.0,
"completion_length": 22.0,
"epoch": 0.6363636363636364,
"grad_norm": 25.038570404052734,
"kl": 0.1517974853515625,
"learning_rate": 2e-07,
"loss": 0.04977948367595673,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.22625695466995238,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 1575,
"train_speed(iter/s)": 0.041005
},
{
"clip_ratio": 0.0,
"completion_length": 9.15,
"epoch": 0.6383838383838384,
"grad_norm": 0.11025875806808472,
"kl": 0.0567169189453125,
"learning_rate": 2e-07,
"loss": 0.004630526155233383,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666679084301,
"reward_std": 0.13032740950584412,
"rewards/MultiModalAccuracyORM": 0.3916666679084301,
"step": 1580,
"train_speed(iter/s)": 0.041022
},
{
"clip_ratio": 0.0,
"completion_length": 29.9,
"epoch": 0.6404040404040404,
"grad_norm": 8.77802562713623,
"kl": 0.06422119140625,
"learning_rate": 2e-07,
"loss": -0.002487625740468502,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666708886623,
"reward_std": 0.2822715133428574,
"rewards/MultiModalAccuracyORM": 0.2916666708886623,
"step": 1585,
"train_speed(iter/s)": 0.041027
},
{
"clip_ratio": 0.0,
"completion_length": 10.1,
"epoch": 0.6424242424242425,
"grad_norm": 0.061026524752378464,
"kl": 0.181072998046875,
"learning_rate": 2e-07,
"loss": 0.012957209348678589,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333333805203437,
"reward_std": 0.11928532719612121,
"rewards/MultiModalAccuracyORM": 0.28333333805203437,
"step": 1590,
"train_speed(iter/s)": 0.041041
},
{
"clip_ratio": 0.0,
"completion_length": 7.6,
"epoch": 0.6444444444444445,
"grad_norm": 5.596570014953613,
"kl": 0.17645263671875,
"learning_rate": 2e-07,
"loss": -0.0008578440174460411,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666753590107,
"reward_std": 0.3071531385183334,
"rewards/MultiModalAccuracyORM": 0.2666666753590107,
"step": 1595,
"train_speed(iter/s)": 0.041048
},
{
"clip_ratio": 0.0,
"completion_length": 71.7,
"epoch": 0.6464646464646465,
"grad_norm": 26.054533004760742,
"kl": 0.11879425048828125,
"learning_rate": 2e-07,
"loss": 0.007277928292751312,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000894069674,
"reward_std": 0.2732968896627426,
"rewards/MultiModalAccuracyORM": 0.37500000894069674,
"step": 1600,
"train_speed(iter/s)": 0.041045
},
{
"clip_ratio": 0.0,
"completion_length": 44.45,
"epoch": 0.6484848484848484,
"grad_norm": 0.11397194862365723,
"kl": 0.0313624382019043,
"learning_rate": 2e-07,
"loss": 0.0012240668758749962,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333334028720857,
"reward_std": 0.152222341299057,
"rewards/MultiModalAccuracyORM": 0.33333334028720857,
"step": 1605,
"train_speed(iter/s)": 0.041049
},
{
"clip_ratio": 0.0,
"completion_length": 54.7,
"epoch": 0.6505050505050505,
"grad_norm": 0.8132848739624023,
"kl": 0.099078369140625,
"learning_rate": 2e-07,
"loss": 0.008613920211791993,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.13333333432674407,
"reward_std": 0.20110656023025514,
"rewards/MultiModalAccuracyORM": 0.13333333432674407,
"step": 1610,
"train_speed(iter/s)": 0.041055
},
{
"clip_ratio": 0.0,
"completion_length": 106.2,
"epoch": 0.6525252525252525,
"grad_norm": 2.1414718627929688,
"kl": 0.05146484375,
"learning_rate": 2e-07,
"loss": 0.0011494815349578857,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333333507180213,
"reward_std": 0.25270916521549225,
"rewards/MultiModalAccuracyORM": 0.33333333507180213,
"step": 1615,
"train_speed(iter/s)": 0.041049
},
{
"clip_ratio": 0.0,
"completion_length": 7.7,
"epoch": 0.6545454545454545,
"grad_norm": 2.636408567428589,
"kl": 0.05029296875,
"learning_rate": 2e-07,
"loss": -0.02351543605327606,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5083333447575569,
"reward_std": 0.2792848199605942,
"rewards/MultiModalAccuracyORM": 0.5083333447575569,
"step": 1620,
"train_speed(iter/s)": 0.041065
},
{
"clip_ratio": 0.0,
"completion_length": 47.3,
"epoch": 0.6565656565656566,
"grad_norm": 3.0985336303710938,
"kl": 0.065228271484375,
"learning_rate": 2e-07,
"loss": -0.014748664200305938,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4833333395421505,
"reward_std": 0.16225576102733613,
"rewards/MultiModalAccuracyORM": 0.4833333395421505,
"step": 1625,
"train_speed(iter/s)": 0.041069
},
{
"clip_ratio": 0.0,
"completion_length": 17.65,
"epoch": 0.6585858585858586,
"grad_norm": 9.992680549621582,
"kl": 0.16975555419921876,
"learning_rate": 2e-07,
"loss": 0.008018460124731064,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.21368903517723084,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 1630,
"train_speed(iter/s)": 0.041077
},
{
"clip_ratio": 0.0,
"completion_length": 5.95,
"epoch": 0.6606060606060606,
"grad_norm": 47.361576080322266,
"kl": 0.125982666015625,
"learning_rate": 2e-07,
"loss": 0.015030686557292939,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4666666753590107,
"reward_std": 0.2340581238269806,
"rewards/MultiModalAccuracyORM": 0.4666666753590107,
"step": 1635,
"train_speed(iter/s)": 0.041091
},
{
"clip_ratio": 0.0,
"completion_length": 15.05,
"epoch": 0.6626262626262627,
"grad_norm": 6.931950569152832,
"kl": 0.16407470703125,
"learning_rate": 2e-07,
"loss": -0.012672655284404755,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4166666746139526,
"reward_std": 0.215248042345047,
"rewards/MultiModalAccuracyORM": 0.4166666746139526,
"step": 1640,
"train_speed(iter/s)": 0.041096
},
{
"clip_ratio": 0.0,
"completion_length": 13.3,
"epoch": 0.6646464646464646,
"grad_norm": 0.08681845664978027,
"kl": 0.1269195556640625,
"learning_rate": 2e-07,
"loss": -0.0032407425343990324,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5083333358168602,
"reward_std": 0.13338824808597566,
"rewards/MultiModalAccuracyORM": 0.5083333358168602,
"step": 1645,
"train_speed(iter/s)": 0.041111
},
{
"clip_ratio": 0.0,
"completion_length": 45.7,
"epoch": 0.6666666666666666,
"grad_norm": 3.8581395149230957,
"kl": 0.121484375,
"learning_rate": 2e-07,
"loss": 0.008351793140172958,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333333432674407,
"reward_std": 0.2581467509269714,
"rewards/MultiModalAccuracyORM": 0.28333333432674407,
"step": 1650,
"train_speed(iter/s)": 0.041103
},
{
"clip_ratio": 0.0,
"completion_length": 29.15,
"epoch": 0.6686868686868687,
"grad_norm": 17.391639709472656,
"kl": 0.13189697265625,
"learning_rate": 2e-07,
"loss": 0.056326770782470705,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4083333410322666,
"reward_std": 0.3480859398841858,
"rewards/MultiModalAccuracyORM": 0.4083333410322666,
"step": 1655,
"train_speed(iter/s)": 0.041102
},
{
"clip_ratio": 0.0,
"completion_length": 9.25,
"epoch": 0.6707070707070707,
"grad_norm": 7.648516654968262,
"kl": 0.2052001953125,
"learning_rate": 2e-07,
"loss": -0.00421803817152977,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3500000037252903,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.3500000037252903,
"step": 1660,
"train_speed(iter/s)": 0.041107
},
{
"clip_ratio": 0.0,
"completion_length": 35.35,
"epoch": 0.6727272727272727,
"grad_norm": 1.1766724586486816,
"kl": 0.0945709228515625,
"learning_rate": 2e-07,
"loss": 0.013910901546478272,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166667684912684,
"reward_std": 0.24487241804599763,
"rewards/MultiModalAccuracyORM": 0.29166667684912684,
"step": 1665,
"train_speed(iter/s)": 0.041117
},
{
"clip_ratio": 0.0,
"completion_length": 25.35,
"epoch": 0.6747474747474748,
"grad_norm": 4.918646335601807,
"kl": 0.023187255859375,
"learning_rate": 2e-07,
"loss": -0.009105654805898667,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333507180213,
"reward_std": 0.2074824631214142,
"rewards/MultiModalAccuracyORM": 0.20833333507180213,
"step": 1670,
"train_speed(iter/s)": 0.041129
},
{
"clip_ratio": 0.0,
"completion_length": 29.25,
"epoch": 0.6767676767676768,
"grad_norm": 10.536828994750977,
"kl": 0.0798187255859375,
"learning_rate": 2e-07,
"loss": 0.02544976770877838,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333380520344,
"reward_std": 0.16451202929019929,
"rewards/MultiModalAccuracyORM": 0.2583333380520344,
"step": 1675,
"train_speed(iter/s)": 0.041135
},
{
"clip_ratio": 0.0,
"completion_length": 52.3,
"epoch": 0.6787878787878788,
"grad_norm": 5.117887020111084,
"kl": 0.02090301513671875,
"learning_rate": 2e-07,
"loss": 0.04579094052314758,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000447034836,
"reward_std": 0.31495430171489713,
"rewards/MultiModalAccuracyORM": 0.30000000447034836,
"step": 1680,
"train_speed(iter/s)": 0.041133
},
{
"clip_ratio": 0.0,
"completion_length": 11.7,
"epoch": 0.6808080808080809,
"grad_norm": 8.01219367980957,
"kl": 0.1265289306640625,
"learning_rate": 2e-07,
"loss": 0.019950807094573975,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333380520344,
"reward_std": 0.16852192878723143,
"rewards/MultiModalAccuracyORM": 0.4333333380520344,
"step": 1685,
"train_speed(iter/s)": 0.041146
},
{
"clip_ratio": 0.0,
"completion_length": 16.9,
"epoch": 0.6828282828282828,
"grad_norm": 7.546853065490723,
"kl": 0.0402618408203125,
"learning_rate": 2e-07,
"loss": 0.030116382241249084,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.42500000447034836,
"reward_std": 0.22629254460334777,
"rewards/MultiModalAccuracyORM": 0.42500000447034836,
"step": 1690,
"train_speed(iter/s)": 0.041159
},
{
"clip_ratio": 0.0,
"completion_length": 47.2,
"epoch": 0.6848484848484848,
"grad_norm": 8.680946350097656,
"kl": 0.1186279296875,
"learning_rate": 2e-07,
"loss": -0.014576731622219086,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45833333507180213,
"reward_std": 0.2074824631214142,
"rewards/MultiModalAccuracyORM": 0.45833333507180213,
"step": 1695,
"train_speed(iter/s)": 0.041143
},
{
"clip_ratio": 0.0,
"completion_length": 10.65,
"epoch": 0.6868686868686869,
"grad_norm": 33.545352935791016,
"kl": 0.11261825561523438,
"learning_rate": 2e-07,
"loss": 0.004046386480331421,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3250000014901161,
"reward_std": 0.18561154305934907,
"rewards/MultiModalAccuracyORM": 0.3250000014901161,
"step": 1700,
"train_speed(iter/s)": 0.041159
},
{
"clip_ratio": 0.0,
"completion_length": 69.85,
"epoch": 0.6888888888888889,
"grad_norm": 13.335136413574219,
"kl": 0.11529541015625,
"learning_rate": 2e-07,
"loss": 0.0011761213652789592,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666701436043,
"reward_std": 0.18482151627540588,
"rewards/MultiModalAccuracyORM": 0.3166666701436043,
"step": 1705,
"train_speed(iter/s)": 0.041157
},
{
"clip_ratio": 0.0,
"completion_length": 21.45,
"epoch": 0.6909090909090909,
"grad_norm": 14.620392799377441,
"kl": 0.07541313171386718,
"learning_rate": 2e-07,
"loss": 0.01065676361322403,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1916666716337204,
"reward_std": 0.24961273670196532,
"rewards/MultiModalAccuracyORM": 0.1916666716337204,
"step": 1710,
"train_speed(iter/s)": 0.041164
},
{
"clip_ratio": 0.0,
"completion_length": 19.2,
"epoch": 0.692929292929293,
"grad_norm": 1.2891874313354492,
"kl": 0.13163909912109376,
"learning_rate": 2e-07,
"loss": 0.02046767473220825,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4916666731238365,
"reward_std": 0.21374863088130952,
"rewards/MultiModalAccuracyORM": 0.4916666731238365,
"step": 1715,
"train_speed(iter/s)": 0.041157
},
{
"clip_ratio": 0.0,
"completion_length": 11.4,
"epoch": 0.694949494949495,
"grad_norm": 3.101806879043579,
"kl": 0.22337646484375,
"learning_rate": 2e-07,
"loss": 0.008609502017498017,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500001341104505,
"reward_std": 0.3003751873970032,
"rewards/MultiModalAccuracyORM": 0.37500001341104505,
"step": 1720,
"train_speed(iter/s)": 0.041168
},
{
"clip_ratio": 0.0,
"completion_length": 16.2,
"epoch": 0.696969696969697,
"grad_norm": 17.069448471069336,
"kl": 0.10420684814453125,
"learning_rate": 2e-07,
"loss": -0.020038720965385438,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000298023224,
"reward_std": 0.14433756470680237,
"rewards/MultiModalAccuracyORM": 0.32500000298023224,
"step": 1725,
"train_speed(iter/s)": 0.041178
},
{
"clip_ratio": 0.0,
"completion_length": 24.0,
"epoch": 0.6989898989898989,
"grad_norm": 2.795525074005127,
"kl": 0.0689239501953125,
"learning_rate": 2e-07,
"loss": 0.022227957844734192,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000074505805,
"reward_std": 0.15824586153030396,
"rewards/MultiModalAccuracyORM": 0.22500000074505805,
"step": 1730,
"train_speed(iter/s)": 0.041179
},
{
"clip_ratio": 0.0,
"completion_length": 63.75,
"epoch": 0.701010101010101,
"grad_norm": 2.3581957817077637,
"kl": 0.04788818359375,
"learning_rate": 2e-07,
"loss": 0.033317530155181886,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35000000298023226,
"reward_std": 0.2837563753128052,
"rewards/MultiModalAccuracyORM": 0.35000000298023226,
"step": 1735,
"train_speed(iter/s)": 0.04118
},
{
"clip_ratio": 0.0,
"completion_length": 10.5,
"epoch": 0.703030303030303,
"grad_norm": 2.782379627227783,
"kl": 0.080255126953125,
"learning_rate": 2e-07,
"loss": -0.012095755338668824,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.18333333879709243,
"reward_std": 0.28154108822345736,
"rewards/MultiModalAccuracyORM": 0.18333333879709243,
"step": 1740,
"train_speed(iter/s)": 0.041192
},
{
"clip_ratio": 0.0,
"completion_length": 17.2,
"epoch": 0.705050505050505,
"grad_norm": 3.129946708679199,
"kl": 0.04556884765625,
"learning_rate": 2e-07,
"loss": 0.037814974784851074,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45000000223517417,
"reward_std": 0.21378422081470488,
"rewards/MultiModalAccuracyORM": 0.45000000223517417,
"step": 1745,
"train_speed(iter/s)": 0.041197
},
{
"epoch": 0.7070707070707071,
"grad_norm": 2.4902050495147705,
"learning_rate": 2e-07,
"loss": 0.0172103151679039,
"memory(GiB)": 104.49,
"step": 1750,
"train_speed(iter/s)": 0.041202
},
{
"epoch": 0.7070707070707071,
"eval_clip_ratio": 0.0,
"eval_completion_length": 34.29833379745483,
"eval_kl": 0.10184234619140625,
"eval_loss": 0.012326983734965324,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.4183333376049995,
"eval_reward_std": 0.1789151507616043,
"eval_rewards/MultiModalAccuracyORM": 0.4183333376049995,
"eval_runtime": 267.6806,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.019,
"step": 1750
},
{
"clip_ratio": 0.0,
"completion_length": 31.0,
"epoch": 0.7090909090909091,
"grad_norm": 14.173089981079102,
"kl": 0.10649490356445312,
"learning_rate": 2e-07,
"loss": 0.007458774745464325,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28750000670552256,
"reward_std": 0.23671061247587205,
"rewards/MultiModalAccuracyORM": 0.28750000670552256,
"step": 1755,
"train_speed(iter/s)": 0.040873
},
{
"clip_ratio": 0.0,
"completion_length": 19.6,
"epoch": 0.7111111111111111,
"grad_norm": 2.1408114433288574,
"kl": 0.066253662109375,
"learning_rate": 2e-07,
"loss": 0.027722400426864625,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.16666667237877847,
"reward_std": 0.2669951319694519,
"rewards/MultiModalAccuracyORM": 0.16666667237877847,
"step": 1760,
"train_speed(iter/s)": 0.040871
},
{
"clip_ratio": 0.0,
"completion_length": 22.25,
"epoch": 0.7131313131313132,
"grad_norm": 24.069496154785156,
"kl": 0.0887176513671875,
"learning_rate": 2e-07,
"loss": 0.00502915009856224,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333395421505,
"reward_std": 0.18407654762268066,
"rewards/MultiModalAccuracyORM": 0.2583333395421505,
"step": 1765,
"train_speed(iter/s)": 0.040877
},
{
"clip_ratio": 0.0,
"completion_length": 26.2,
"epoch": 0.7151515151515152,
"grad_norm": 2.3050827980041504,
"kl": 0.2020782470703125,
"learning_rate": 2e-07,
"loss": 0.016819214820861815,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000022351742,
"reward_std": 0.174764084815979,
"rewards/MultiModalAccuracyORM": 0.2750000022351742,
"step": 1770,
"train_speed(iter/s)": 0.040888
},
{
"clip_ratio": 0.0,
"completion_length": 5.25,
"epoch": 0.7171717171717171,
"grad_norm": 8.913907051086426,
"kl": 0.137213134765625,
"learning_rate": 2e-07,
"loss": -0.006190218776464462,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5250000029802322,
"reward_std": 0.22078081369400024,
"rewards/MultiModalAccuracyORM": 0.5250000029802322,
"step": 1775,
"train_speed(iter/s)": 0.040903
},
{
"clip_ratio": 0.0,
"completion_length": 30.05,
"epoch": 0.7191919191919192,
"grad_norm": 2.8246963024139404,
"kl": 0.11649169921875,
"learning_rate": 2e-07,
"loss": -0.06523974537849427,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35833333507180215,
"reward_std": 0.22629254460334777,
"rewards/MultiModalAccuracyORM": 0.35833333507180215,
"step": 1780,
"train_speed(iter/s)": 0.040913
},
{
"clip_ratio": 0.0,
"completion_length": 25.3,
"epoch": 0.7212121212121212,
"grad_norm": 7.319549083709717,
"kl": 0.100701904296875,
"learning_rate": 2e-07,
"loss": 0.03789505362510681,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000149011612,
"reward_std": 0.2712534427642822,
"rewards/MultiModalAccuracyORM": 0.2750000149011612,
"step": 1785,
"train_speed(iter/s)": 0.040921
},
{
"clip_ratio": 0.0,
"completion_length": 47.75,
"epoch": 0.7232323232323232,
"grad_norm": 8.2145357131958,
"kl": 0.13018798828125,
"learning_rate": 2e-07,
"loss": -0.021410945057868957,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666738688946,
"reward_std": 0.2325587123632431,
"rewards/MultiModalAccuracyORM": 0.3916666738688946,
"step": 1790,
"train_speed(iter/s)": 0.040929
},
{
"clip_ratio": 0.0,
"completion_length": 8.7,
"epoch": 0.7252525252525253,
"grad_norm": 8.516419410705566,
"kl": 0.1542633056640625,
"learning_rate": 2e-07,
"loss": 0.02146460711956024,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000059604645,
"reward_std": 0.21823472976684571,
"rewards/MultiModalAccuracyORM": 0.3000000059604645,
"step": 1795,
"train_speed(iter/s)": 0.040941
},
{
"clip_ratio": 0.0,
"completion_length": 8.8,
"epoch": 0.7272727272727273,
"grad_norm": 10.487430572509766,
"kl": 0.2330535888671875,
"learning_rate": 2e-07,
"loss": 0.03371854722499847,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000521540644,
"reward_std": 0.16925235390663146,
"rewards/MultiModalAccuracyORM": 0.37500000521540644,
"step": 1800,
"train_speed(iter/s)": 0.040952
},
{
"clip_ratio": 0.0,
"completion_length": 10.9,
"epoch": 0.7292929292929293,
"grad_norm": 2.5021793842315674,
"kl": 0.053016281127929686,
"learning_rate": 2e-07,
"loss": -0.005027930065989494,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.358333333581686,
"reward_std": 0.1193209171295166,
"rewards/MultiModalAccuracyORM": 0.358333333581686,
"step": 1805,
"train_speed(iter/s)": 0.040965
},
{
"clip_ratio": 0.0,
"completion_length": 16.3,
"epoch": 0.7313131313131314,
"grad_norm": 9.409316062927246,
"kl": 0.077154541015625,
"learning_rate": 2e-07,
"loss": 0.00013190507888793945,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333343267441,
"reward_std": 0.14188667237758637,
"rewards/MultiModalAccuracyORM": 0.3083333343267441,
"step": 1810,
"train_speed(iter/s)": 0.040974
},
{
"clip_ratio": 0.0,
"completion_length": 13.2,
"epoch": 0.7333333333333333,
"grad_norm": 8.413249015808105,
"kl": 0.06329345703125,
"learning_rate": 2e-07,
"loss": 0.0067844375967979435,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333417773247,
"reward_std": 0.2878072619438171,
"rewards/MultiModalAccuracyORM": 0.3833333417773247,
"step": 1815,
"train_speed(iter/s)": 0.040988
},
{
"clip_ratio": 0.0,
"completion_length": 42.4,
"epoch": 0.7353535353535353,
"grad_norm": 3.3386476039886475,
"kl": 0.0814666748046875,
"learning_rate": 2e-07,
"loss": 0.020126067101955414,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.37500000596046446,
"reward_std": 0.18087121844291687,
"rewards/MultiModalAccuracyORM": 0.37500000596046446,
"step": 1820,
"train_speed(iter/s)": 0.041
},
{
"clip_ratio": 0.0,
"completion_length": 18.6,
"epoch": 0.7373737373737373,
"grad_norm": 11.123106956481934,
"kl": 0.13977203369140626,
"learning_rate": 2e-07,
"loss": 0.0059658966958522795,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.15821027159690856,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 1825,
"train_speed(iter/s)": 0.041002
},
{
"clip_ratio": 0.0,
"completion_length": 23.75,
"epoch": 0.7393939393939394,
"grad_norm": 4.5245361328125,
"kl": 0.098736572265625,
"learning_rate": 2e-07,
"loss": -0.024525515735149384,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666731238365,
"reward_std": 0.20995736718177796,
"rewards/MultiModalAccuracyORM": 0.2666666731238365,
"step": 1830,
"train_speed(iter/s)": 0.040989
},
{
"clip_ratio": 0.0,
"completion_length": 25.1,
"epoch": 0.7414141414141414,
"grad_norm": 0.7691475749015808,
"kl": 0.0991119384765625,
"learning_rate": 2e-07,
"loss": 0.039085444808006284,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4250000029802322,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.4250000029802322,
"step": 1835,
"train_speed(iter/s)": 0.040998
},
{
"clip_ratio": 0.0,
"completion_length": 19.0,
"epoch": 0.7434343434343434,
"grad_norm": 0.2410029023885727,
"kl": 0.17838897705078124,
"learning_rate": 2e-07,
"loss": 0.04514871537685394,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.6166666708886623,
"reward_std": 0.13258367776870728,
"rewards/MultiModalAccuracyORM": 0.6166666708886623,
"step": 1840,
"train_speed(iter/s)": 0.040995
},
{
"clip_ratio": 0.0,
"completion_length": 11.9,
"epoch": 0.7454545454545455,
"grad_norm": 12.146939277648926,
"kl": 0.097296142578125,
"learning_rate": 2e-07,
"loss": 0.02126455307006836,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333656191827,
"reward_std": 0.17657731771469115,
"rewards/MultiModalAccuracyORM": 0.20833333656191827,
"step": 1845,
"train_speed(iter/s)": 0.041
},
{
"clip_ratio": 0.0,
"completion_length": 10.0,
"epoch": 0.7474747474747475,
"grad_norm": 10.014187812805176,
"kl": 0.12047119140625,
"learning_rate": 2e-07,
"loss": 0.0045259218662977215,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.46666667237877846,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.46666667237877846,
"step": 1850,
"train_speed(iter/s)": 0.041019
},
{
"clip_ratio": 0.0,
"completion_length": 9.3,
"epoch": 0.7494949494949495,
"grad_norm": 0.34578633308410645,
"kl": 0.13382987976074218,
"learning_rate": 2e-07,
"loss": 0.003971926495432853,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1833333358168602,
"reward_std": 0.1356445223093033,
"rewards/MultiModalAccuracyORM": 0.1833333358168602,
"step": 1855,
"train_speed(iter/s)": 0.041027
},
{
"clip_ratio": 0.0,
"completion_length": 19.05,
"epoch": 0.7515151515151515,
"grad_norm": 17.808372497558594,
"kl": 0.025757217407226564,
"learning_rate": 2e-07,
"loss": 0.035965240001678465,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333334252238274,
"reward_std": 0.19713521599769593,
"rewards/MultiModalAccuracyORM": 0.28333334252238274,
"step": 1860,
"train_speed(iter/s)": 0.041022
},
{
"clip_ratio": 0.0,
"completion_length": 9.8,
"epoch": 0.7535353535353535,
"grad_norm": 24.15494155883789,
"kl": 0.0437255859375,
"learning_rate": 2e-07,
"loss": -0.06361854076385498,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000096857548,
"reward_std": 0.36670139729976653,
"rewards/MultiModalAccuracyORM": 0.4000000096857548,
"step": 1865,
"train_speed(iter/s)": 0.041031
},
{
"clip_ratio": 0.0,
"completion_length": 8.85,
"epoch": 0.7555555555555555,
"grad_norm": 80.81800079345703,
"kl": 0.08274688720703124,
"learning_rate": 2e-07,
"loss": 0.003989287465810776,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333395421505,
"reward_std": 0.15846449732780457,
"rewards/MultiModalAccuracyORM": 0.3083333395421505,
"step": 1870,
"train_speed(iter/s)": 0.041038
},
{
"clip_ratio": 0.0,
"completion_length": 12.5,
"epoch": 0.7575757575757576,
"grad_norm": 14.617817878723145,
"kl": 0.090728759765625,
"learning_rate": 2e-07,
"loss": -0.0045210480690002445,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666731238365,
"reward_std": 0.19337954819202424,
"rewards/MultiModalAccuracyORM": 0.2666666731238365,
"step": 1875,
"train_speed(iter/s)": 0.041048
},
{
"clip_ratio": 0.0,
"completion_length": 6.65,
"epoch": 0.7595959595959596,
"grad_norm": 13.89445972442627,
"kl": 0.13492431640625,
"learning_rate": 2e-07,
"loss": 0.012078547477722168,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333805203438,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.20833333805203438,
"step": 1880,
"train_speed(iter/s)": 0.041062
},
{
"clip_ratio": 0.0,
"completion_length": 61.05,
"epoch": 0.7616161616161616,
"grad_norm": 11.715389251708984,
"kl": 0.1376861572265625,
"learning_rate": 2e-07,
"loss": -0.014951804280281067,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.40000000298023225,
"reward_std": 0.19337954223155976,
"rewards/MultiModalAccuracyORM": 0.40000000298023225,
"step": 1885,
"train_speed(iter/s)": 0.041063
},
{
"clip_ratio": 0.0,
"completion_length": 15.4,
"epoch": 0.7636363636363637,
"grad_norm": 0.07281157374382019,
"kl": 0.095611572265625,
"learning_rate": 2e-07,
"loss": 0.012891271710395813,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.483333333581686,
"reward_std": 0.12631751000881195,
"rewards/MultiModalAccuracyORM": 0.483333333581686,
"step": 1890,
"train_speed(iter/s)": 0.041073
},
{
"clip_ratio": 0.0,
"completion_length": 57.6,
"epoch": 0.7656565656565657,
"grad_norm": 1.9145233631134033,
"kl": 0.19044036865234376,
"learning_rate": 2e-07,
"loss": -0.03062499463558197,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.35000000447034835,
"reward_std": 0.24490800201892854,
"rewards/MultiModalAccuracyORM": 0.35000000447034835,
"step": 1895,
"train_speed(iter/s)": 0.041073
},
{
"clip_ratio": 0.0,
"completion_length": 15.8,
"epoch": 0.7676767676767676,
"grad_norm": 22.877309799194336,
"kl": 0.161077880859375,
"learning_rate": 2e-07,
"loss": 0.008297159522771835,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.09041781425476074,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 1900,
"train_speed(iter/s)": 0.041078
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.7696969696969697,
"grad_norm": 21.666425704956055,
"kl": 0.1980316162109375,
"learning_rate": 2e-07,
"loss": 0.020768019556999206,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.46666666865348816,
"reward_std": 0.24114990234375,
"rewards/MultiModalAccuracyORM": 0.46666666865348816,
"step": 1905,
"train_speed(iter/s)": 0.041093
},
{
"clip_ratio": 0.0,
"completion_length": 8.25,
"epoch": 0.7717171717171717,
"grad_norm": 22.925674438476562,
"kl": 0.0932861328125,
"learning_rate": 2e-07,
"loss": 0.009479768574237823,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30833334028720855,
"reward_std": 0.2466856449842453,
"rewards/MultiModalAccuracyORM": 0.30833334028720855,
"step": 1910,
"train_speed(iter/s)": 0.0411
},
{
"clip_ratio": 0.0,
"completion_length": 8.65,
"epoch": 0.7737373737373737,
"grad_norm": 0.14844609797000885,
"kl": 0.232122802734375,
"learning_rate": 2e-07,
"loss": 0.010550656914710998,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.6166666686534882,
"reward_std": 0.16454761922359468,
"rewards/MultiModalAccuracyORM": 0.6166666686534882,
"step": 1915,
"train_speed(iter/s)": 0.041111
},
{
"clip_ratio": 0.0,
"completion_length": 29.85,
"epoch": 0.7757575757575758,
"grad_norm": 13.482421875,
"kl": 0.120068359375,
"learning_rate": 2e-07,
"loss": 0.022914706170558928,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4500000044703484,
"reward_std": 0.25292780101299284,
"rewards/MultiModalAccuracyORM": 0.4500000044703484,
"step": 1920,
"train_speed(iter/s)": 0.041122
},
{
"clip_ratio": 0.0,
"completion_length": 8.7,
"epoch": 0.7777777777777778,
"grad_norm": 0.19085177779197693,
"kl": 0.14432373046875,
"learning_rate": 2e-07,
"loss": 0.020079278945922853,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5416666671633721,
"reward_std": 0.18859823644161225,
"rewards/MultiModalAccuracyORM": 0.5416666671633721,
"step": 1925,
"train_speed(iter/s)": 0.04113
},
{
"clip_ratio": 0.0,
"completion_length": 57.35,
"epoch": 0.7797979797979798,
"grad_norm": 0.04123455658555031,
"kl": 0.10629119873046874,
"learning_rate": 2e-07,
"loss": 0.02534767985343933,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4833333358168602,
"reward_std": 0.1652424544095993,
"rewards/MultiModalAccuracyORM": 0.4833333358168602,
"step": 1930,
"train_speed(iter/s)": 0.041128
},
{
"clip_ratio": 0.0,
"completion_length": 13.9,
"epoch": 0.7818181818181819,
"grad_norm": 7.716069221496582,
"kl": 0.03204345703125,
"learning_rate": 2e-07,
"loss": 0.018103978037834166,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000596046447,
"reward_std": 0.3275222271680832,
"rewards/MultiModalAccuracyORM": 0.32500000596046447,
"step": 1935,
"train_speed(iter/s)": 0.041139
},
{
"clip_ratio": 0.0,
"completion_length": 15.0,
"epoch": 0.7838383838383839,
"grad_norm": 1.998159408569336,
"kl": 0.2424346923828125,
"learning_rate": 2e-07,
"loss": -0.0022819479927420616,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666738688946,
"reward_std": 0.2526735752820969,
"rewards/MultiModalAccuracyORM": 0.3416666738688946,
"step": 1940,
"train_speed(iter/s)": 0.041144
},
{
"clip_ratio": 0.0,
"completion_length": 8.25,
"epoch": 0.7858585858585858,
"grad_norm": 0.11755078285932541,
"kl": 0.1235809326171875,
"learning_rate": 2e-07,
"loss": 0.01756092607975006,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4083333358168602,
"reward_std": 0.16145119071006775,
"rewards/MultiModalAccuracyORM": 0.4083333358168602,
"step": 1945,
"train_speed(iter/s)": 0.041156
},
{
"clip_ratio": 0.0,
"completion_length": 29.45,
"epoch": 0.7878787878787878,
"grad_norm": 11.287028312683105,
"kl": 0.05250396728515625,
"learning_rate": 2e-07,
"loss": -0.009032456576824189,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.36666667386889457,
"reward_std": 0.30639870166778566,
"rewards/MultiModalAccuracyORM": 0.36666667386889457,
"step": 1950,
"train_speed(iter/s)": 0.041159
},
{
"clip_ratio": 0.0,
"completion_length": 7.6,
"epoch": 0.7898989898989899,
"grad_norm": 0.1284160166978836,
"kl": 0.046563720703125,
"learning_rate": 2e-07,
"loss": 0.0006015380378812552,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.6250000067055226,
"reward_std": 0.1973894417285919,
"rewards/MultiModalAccuracyORM": 0.6250000067055226,
"step": 1955,
"train_speed(iter/s)": 0.041172
},
{
"clip_ratio": 0.0,
"completion_length": 20.05,
"epoch": 0.7919191919191919,
"grad_norm": 0.5048889517784119,
"kl": 0.0877197265625,
"learning_rate": 2e-07,
"loss": 0.0017469068989157677,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.125,
"reward_std": 0.045226702094078065,
"rewards/MultiModalAccuracyORM": 0.125,
"step": 1960,
"train_speed(iter/s)": 0.041177
},
{
"clip_ratio": 0.0,
"completion_length": 23.7,
"epoch": 0.793939393939394,
"grad_norm": 10.217628479003906,
"kl": 0.1369842529296875,
"learning_rate": 2e-07,
"loss": -0.007052314281463623,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4083333395421505,
"reward_std": 0.19968129992485045,
"rewards/MultiModalAccuracyORM": 0.4083333395421505,
"step": 1965,
"train_speed(iter/s)": 0.041181
},
{
"clip_ratio": 0.0,
"completion_length": 31.1,
"epoch": 0.795959595959596,
"grad_norm": 15.147607803344727,
"kl": 0.139697265625,
"learning_rate": 2e-07,
"loss": -0.0005793333053588867,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000521540643,
"reward_std": 0.26928699016571045,
"rewards/MultiModalAccuracyORM": 0.30000000521540643,
"step": 1970,
"train_speed(iter/s)": 0.041179
},
{
"clip_ratio": 0.0,
"completion_length": 17.05,
"epoch": 0.797979797979798,
"grad_norm": 14.508552551269531,
"kl": 0.1334228515625,
"learning_rate": 2e-07,
"loss": 0.014681649208068848,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000447034836,
"reward_std": 0.31593895256519317,
"rewards/MultiModalAccuracyORM": 0.30000000447034836,
"step": 1975,
"train_speed(iter/s)": 0.041186
},
{
"clip_ratio": 0.0,
"completion_length": 11.4,
"epoch": 0.8,
"grad_norm": 14.245569229125977,
"kl": 0.07449951171875,
"learning_rate": 2e-07,
"loss": 0.019247731566429137,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333334252238275,
"reward_std": 0.256683474779129,
"rewards/MultiModalAccuracyORM": 0.23333334252238275,
"step": 1980,
"train_speed(iter/s)": 0.041204
},
{
"clip_ratio": 0.0,
"completion_length": 19.0,
"epoch": 0.802020202020202,
"grad_norm": 0.06112133339047432,
"kl": 0.09664306640625,
"learning_rate": 2e-07,
"loss": -0.010070499032735825,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45000000223517417,
"reward_std": 0.17555411159992218,
"rewards/MultiModalAccuracyORM": 0.45000000223517417,
"step": 1985,
"train_speed(iter/s)": 0.04121
},
{
"clip_ratio": 0.0,
"completion_length": 51.2,
"epoch": 0.804040404040404,
"grad_norm": 0.20859137177467346,
"kl": 0.2631103515625,
"learning_rate": 2e-07,
"loss": -0.03446192741394043,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666679084301,
"reward_std": 0.15194410383701323,
"rewards/MultiModalAccuracyORM": 0.2666666679084301,
"step": 1990,
"train_speed(iter/s)": 0.041215
},
{
"clip_ratio": 0.0,
"completion_length": 14.6,
"epoch": 0.806060606060606,
"grad_norm": 2.347874879837036,
"kl": 0.09171142578125,
"learning_rate": 2e-07,
"loss": 0.003209712356328964,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4833333395421505,
"reward_std": 0.1770799547433853,
"rewards/MultiModalAccuracyORM": 0.4833333395421505,
"step": 1995,
"train_speed(iter/s)": 0.041215
},
{
"epoch": 0.8080808080808081,
"grad_norm": 12.103494644165039,
"learning_rate": 2e-07,
"loss": 0.051232755184173584,
"memory(GiB)": 104.49,
"step": 2000,
"train_speed(iter/s)": 0.041214
},
{
"epoch": 0.8080808080808081,
"eval_clip_ratio": 0.0,
"eval_completion_length": 32.68000123023987,
"eval_kl": 0.1109576416015625,
"eval_loss": 0.001846806495450437,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.4066666714847088,
"eval_reward_std": 0.1827806031703949,
"eval_rewards/MultiModalAccuracyORM": 0.4066666714847088,
"eval_runtime": 274.3294,
"eval_samples_per_second": 0.182,
"eval_steps_per_second": 0.018,
"step": 2000
},
{
"clip_ratio": 0.0,
"completion_length": 17.275,
"epoch": 0.8101010101010101,
"grad_norm": 20.72494888305664,
"kl": 0.09075469970703125,
"learning_rate": 2e-07,
"loss": 0.01332613080739975,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667200624942,
"reward_std": 0.22227564305067063,
"rewards/MultiModalAccuracyORM": 0.21666667200624942,
"step": 2005,
"train_speed(iter/s)": 0.04093
},
{
"clip_ratio": 0.0,
"completion_length": 15.2,
"epoch": 0.8121212121212121,
"grad_norm": 10.545307159423828,
"kl": 0.157220458984375,
"learning_rate": 2e-07,
"loss": 0.02192305028438568,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2666666716337204,
"reward_std": 0.21999078691005708,
"rewards/MultiModalAccuracyORM": 0.2666666716337204,
"step": 2010,
"train_speed(iter/s)": 0.040938
},
{
"clip_ratio": 0.0,
"completion_length": 7.65,
"epoch": 0.8141414141414142,
"grad_norm": 0.1491260975599289,
"kl": 0.1144989013671875,
"learning_rate": 2e-07,
"loss": 0.021004287898540495,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.40000000298023225,
"reward_std": 0.17456946671009063,
"rewards/MultiModalAccuracyORM": 0.40000000298023225,
"step": 2015,
"train_speed(iter/s)": 0.040944
},
{
"clip_ratio": 0.0,
"completion_length": 21.6,
"epoch": 0.8161616161616162,
"grad_norm": 19.212770462036133,
"kl": 0.0832275390625,
"learning_rate": 2e-07,
"loss": 0.004856839030981064,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500001043081285,
"reward_std": 0.2800416827201843,
"rewards/MultiModalAccuracyORM": 0.27500001043081285,
"step": 2020,
"train_speed(iter/s)": 0.040949
},
{
"clip_ratio": 0.0,
"completion_length": 20.2,
"epoch": 0.8181818181818182,
"grad_norm": 0.25410985946655273,
"kl": 0.129962158203125,
"learning_rate": 2e-07,
"loss": 0.016422802209854127,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5250000059604645,
"reward_std": 0.09041781425476074,
"rewards/MultiModalAccuracyORM": 0.5250000059604645,
"step": 2025,
"train_speed(iter/s)": 0.040961
},
{
"clip_ratio": 0.0,
"completion_length": 8.25,
"epoch": 0.8202020202020202,
"grad_norm": 6.931528568267822,
"kl": 0.260528564453125,
"learning_rate": 2e-07,
"loss": -0.02277086079120636,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5583333410322666,
"reward_std": 0.2526735752820969,
"rewards/MultiModalAccuracyORM": 0.5583333410322666,
"step": 2030,
"train_speed(iter/s)": 0.040965
},
{
"clip_ratio": 0.0,
"completion_length": 15.7,
"epoch": 0.8222222222222222,
"grad_norm": 27.311315536499023,
"kl": 0.07995872497558594,
"learning_rate": 2e-07,
"loss": 0.024982047080993653,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666753590107,
"reward_std": 0.3019101768732071,
"rewards/MultiModalAccuracyORM": 0.3416666753590107,
"step": 2035,
"train_speed(iter/s)": 0.040969
},
{
"clip_ratio": 0.0,
"completion_length": 25.5,
"epoch": 0.8242424242424242,
"grad_norm": 0.08455629646778107,
"kl": 0.1283721923828125,
"learning_rate": 2e-07,
"loss": 0.007968991994857788,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2083333395421505,
"reward_std": 0.2167353242635727,
"rewards/MultiModalAccuracyORM": 0.2083333395421505,
"step": 2040,
"train_speed(iter/s)": 0.040978
},
{
"clip_ratio": 0.0,
"completion_length": 17.7,
"epoch": 0.8262626262626263,
"grad_norm": 0.012692108750343323,
"kl": 0.06329593658447266,
"learning_rate": 2e-07,
"loss": 0.019880211353302,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4750000022351742,
"reward_std": 0.1559540092945099,
"rewards/MultiModalAccuracyORM": 0.4750000022351742,
"step": 2045,
"train_speed(iter/s)": 0.040984
},
{
"clip_ratio": 0.0,
"completion_length": 18.25,
"epoch": 0.8282828282828283,
"grad_norm": 0.49161991477012634,
"kl": 0.041827392578125,
"learning_rate": 2e-07,
"loss": 0.023220118880271912,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333358168602,
"reward_std": 0.13882583379745483,
"rewards/MultiModalAccuracyORM": 0.2583333358168602,
"step": 2050,
"train_speed(iter/s)": 0.040982
},
{
"clip_ratio": 0.0,
"completion_length": 5.25,
"epoch": 0.8303030303030303,
"grad_norm": 3.920830249786377,
"kl": 0.130963134765625,
"learning_rate": 2e-07,
"loss": 0.012984590232372284,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3666666753590107,
"reward_std": 0.3127244710922241,
"rewards/MultiModalAccuracyORM": 0.3666666753590107,
"step": 2055,
"train_speed(iter/s)": 0.040988
},
{
"clip_ratio": 0.0,
"completion_length": 13.95,
"epoch": 0.8323232323232324,
"grad_norm": 2.618926763534546,
"kl": 0.0820068359375,
"learning_rate": 2e-07,
"loss": -0.0011547774076461792,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666738688946,
"reward_std": 0.174509859085083,
"rewards/MultiModalAccuracyORM": 0.3166666738688946,
"step": 2060,
"train_speed(iter/s)": 0.040997
},
{
"clip_ratio": 0.0,
"completion_length": 4.05,
"epoch": 0.8343434343434344,
"grad_norm": 21.554759979248047,
"kl": 0.2670654296875,
"learning_rate": 2e-07,
"loss": 0.008714067935943603,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000059604645,
"reward_std": 0.22781596183776856,
"rewards/MultiModalAccuracyORM": 0.2750000059604645,
"step": 2065,
"train_speed(iter/s)": 0.041007
},
{
"clip_ratio": 0.0,
"completion_length": 24.4,
"epoch": 0.8363636363636363,
"grad_norm": 0.038795698434114456,
"kl": 0.09162445068359375,
"learning_rate": 2e-07,
"loss": 0.01877760738134384,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2416666679084301,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.2416666679084301,
"step": 2070,
"train_speed(iter/s)": 0.041016
},
{
"clip_ratio": 0.0,
"completion_length": 24.65,
"epoch": 0.8383838383838383,
"grad_norm": 0.5922779440879822,
"kl": 0.17679443359375,
"learning_rate": 2e-07,
"loss": 0.007905527949333191,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/MultiModalAccuracyORM": 0.5,
"step": 2075,
"train_speed(iter/s)": 0.041029
},
{
"clip_ratio": 0.0,
"completion_length": 7.05,
"epoch": 0.8404040404040404,
"grad_norm": 0.48757824301719666,
"kl": 0.1322998046875,
"learning_rate": 2e-07,
"loss": 0.006768345832824707,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5333333358168602,
"reward_std": 0.1356445163488388,
"rewards/MultiModalAccuracyORM": 0.5333333358168602,
"step": 2080,
"train_speed(iter/s)": 0.04104
},
{
"clip_ratio": 0.0,
"completion_length": 8.2,
"epoch": 0.8424242424242424,
"grad_norm": 7.100019931793213,
"kl": 0.09602890014648438,
"learning_rate": 2e-07,
"loss": -0.010533835738897324,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833334103226663,
"reward_std": 0.22001479864120482,
"rewards/MultiModalAccuracyORM": 0.25833334103226663,
"step": 2085,
"train_speed(iter/s)": 0.041047
},
{
"clip_ratio": 0.0,
"completion_length": 9.15,
"epoch": 0.8444444444444444,
"grad_norm": 10.953103065490723,
"kl": 0.205523681640625,
"learning_rate": 2e-07,
"loss": 0.07547287940979004,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333333805203438,
"reward_std": 0.16852193474769592,
"rewards/MultiModalAccuracyORM": 0.23333333805203438,
"step": 2090,
"train_speed(iter/s)": 0.041052
},
{
"clip_ratio": 0.0,
"completion_length": 6.75,
"epoch": 0.8464646464646465,
"grad_norm": 4.194830894470215,
"kl": 0.0806396484375,
"learning_rate": 2e-07,
"loss": -0.017879560589790344,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3916666693985462,
"reward_std": 0.11702905893325806,
"rewards/MultiModalAccuracyORM": 0.3916666693985462,
"step": 2095,
"train_speed(iter/s)": 0.041059
},
{
"clip_ratio": 0.0,
"completion_length": 23.75,
"epoch": 0.8484848484848485,
"grad_norm": 0.12948361039161682,
"kl": 0.13734283447265624,
"learning_rate": 2e-07,
"loss": -0.01447494924068451,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4750000111758709,
"reward_std": 0.2338038980960846,
"rewards/MultiModalAccuracyORM": 0.4750000111758709,
"step": 2100,
"train_speed(iter/s)": 0.041064
},
{
"clip_ratio": 0.0,
"completion_length": 22.05,
"epoch": 0.8505050505050505,
"grad_norm": 31.3735294342041,
"kl": 0.177423095703125,
"learning_rate": 2e-07,
"loss": -0.0017697295174002648,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25000000521540644,
"reward_std": 0.243092343211174,
"rewards/MultiModalAccuracyORM": 0.25000000521540644,
"step": 2105,
"train_speed(iter/s)": 0.041067
},
{
"clip_ratio": 0.0,
"completion_length": 4.65,
"epoch": 0.8525252525252526,
"grad_norm": 2.228029251098633,
"kl": 0.14156494140625,
"learning_rate": 2e-07,
"loss": -0.010953420400619506,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5166666679084301,
"reward_std": 0.22297748625278474,
"rewards/MultiModalAccuracyORM": 0.5166666679084301,
"step": 2110,
"train_speed(iter/s)": 0.041074
},
{
"clip_ratio": 0.0,
"completion_length": 47.35,
"epoch": 0.8545454545454545,
"grad_norm": 0.3235064446926117,
"kl": 0.19440174102783203,
"learning_rate": 2e-07,
"loss": -0.010122859477996826,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2750000074505806,
"reward_std": 0.1888910174369812,
"rewards/MultiModalAccuracyORM": 0.2750000074505806,
"step": 2115,
"train_speed(iter/s)": 0.041074
},
{
"clip_ratio": 0.0,
"completion_length": 34.1,
"epoch": 0.8565656565656565,
"grad_norm": 9.72260856628418,
"kl": 0.18918914794921876,
"learning_rate": 2e-07,
"loss": 0.0024737130850553514,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45833334028720857,
"reward_std": 0.16925235390663146,
"rewards/MultiModalAccuracyORM": 0.45833334028720857,
"step": 2120,
"train_speed(iter/s)": 0.041072
},
{
"clip_ratio": 0.0,
"completion_length": 42.2,
"epoch": 0.8585858585858586,
"grad_norm": 9.817282676696777,
"kl": 0.299951171875,
"learning_rate": 2e-07,
"loss": 0.00935778021812439,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.17500000074505806,
"reward_std": 0.15824586153030396,
"rewards/MultiModalAccuracyORM": 0.17500000074505806,
"step": 2125,
"train_speed(iter/s)": 0.041077
},
{
"clip_ratio": 0.0,
"completion_length": 7.6,
"epoch": 0.8606060606060606,
"grad_norm": 0.3442615568637848,
"kl": 0.17735595703125,
"learning_rate": 2e-07,
"loss": 0.006759631633758545,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.48333334028720853,
"reward_std": 0.2730426698923111,
"rewards/MultiModalAccuracyORM": 0.48333334028720853,
"step": 2130,
"train_speed(iter/s)": 0.041087
},
{
"clip_ratio": 0.0,
"completion_length": 79.15,
"epoch": 0.8626262626262626,
"grad_norm": 0.520937979221344,
"kl": 0.07093505859375,
"learning_rate": 2e-07,
"loss": -0.012908129394054413,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000001043081284,
"reward_std": 0.1996457099914551,
"rewards/MultiModalAccuracyORM": 0.20000001043081284,
"step": 2135,
"train_speed(iter/s)": 0.041077
},
{
"clip_ratio": 0.0,
"completion_length": 24.9,
"epoch": 0.8646464646464647,
"grad_norm": 1.4221155643463135,
"kl": 0.132373046875,
"learning_rate": 2e-07,
"loss": -0.07007729411125183,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.21422483026981354,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 2140,
"train_speed(iter/s)": 0.041076
},
{
"clip_ratio": 0.0,
"completion_length": 34.4,
"epoch": 0.8666666666666667,
"grad_norm": 19.47251319885254,
"kl": 0.0463134765625,
"learning_rate": 2e-07,
"loss": 0.02097744941711426,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3666666723787785,
"reward_std": 0.25897533297538755,
"rewards/MultiModalAccuracyORM": 0.3666666723787785,
"step": 2145,
"train_speed(iter/s)": 0.041079
},
{
"clip_ratio": 0.0,
"completion_length": 35.05,
"epoch": 0.8686868686868687,
"grad_norm": 0.0365481972694397,
"kl": 0.07025909423828125,
"learning_rate": 2e-07,
"loss": -0.00900230035185814,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5666666716337204,
"reward_std": 0.1295969843864441,
"rewards/MultiModalAccuracyORM": 0.5666666716337204,
"step": 2150,
"train_speed(iter/s)": 0.041076
},
{
"clip_ratio": 0.0,
"completion_length": 15.55,
"epoch": 0.8707070707070707,
"grad_norm": 3.220684051513672,
"kl": 0.11529541015625,
"learning_rate": 2e-07,
"loss": 0.05271543264389038,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.23333334177732468,
"reward_std": 0.3222196638584137,
"rewards/MultiModalAccuracyORM": 0.23333334177732468,
"step": 2155,
"train_speed(iter/s)": 0.041085
},
{
"clip_ratio": 0.0,
"completion_length": 30.5,
"epoch": 0.8727272727272727,
"grad_norm": 21.94721031188965,
"kl": 0.1160491943359375,
"learning_rate": 2e-07,
"loss": 0.0024079522117972374,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20000000447034835,
"reward_std": 0.24009110629558564,
"rewards/MultiModalAccuracyORM": 0.20000000447034835,
"step": 2160,
"train_speed(iter/s)": 0.04109
},
{
"clip_ratio": 0.0,
"completion_length": 16.7,
"epoch": 0.8747474747474747,
"grad_norm": 20.038494110107422,
"kl": 0.1658905029296875,
"learning_rate": 2e-07,
"loss": 0.04994232654571533,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.533333345502615,
"reward_std": 0.325963220000267,
"rewards/MultiModalAccuracyORM": 0.533333345502615,
"step": 2165,
"train_speed(iter/s)": 0.041101
},
{
"clip_ratio": 0.0,
"completion_length": 28.45,
"epoch": 0.8767676767676768,
"grad_norm": 2.1534128189086914,
"kl": 0.0698028564453125,
"learning_rate": 2e-07,
"loss": -0.025438961386680604,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.21666667014360427,
"reward_std": 0.21524804830551147,
"rewards/MultiModalAccuracyORM": 0.21666667014360427,
"step": 2170,
"train_speed(iter/s)": 0.041105
},
{
"clip_ratio": 0.0,
"completion_length": 7.25,
"epoch": 0.8787878787878788,
"grad_norm": 6.415175437927246,
"kl": 0.0926239013671875,
"learning_rate": 2e-07,
"loss": -0.007227879762649536,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.44166667610406873,
"reward_std": 0.32300969064235685,
"rewards/MultiModalAccuracyORM": 0.44166667610406873,
"step": 2175,
"train_speed(iter/s)": 0.041115
},
{
"clip_ratio": 0.0,
"completion_length": 7.3,
"epoch": 0.8808080808080808,
"grad_norm": 0.38973256945610046,
"kl": 0.13404541015625,
"learning_rate": 2e-07,
"loss": 0.023914989829063416,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4000000059604645,
"reward_std": 0.1896214485168457,
"rewards/MultiModalAccuracyORM": 0.4000000059604645,
"step": 2180,
"train_speed(iter/s)": 0.041124
},
{
"clip_ratio": 0.0,
"completion_length": 10.9,
"epoch": 0.8828282828282829,
"grad_norm": 0.12656661868095398,
"kl": 0.15858612060546876,
"learning_rate": 2e-07,
"loss": 0.008176784217357635,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666679084301,
"reward_std": 0.07810411453247071,
"rewards/MultiModalAccuracyORM": 0.3416666679084301,
"step": 2185,
"train_speed(iter/s)": 0.041129
},
{
"clip_ratio": 0.0,
"completion_length": 61.4,
"epoch": 0.8848484848484849,
"grad_norm": 2.246829032897949,
"kl": 0.05509033203125,
"learning_rate": 2e-07,
"loss": 0.0310079425573349,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666708886623,
"reward_std": 0.22704698145389557,
"rewards/MultiModalAccuracyORM": 0.3416666708886623,
"step": 2190,
"train_speed(iter/s)": 0.041118
},
{
"clip_ratio": 0.0,
"completion_length": 11.15,
"epoch": 0.8868686868686869,
"grad_norm": 0.3648838996887207,
"kl": 0.1862060546875,
"learning_rate": 2e-07,
"loss": -0.014291207492351531,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4416666813194752,
"reward_std": 0.29006352424621584,
"rewards/MultiModalAccuracyORM": 0.4416666813194752,
"step": 2195,
"train_speed(iter/s)": 0.041124
},
{
"clip_ratio": 0.0,
"completion_length": 11.0,
"epoch": 0.8888888888888888,
"grad_norm": 5.710547924041748,
"kl": 0.1601806640625,
"learning_rate": 2e-07,
"loss": 0.0010113120079040527,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666708886623,
"reward_std": 0.15824586153030396,
"rewards/MultiModalAccuracyORM": 0.3416666708886623,
"step": 2200,
"train_speed(iter/s)": 0.041137
},
{
"clip_ratio": 0.0,
"completion_length": 47.85,
"epoch": 0.8909090909090909,
"grad_norm": 0.11420593410730362,
"kl": 0.1681976318359375,
"learning_rate": 2e-07,
"loss": 0.0913887619972229,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1083333358168602,
"reward_std": 0.18262484967708587,
"rewards/MultiModalAccuracyORM": 0.1083333358168602,
"step": 2205,
"train_speed(iter/s)": 0.041132
},
{
"clip_ratio": 0.0,
"completion_length": 28.25,
"epoch": 0.8929292929292929,
"grad_norm": 21.853090286254883,
"kl": 0.10088920593261719,
"learning_rate": 2e-07,
"loss": 0.0005557646509259939,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.47500000819563865,
"reward_std": 0.16451202929019929,
"rewards/MultiModalAccuracyORM": 0.47500000819563865,
"step": 2210,
"train_speed(iter/s)": 0.041127
},
{
"clip_ratio": 0.0,
"completion_length": 34.95,
"epoch": 0.8949494949494949,
"grad_norm": 0.11827383190393448,
"kl": 0.1541900634765625,
"learning_rate": 2e-07,
"loss": 0.0488810658454895,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3000000059604645,
"reward_std": 0.22625695466995238,
"rewards/MultiModalAccuracyORM": 0.3000000059604645,
"step": 2215,
"train_speed(iter/s)": 0.041123
},
{
"clip_ratio": 0.0,
"completion_length": 42.2,
"epoch": 0.896969696969697,
"grad_norm": 10.474591255187988,
"kl": 0.10148773193359376,
"learning_rate": 2e-07,
"loss": -0.004365795105695724,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25000000521540644,
"reward_std": 0.25591449439525604,
"rewards/MultiModalAccuracyORM": 0.25000000521540644,
"step": 2220,
"train_speed(iter/s)": 0.041126
},
{
"clip_ratio": 0.0,
"completion_length": 40.2,
"epoch": 0.898989898989899,
"grad_norm": 0.02211969904601574,
"kl": 0.0304107666015625,
"learning_rate": 2e-07,
"loss": 0.0038854777812957764,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.233333333581686,
"reward_std": 0.07409421503543853,
"rewards/MultiModalAccuracyORM": 0.233333333581686,
"step": 2225,
"train_speed(iter/s)": 0.041135
},
{
"clip_ratio": 0.0,
"completion_length": 12.35,
"epoch": 0.901010101010101,
"grad_norm": 11.09273910522461,
"kl": 0.1110137939453125,
"learning_rate": 2e-07,
"loss": 0.0425330251455307,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.1000000037252903,
"reward_std": 0.16852192878723143,
"rewards/MultiModalAccuracyORM": 0.1000000037252903,
"step": 2230,
"train_speed(iter/s)": 0.041144
},
{
"clip_ratio": 0.0,
"completion_length": 14.8,
"epoch": 0.9030303030303031,
"grad_norm": 17.634380340576172,
"kl": 0.215167236328125,
"learning_rate": 2e-07,
"loss": 0.03751255869865418,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3833333358168602,
"reward_std": 0.10697162747383118,
"rewards/MultiModalAccuracyORM": 0.3833333358168602,
"step": 2235,
"train_speed(iter/s)": 0.041143
},
{
"clip_ratio": 0.0,
"completion_length": 6.8,
"epoch": 0.9050505050505051,
"grad_norm": 0.31089159846305847,
"kl": 0.1969482421875,
"learning_rate": 2e-07,
"loss": 0.011410205066204071,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666738688946,
"reward_std": 0.18108985424041749,
"rewards/MultiModalAccuracyORM": 0.3416666738688946,
"step": 2240,
"train_speed(iter/s)": 0.041157
},
{
"clip_ratio": 0.0,
"completion_length": 22.1,
"epoch": 0.907070707070707,
"grad_norm": 0.033987369388341904,
"kl": 0.1924041748046875,
"learning_rate": 2e-07,
"loss": 0.0015319785103201865,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.22500000298023223,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.22500000298023223,
"step": 2245,
"train_speed(iter/s)": 0.041165
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.06531964987516403,
"learning_rate": 2e-07,
"loss": -0.01111970990896225,
"memory(GiB)": 104.49,
"step": 2250,
"train_speed(iter/s)": 0.041175
},
{
"epoch": 0.9090909090909091,
"eval_clip_ratio": 0.0,
"eval_completion_length": 33.406667890548704,
"eval_kl": 0.133411865234375,
"eval_loss": -0.00466223806142807,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.441666671782732,
"eval_reward_std": 0.1628412437438965,
"eval_rewards/MultiModalAccuracyORM": 0.441666671782732,
"eval_runtime": 272.4154,
"eval_samples_per_second": 0.184,
"eval_steps_per_second": 0.018,
"step": 2250
},
{
"clip_ratio": 0.0,
"completion_length": 23.525,
"epoch": 0.9111111111111111,
"grad_norm": 0.03232080861926079,
"kl": 0.22264862060546875,
"learning_rate": 2e-07,
"loss": 0.028143799304962157,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4166666720062494,
"reward_std": 0.12746492475271226,
"rewards/MultiModalAccuracyORM": 0.4166666720062494,
"step": 2255,
"train_speed(iter/s)": 0.040921
},
{
"clip_ratio": 0.0,
"completion_length": 11.4,
"epoch": 0.9131313131313131,
"grad_norm": 0.06567571312189102,
"kl": 0.08049087524414063,
"learning_rate": 2e-07,
"loss": 0.031807747483253476,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.45833333507180213,
"reward_std": 0.14564234614372254,
"rewards/MultiModalAccuracyORM": 0.45833333507180213,
"step": 2260,
"train_speed(iter/s)": 0.040927
},
{
"clip_ratio": 0.0,
"completion_length": 21.6,
"epoch": 0.9151515151515152,
"grad_norm": 0.668204665184021,
"kl": 0.1173919677734375,
"learning_rate": 2e-07,
"loss": -0.03886902332305908,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333333432674407,
"reward_std": 0.16830329298973085,
"rewards/MultiModalAccuracyORM": 0.28333333432674407,
"step": 2265,
"train_speed(iter/s)": 0.040936
},
{
"clip_ratio": 0.0,
"completion_length": 44.0,
"epoch": 0.9171717171717172,
"grad_norm": 0.16663120687007904,
"kl": 0.1005615234375,
"learning_rate": 2e-07,
"loss": 0.011882781982421875,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3083333380520344,
"reward_std": 0.2659719318151474,
"rewards/MultiModalAccuracyORM": 0.3083333380520344,
"step": 2270,
"train_speed(iter/s)": 0.040942
},
{
"clip_ratio": 0.0,
"completion_length": 17.5,
"epoch": 0.9191919191919192,
"grad_norm": 18.440631866455078,
"kl": 0.121307373046875,
"learning_rate": 2e-07,
"loss": 0.01434231996536255,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.33333333805203436,
"reward_std": 0.17075418531894684,
"rewards/MultiModalAccuracyORM": 0.33333333805203436,
"step": 2275,
"train_speed(iter/s)": 0.040956
},
{
"clip_ratio": 0.0,
"completion_length": 18.25,
"epoch": 0.9212121212121213,
"grad_norm": 30.9835147857666,
"kl": 0.118048095703125,
"learning_rate": 2e-07,
"loss": 0.012100108712911607,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3583333373069763,
"reward_std": 0.17781037986278533,
"rewards/MultiModalAccuracyORM": 0.3583333373069763,
"step": 2280,
"train_speed(iter/s)": 0.040966
},
{
"clip_ratio": 0.0,
"completion_length": 13.3,
"epoch": 0.9232323232323232,
"grad_norm": 6.152209758758545,
"kl": 0.3557861328125,
"learning_rate": 2e-07,
"loss": -0.025510752201080324,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000074505806,
"reward_std": 0.1652424544095993,
"rewards/MultiModalAccuracyORM": 0.30000000074505806,
"step": 2285,
"train_speed(iter/s)": 0.040978
},
{
"clip_ratio": 0.0,
"completion_length": 47.3,
"epoch": 0.9252525252525252,
"grad_norm": 22.69240951538086,
"kl": 0.155462646484375,
"learning_rate": 2e-07,
"loss": -0.0011336962692439557,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.21422483026981354,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 2290,
"train_speed(iter/s)": 0.040975
},
{
"clip_ratio": 0.0,
"completion_length": 9.15,
"epoch": 0.9272727272727272,
"grad_norm": 0.06437839567661285,
"kl": 0.16920166015625,
"learning_rate": 2e-07,
"loss": 0.0063018262386322025,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833333730697633,
"reward_std": 0.07810411453247071,
"rewards/MultiModalAccuracyORM": 0.25833333730697633,
"step": 2295,
"train_speed(iter/s)": 0.040991
},
{
"clip_ratio": 0.0,
"completion_length": 56.8,
"epoch": 0.9292929292929293,
"grad_norm": 0.896676778793335,
"kl": 0.12425537109375,
"learning_rate": 2e-07,
"loss": 0.01196231171488762,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2916666716337204,
"reward_std": 0.29564401507377625,
"rewards/MultiModalAccuracyORM": 0.2916666716337204,
"step": 2300,
"train_speed(iter/s)": 0.040994
},
{
"clip_ratio": 0.0,
"completion_length": 16.8,
"epoch": 0.9313131313131313,
"grad_norm": 1.9378466606140137,
"kl": 0.067706298828125,
"learning_rate": 2e-07,
"loss": -0.021140041947364806,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333410322666,
"reward_std": 0.2504173070192337,
"rewards/MultiModalAccuracyORM": 0.4333333410322666,
"step": 2305,
"train_speed(iter/s)": 0.041006
},
{
"clip_ratio": 0.0,
"completion_length": 9.95,
"epoch": 0.9333333333333333,
"grad_norm": 0.4809723496437073,
"kl": 0.1289764404296875,
"learning_rate": 2e-07,
"loss": 0.003021649643778801,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.10833333730697632,
"reward_std": 0.2071926474571228,
"rewards/MultiModalAccuracyORM": 0.10833333730697632,
"step": 2310,
"train_speed(iter/s)": 0.041021
},
{
"clip_ratio": 0.0,
"completion_length": 42.75,
"epoch": 0.9353535353535354,
"grad_norm": 0.06879542768001556,
"kl": 0.09110107421875,
"learning_rate": 2e-07,
"loss": -0.004359513521194458,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.47500000298023226,
"reward_std": 0.2159808874130249,
"rewards/MultiModalAccuracyORM": 0.47500000298023226,
"step": 2315,
"train_speed(iter/s)": 0.041036
},
{
"clip_ratio": 0.0,
"completion_length": 13.2,
"epoch": 0.9373737373737374,
"grad_norm": 0.226049542427063,
"kl": 0.09764404296875,
"learning_rate": 2e-07,
"loss": 0.0010025198571383953,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.20833333805203438,
"reward_std": 0.12552748322486879,
"rewards/MultiModalAccuracyORM": 0.20833333805203438,
"step": 2320,
"train_speed(iter/s)": 0.04105
},
{
"clip_ratio": 0.0,
"completion_length": 6.95,
"epoch": 0.9393939393939394,
"grad_norm": 7.9168314933776855,
"kl": 0.0877655029296875,
"learning_rate": 2e-07,
"loss": 0.06811027526855469,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5333333387970924,
"reward_std": 0.21149236261844634,
"rewards/MultiModalAccuracyORM": 0.5333333387970924,
"step": 2325,
"train_speed(iter/s)": 0.041062
},
{
"clip_ratio": 0.0,
"completion_length": 10.2,
"epoch": 0.9414141414141414,
"grad_norm": 0.2699204385280609,
"kl": 0.180908203125,
"learning_rate": 2e-07,
"loss": 0.0060350816696882244,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.27500000447034834,
"reward_std": 0.09041781425476074,
"rewards/MultiModalAccuracyORM": 0.27500000447034834,
"step": 2330,
"train_speed(iter/s)": 0.041071
},
{
"clip_ratio": 0.0,
"completion_length": 12.55,
"epoch": 0.9434343434343434,
"grad_norm": 27.749364852905273,
"kl": 0.22237548828125,
"learning_rate": 2e-07,
"loss": 0.08456591367721558,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.29166667312383654,
"reward_std": 0.1888910174369812,
"rewards/MultiModalAccuracyORM": 0.29166667312383654,
"step": 2335,
"train_speed(iter/s)": 0.041075
},
{
"clip_ratio": 0.0,
"completion_length": 69.25,
"epoch": 0.9454545454545454,
"grad_norm": 5.552628517150879,
"kl": 0.0543975830078125,
"learning_rate": 2e-07,
"loss": -0.05388938784599304,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.07500000149011612,
"reward_std": 0.19962169826030732,
"rewards/MultiModalAccuracyORM": 0.07500000149011612,
"step": 2340,
"train_speed(iter/s)": 0.041078
},
{
"clip_ratio": 0.0,
"completion_length": 7.7,
"epoch": 0.9474747474747475,
"grad_norm": 9.49284839630127,
"kl": 0.11671142578125,
"learning_rate": 2e-07,
"loss": -0.00172628965228796,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.19166667014360428,
"reward_std": 0.22629254460334777,
"rewards/MultiModalAccuracyORM": 0.19166667014360428,
"step": 2345,
"train_speed(iter/s)": 0.04109
},
{
"clip_ratio": 0.0,
"completion_length": 56.05,
"epoch": 0.9494949494949495,
"grad_norm": 3.0689406394958496,
"kl": 0.09317855834960938,
"learning_rate": 2e-07,
"loss": 0.013809925317764283,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.31666667088866235,
"reward_std": 0.3167103588581085,
"rewards/MultiModalAccuracyORM": 0.31666667088866235,
"step": 2350,
"train_speed(iter/s)": 0.041098
},
{
"clip_ratio": 0.0,
"completion_length": 12.8,
"epoch": 0.9515151515151515,
"grad_norm": 0.1557140052318573,
"kl": 0.280633544921875,
"learning_rate": 2e-07,
"loss": -0.00421304777264595,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666666865348817,
"reward_std": 0.07409421503543853,
"rewards/MultiModalAccuracyORM": 0.41666666865348817,
"step": 2355,
"train_speed(iter/s)": 0.041108
},
{
"clip_ratio": 0.0,
"completion_length": 30.65,
"epoch": 0.9535353535353536,
"grad_norm": 7.580443382263184,
"kl": 0.096343994140625,
"learning_rate": 2e-07,
"loss": -0.022874367237091065,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.6333333358168602,
"reward_std": 0.17861495018005372,
"rewards/MultiModalAccuracyORM": 0.6333333358168602,
"step": 2360,
"train_speed(iter/s)": 0.041118
},
{
"clip_ratio": 0.0,
"completion_length": 12.8,
"epoch": 0.9555555555555556,
"grad_norm": 0.11349290609359741,
"kl": 0.21148681640625,
"learning_rate": 2e-07,
"loss": -6.924470653757453e-05,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5166666708886624,
"reward_std": 0.1840525358915329,
"rewards/MultiModalAccuracyORM": 0.5166666708886624,
"step": 2365,
"train_speed(iter/s)": 0.041134
},
{
"clip_ratio": 0.0,
"completion_length": 7.8,
"epoch": 0.9575757575757575,
"grad_norm": 16.9438419342041,
"kl": 0.20333251953125,
"learning_rate": 2e-07,
"loss": 0.008581924438476562,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.41666666939854624,
"reward_std": 0.16225576102733613,
"rewards/MultiModalAccuracyORM": 0.41666666939854624,
"step": 2370,
"train_speed(iter/s)": 0.041147
},
{
"clip_ratio": 0.0,
"completion_length": 5.7,
"epoch": 0.9595959595959596,
"grad_norm": 26.406293869018555,
"kl": 0.1427520751953125,
"learning_rate": 2e-07,
"loss": 0.011251689493656158,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500000596046447,
"reward_std": 0.23866584599018098,
"rewards/MultiModalAccuracyORM": 0.32500000596046447,
"step": 2375,
"train_speed(iter/s)": 0.041156
},
{
"clip_ratio": 0.0,
"completion_length": 44.35,
"epoch": 0.9616161616161616,
"grad_norm": 0.03468816727399826,
"kl": 0.09806137084960938,
"learning_rate": 2e-07,
"loss": 0.008510185778141022,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5083333373069763,
"reward_std": 0.1037161648273468,
"rewards/MultiModalAccuracyORM": 0.5083333373069763,
"step": 2380,
"train_speed(iter/s)": 0.041159
},
{
"clip_ratio": 0.0,
"completion_length": 9.5,
"epoch": 0.9636363636363636,
"grad_norm": 12.14474105834961,
"kl": 0.127783203125,
"learning_rate": 2e-07,
"loss": 0.031885528564453126,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.25833333656191826,
"reward_std": 0.25566026866436004,
"rewards/MultiModalAccuracyORM": 0.25833333656191826,
"step": 2385,
"train_speed(iter/s)": 0.041174
},
{
"clip_ratio": 0.0,
"completion_length": 19.1,
"epoch": 0.9656565656565657,
"grad_norm": 0.8151546716690063,
"kl": 0.13538818359375,
"learning_rate": 2e-07,
"loss": 0.012065254151821136,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.233333333581686,
"reward_std": 0.07409421503543853,
"rewards/MultiModalAccuracyORM": 0.233333333581686,
"step": 2390,
"train_speed(iter/s)": 0.041185
},
{
"clip_ratio": 0.0,
"completion_length": 46.45,
"epoch": 0.9676767676767677,
"grad_norm": 22.97179412841797,
"kl": 0.0504150390625,
"learning_rate": 2e-07,
"loss": 0.00892886370420456,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.32500001043081284,
"reward_std": 0.386316055059433,
"rewards/MultiModalAccuracyORM": 0.32500001043081284,
"step": 2395,
"train_speed(iter/s)": 0.041191
},
{
"clip_ratio": 0.0,
"completion_length": 15.7,
"epoch": 0.9696969696969697,
"grad_norm": 0.13443566858768463,
"kl": 0.0746551513671875,
"learning_rate": 2e-07,
"loss": -0.008957084268331528,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.05833333507180214,
"reward_std": 0.11702905893325806,
"rewards/MultiModalAccuracyORM": 0.05833333507180214,
"step": 2400,
"train_speed(iter/s)": 0.041198
},
{
"clip_ratio": 0.0,
"completion_length": 5.3,
"epoch": 0.9717171717171718,
"grad_norm": 13.01309871673584,
"kl": 0.1608978271484375,
"learning_rate": 2e-07,
"loss": -0.005169375985860825,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4333333395421505,
"reward_std": 0.2074468731880188,
"rewards/MultiModalAccuracyORM": 0.4333333395421505,
"step": 2405,
"train_speed(iter/s)": 0.041211
},
{
"clip_ratio": 0.0,
"completion_length": 65.4,
"epoch": 0.9737373737373738,
"grad_norm": 20.76219367980957,
"kl": 0.10498046875,
"learning_rate": 2e-07,
"loss": -0.026147454977035522,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3166666693985462,
"reward_std": 0.23933667540550232,
"rewards/MultiModalAccuracyORM": 0.3166666693985462,
"step": 2410,
"train_speed(iter/s)": 0.041217
},
{
"clip_ratio": 0.0,
"completion_length": 17.3,
"epoch": 0.9757575757575757,
"grad_norm": 5.97620964050293,
"kl": 0.098968505859375,
"learning_rate": 2e-07,
"loss": 0.04436638355255127,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5916666783392429,
"reward_std": 0.26292563080787656,
"rewards/MultiModalAccuracyORM": 0.5916666783392429,
"step": 2415,
"train_speed(iter/s)": 0.041224
},
{
"clip_ratio": 0.0,
"completion_length": 7.9,
"epoch": 0.9777777777777777,
"grad_norm": 0.16142967343330383,
"kl": 0.2656707763671875,
"learning_rate": 2e-07,
"loss": 0.010275793075561524,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4750000022351742,
"reward_std": 0.12558708488941192,
"rewards/MultiModalAccuracyORM": 0.4750000022351742,
"step": 2420,
"train_speed(iter/s)": 0.041233
},
{
"clip_ratio": 0.0,
"completion_length": 29.5,
"epoch": 0.9797979797979798,
"grad_norm": 5.270585060119629,
"kl": 0.1023193359375,
"learning_rate": 2e-07,
"loss": 0.013689932227134705,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2583333417773247,
"reward_std": 0.2817953139543533,
"rewards/MultiModalAccuracyORM": 0.2583333417773247,
"step": 2425,
"train_speed(iter/s)": 0.041241
},
{
"clip_ratio": 0.0,
"completion_length": 24.5,
"epoch": 0.9818181818181818,
"grad_norm": 2.2413382530212402,
"kl": 0.09530487060546874,
"learning_rate": 2e-07,
"loss": -0.009250025451183318,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5166666693985462,
"reward_std": 0.17150862216949464,
"rewards/MultiModalAccuracyORM": 0.5166666693985462,
"step": 2430,
"train_speed(iter/s)": 0.041258
},
{
"clip_ratio": 0.0,
"completion_length": 7.1,
"epoch": 0.9838383838383838,
"grad_norm": 0.14606672525405884,
"kl": 0.098297119140625,
"learning_rate": 2e-07,
"loss": -0.021257255971431733,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.28333334252238274,
"reward_std": 0.19713521599769593,
"rewards/MultiModalAccuracyORM": 0.28333334252238274,
"step": 2435,
"train_speed(iter/s)": 0.041266
},
{
"clip_ratio": 0.0,
"completion_length": 11.3,
"epoch": 0.9858585858585859,
"grad_norm": 2.6238768100738525,
"kl": 0.10804595947265624,
"learning_rate": 2e-07,
"loss": 0.007257813215255737,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.30000000149011613,
"reward_std": 0.1974250316619873,
"rewards/MultiModalAccuracyORM": 0.30000000149011613,
"step": 2440,
"train_speed(iter/s)": 0.041271
},
{
"clip_ratio": 0.0,
"completion_length": 18.15,
"epoch": 0.9878787878787879,
"grad_norm": 0.03827716410160065,
"kl": 0.101373291015625,
"learning_rate": 2e-07,
"loss": 0.011828117072582245,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.4916666731238365,
"reward_std": 0.181566059589386,
"rewards/MultiModalAccuracyORM": 0.4916666731238365,
"step": 2445,
"train_speed(iter/s)": 0.041278
},
{
"clip_ratio": 0.0,
"completion_length": 43.4,
"epoch": 0.98989898989899,
"grad_norm": 6.416419982910156,
"kl": 0.2295166015625,
"learning_rate": 2e-07,
"loss": 0.009897831082344054,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3750000029802322,
"reward_std": 0.10072947144508362,
"rewards/MultiModalAccuracyORM": 0.3750000029802322,
"step": 2450,
"train_speed(iter/s)": 0.041283
},
{
"clip_ratio": 0.0,
"completion_length": 9.6,
"epoch": 0.9919191919191919,
"grad_norm": 3.0410783290863037,
"kl": 0.14271240234375,
"learning_rate": 2e-07,
"loss": -0.015740707516670227,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5083333380520344,
"reward_std": 0.27522478699684144,
"rewards/MultiModalAccuracyORM": 0.5083333380520344,
"step": 2455,
"train_speed(iter/s)": 0.041292
},
{
"clip_ratio": 0.0,
"completion_length": 6.15,
"epoch": 0.9939393939393939,
"grad_norm": 0.742748498916626,
"kl": 0.2917930603027344,
"learning_rate": 2e-07,
"loss": 0.06221296787261963,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.2833333402872086,
"reward_std": 0.17702035307884217,
"rewards/MultiModalAccuracyORM": 0.2833333402872086,
"step": 2460,
"train_speed(iter/s)": 0.041301
},
{
"clip_ratio": 0.0,
"completion_length": 39.7,
"epoch": 0.9959595959595959,
"grad_norm": 0.5455455780029297,
"kl": 0.1237335205078125,
"learning_rate": 2e-07,
"loss": 0.04647340774536133,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.5750000044703484,
"reward_std": 0.09041781425476074,
"rewards/MultiModalAccuracyORM": 0.5750000044703484,
"step": 2465,
"train_speed(iter/s)": 0.041305
},
{
"clip_ratio": 0.0,
"completion_length": 6.6,
"epoch": 0.997979797979798,
"grad_norm": 3.567203998565674,
"kl": 0.128204345703125,
"learning_rate": 2e-07,
"loss": -0.006601794809103012,
"memory(GiB)": 104.49,
"response_clip_ratio": 0.0,
"reward": 0.3416666753590107,
"reward_std": 0.3019101768732071,
"rewards/MultiModalAccuracyORM": 0.3416666753590107,
"step": 2470,
"train_speed(iter/s)": 0.04132
},
{
"epoch": 1.0,
"grad_norm": 24.7083740234375,
"learning_rate": 2e-07,
"loss": 0.018315188586711884,
"memory(GiB)": 104.49,
"step": 2475,
"train_speed(iter/s)": 0.041332
},
{
"epoch": 1.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 28.336667232513427,
"eval_kl": 0.152705078125,
"eval_loss": 0.011019712314009666,
"eval_response_clip_ratio": 0.0,
"eval_reward": 0.4650000059604645,
"eval_reward_std": 0.1907379400730133,
"eval_rewards/MultiModalAccuracyORM": 0.4650000059604645,
"eval_runtime": 238.5041,
"eval_samples_per_second": 0.21,
"eval_steps_per_second": 0.021,
"step": 2475
}
],
"logging_steps": 5,
"max_steps": 2475,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}