jonatatyska's picture
Model save
6df3ec0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9991645781119465,
"eval_steps": 500,
"global_step": 299,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 714.7366333007812,
"epoch": 0.003341687552213868,
"grad_norm": 0.32587897777557373,
"kl": 0.00015604496002197266,
"learning_rate": 6.666666666666667e-07,
"loss": 0.037,
"reward": 0.3007812649011612,
"reward_std": 0.3548884987831116,
"rewards/embodied_math": 0.12946429196745157,
"rewards/format_reward": 0.0290178582072258,
"rewards/tag_count_reward": 0.14229911379516125,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 679.2701110839844,
"epoch": 0.006683375104427736,
"grad_norm": 0.37410375475883484,
"kl": 0.00015926361083984375,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0165,
"reward": 0.2979910857975483,
"reward_std": 0.4222180098295212,
"rewards/embodied_math": 0.08258928963914514,
"rewards/format_reward": 0.03794643096625805,
"rewards/tag_count_reward": 0.1774553619325161,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 752.3594055175781,
"epoch": 0.010025062656641603,
"grad_norm": 0.3303394913673401,
"kl": 0.00016188621520996094,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0331,
"reward": 0.2845982275903225,
"reward_std": 0.37162595987319946,
"rewards/embodied_math": 0.0959821455180645,
"rewards/format_reward": 0.029017858440056443,
"rewards/tag_count_reward": 0.1595982238650322,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 650.1518096923828,
"epoch": 0.013366750208855471,
"grad_norm": 0.42782607674598694,
"kl": 0.00015664100646972656,
"learning_rate": 2.666666666666667e-06,
"loss": -0.0001,
"reward": 0.27120537497103214,
"reward_std": 0.3699168190360069,
"rewards/embodied_math": 0.10044643399305642,
"rewards/format_reward": 0.03571428684517741,
"rewards/tag_count_reward": 0.13504465110599995,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 703.1094055175781,
"epoch": 0.01670843776106934,
"grad_norm": 0.392333447933197,
"kl": 0.000461578369140625,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0218,
"reward": 0.254464291036129,
"reward_std": 0.32935116440057755,
"rewards/embodied_math": 0.08035714668221772,
"rewards/format_reward": 0.026785714784637094,
"rewards/tag_count_reward": 0.1473214328289032,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 571.1674270629883,
"epoch": 0.020050125313283207,
"grad_norm": 0.4020984172821045,
"kl": 0.004108428955078125,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0477,
"reward": 0.5329241380095482,
"reward_std": 0.495420403778553,
"rewards/embodied_math": 0.15401786682195961,
"rewards/format_reward": 0.08258929010480642,
"rewards/tag_count_reward": 0.2963169813156128,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 599.1049346923828,
"epoch": 0.023391812865497075,
"grad_norm": 118.67343139648438,
"kl": 2.298828125,
"learning_rate": 4.666666666666667e-06,
"loss": 0.1594,
"reward": 0.7059152126312256,
"reward_std": 0.659125566482544,
"rewards/embodied_math": 0.07589285937137902,
"rewards/format_reward": 0.2031250111758709,
"rewards/tag_count_reward": 0.426897332072258,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 610.7879943847656,
"epoch": 0.026733500417710943,
"grad_norm": 12.420914649963379,
"kl": 0.4365234375,
"learning_rate": 5.333333333333334e-06,
"loss": 0.064,
"reward": 0.6316964626312256,
"reward_std": 0.6864016056060791,
"rewards/embodied_math": 0.06250000419095159,
"rewards/format_reward": 0.1651785783469677,
"rewards/tag_count_reward": 0.4040178805589676,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 579.732177734375,
"epoch": 0.03007518796992481,
"grad_norm": 1.7345542907714844,
"kl": 0.1015625,
"learning_rate": 6e-06,
"loss": 0.0845,
"reward": 1.0446428954601288,
"reward_std": 0.7974795699119568,
"rewards/embodied_math": 0.2031250111758709,
"rewards/format_reward": 0.290178582072258,
"rewards/tag_count_reward": 0.5513393133878708,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 667.1272735595703,
"epoch": 0.03341687552213868,
"grad_norm": 0.9705411195755005,
"kl": 0.056121826171875,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0447,
"reward": 0.8666295111179352,
"reward_std": 0.7798839062452316,
"rewards/embodied_math": 0.0803571455180645,
"rewards/format_reward": 0.2991071566939354,
"rewards/tag_count_reward": 0.4871651977300644,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 662.5736846923828,
"epoch": 0.036758563074352546,
"grad_norm": 1.020012378692627,
"kl": 0.024505615234375,
"learning_rate": 7.333333333333333e-06,
"loss": 0.0359,
"reward": 1.0256696790456772,
"reward_std": 0.8458178341388702,
"rewards/embodied_math": 0.13392857951112092,
"rewards/format_reward": 0.3482142984867096,
"rewards/tag_count_reward": 0.5435268059372902,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 493.3817138671875,
"epoch": 0.040100250626566414,
"grad_norm": 0.5114466547966003,
"kl": 0.02532958984375,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0494,
"reward": 1.0876116454601288,
"reward_std": 0.8503101617097855,
"rewards/embodied_math": 0.12723214644938707,
"rewards/format_reward": 0.4017857313156128,
"rewards/tag_count_reward": 0.5585937649011612,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 619.1384201049805,
"epoch": 0.04344193817878028,
"grad_norm": 0.4396233856678009,
"kl": 0.023834228515625,
"learning_rate": 8.666666666666668e-06,
"loss": 0.0118,
"reward": 1.002232164144516,
"reward_std": 0.8543792814016342,
"rewards/embodied_math": 0.03125000069849193,
"rewards/format_reward": 0.408482164144516,
"rewards/tag_count_reward": 0.5625000149011612,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 633.654052734375,
"epoch": 0.04678362573099415,
"grad_norm": 0.39523178339004517,
"kl": 0.026153564453125,
"learning_rate": 9.333333333333334e-06,
"loss": 0.0537,
"reward": 1.4285715222358704,
"reward_std": 0.7692538350820541,
"rewards/embodied_math": 0.0781250037252903,
"rewards/format_reward": 0.587053582072258,
"rewards/tag_count_reward": 0.76339291036129,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 642.9152221679688,
"epoch": 0.05012531328320802,
"grad_norm": 0.48618102073669434,
"kl": 0.8505859375,
"learning_rate": 1e-05,
"loss": -0.0001,
"reward": 1.5468750596046448,
"reward_std": 0.7278469949960709,
"rewards/embodied_math": 0.10044643399305642,
"rewards/format_reward": 0.627232164144516,
"rewards/tag_count_reward": 0.8191964626312256,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 669.107177734375,
"epoch": 0.053467000835421885,
"grad_norm": 15783.640625,
"kl": 300.05426025390625,
"learning_rate": 1.0666666666666667e-05,
"loss": 9.6148,
"reward": 1.5273438096046448,
"reward_std": 0.7241310179233551,
"rewards/embodied_math": 0.08258929196745157,
"rewards/format_reward": 0.6138392984867096,
"rewards/tag_count_reward": 0.8309152126312256,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 628.6786041259766,
"epoch": 0.05680868838763575,
"grad_norm": 0.5250495076179504,
"kl": 0.0806884765625,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.0244,
"reward": 1.5290178954601288,
"reward_std": 0.6853032112121582,
"rewards/embodied_math": 0.11383929033763707,
"rewards/format_reward": 0.5781250149011612,
"rewards/tag_count_reward": 0.8370536118745804,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 601.9754638671875,
"epoch": 0.06015037593984962,
"grad_norm": 0.6404133439064026,
"kl": 0.1341552734375,
"learning_rate": 1.2e-05,
"loss": -0.016,
"reward": 1.8063616454601288,
"reward_std": 0.5558229237794876,
"rewards/embodied_math": 0.1361607201397419,
"rewards/format_reward": 0.7566964626312256,
"rewards/tag_count_reward": 0.913504496216774,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 672.0781555175781,
"epoch": 0.06349206349206349,
"grad_norm": 3.5720558166503906,
"kl": 0.458251953125,
"learning_rate": 1.2666666666666667e-05,
"loss": 0.064,
"reward": 1.786272406578064,
"reward_std": 0.5581437945365906,
"rewards/embodied_math": 0.11383929220028222,
"rewards/format_reward": 0.7678571790456772,
"rewards/tag_count_reward": 0.9045759439468384,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 748.0268249511719,
"epoch": 0.06683375104427736,
"grad_norm": 0.5579437017440796,
"kl": 0.0953369140625,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0603,
"reward": 1.7711087763309479,
"reward_std": 0.6120292246341705,
"rewards/embodied_math": 0.14722478203475475,
"rewards/format_reward": 0.738839328289032,
"rewards/tag_count_reward": 0.8850446790456772,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 756.9888610839844,
"epoch": 0.07017543859649122,
"grad_norm": 0.6377071142196655,
"kl": 0.06231689453125,
"learning_rate": 1.4e-05,
"loss": 0.035,
"reward": 1.7229181826114655,
"reward_std": 0.6206964701414108,
"rewards/embodied_math": 0.06834219070151448,
"rewards/format_reward": 0.76339291036129,
"rewards/tag_count_reward": 0.891183078289032,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 753.357177734375,
"epoch": 0.07351712614870509,
"grad_norm": 1.8024694919586182,
"kl": 0.150146484375,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.0766,
"reward": 1.697922170162201,
"reward_std": 0.6154973953962326,
"rewards/embodied_math": 0.1008238852955401,
"rewards/format_reward": 0.7633928805589676,
"rewards/tag_count_reward": 0.8337053954601288,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 871.7009124755859,
"epoch": 0.07685881370091896,
"grad_norm": 0.9224417209625244,
"kl": 0.120849609375,
"learning_rate": 1.5333333333333334e-05,
"loss": 0.1467,
"reward": 1.4809691905975342,
"reward_std": 0.7514003068208694,
"rewards/embodied_math": 0.11880402453243732,
"rewards/format_reward": 0.651785746216774,
"rewards/tag_count_reward": 0.710379496216774,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 816.9911193847656,
"epoch": 0.08020050125313283,
"grad_norm": 2.6176247596740723,
"kl": 0.22021484375,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.1362,
"reward": 1.5853315591812134,
"reward_std": 0.7496855407953262,
"rewards/embodied_math": 0.13611273211427033,
"rewards/format_reward": 0.6808036118745804,
"rewards/tag_count_reward": 0.7684152126312256,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 881.4955596923828,
"epoch": 0.0835421888053467,
"grad_norm": 2947.326171875,
"kl": 18.1572265625,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.0788,
"reward": 1.377666562795639,
"reward_std": 0.8003540188074112,
"rewards/embodied_math": 0.1455236654728651,
"rewards/format_reward": 0.5357142984867096,
"rewards/tag_count_reward": 0.6964286118745804,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 879.4531707763672,
"epoch": 0.08688387635756056,
"grad_norm": 8.099024772644043,
"kl": 1.2607421875,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.1733,
"reward": 1.0582136511802673,
"reward_std": 0.7152388989925385,
"rewards/embodied_math": 0.05765558429993689,
"rewards/format_reward": 0.3370535895228386,
"rewards/tag_count_reward": 0.6635045111179352,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 907.2210235595703,
"epoch": 0.09022556390977443,
"grad_norm": 7.911270618438721,
"kl": 0.85986328125,
"learning_rate": 1.8e-05,
"loss": 0.1583,
"reward": 0.9548228234052658,
"reward_std": 0.6883680373430252,
"rewards/embodied_math": 0.15459956042468548,
"rewards/format_reward": 0.2165178656578064,
"rewards/tag_count_reward": 0.5837053805589676,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 947.0982666015625,
"epoch": 0.0935672514619883,
"grad_norm": 5.554101467132568,
"kl": 1.0263671875,
"learning_rate": 1.866666666666667e-05,
"loss": 0.1178,
"reward": 0.704320564866066,
"reward_std": 0.4823942184448242,
"rewards/embodied_math": 0.18478929996490479,
"rewards/format_reward": 0.07812500465661287,
"rewards/tag_count_reward": 0.4414062649011612,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 791.7611999511719,
"epoch": 0.09690893901420217,
"grad_norm": 13.062017440795898,
"kl": 3.9375,
"learning_rate": 1.9333333333333333e-05,
"loss": -0.0151,
"reward": 0.3755580484867096,
"reward_std": 0.33308642357587814,
"rewards/embodied_math": 0.051339289639145136,
"rewards/format_reward": 0.015625000931322575,
"rewards/tag_count_reward": 0.3085937649011612,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 589.7611846923828,
"epoch": 0.10025062656641603,
"grad_norm": 864.9710693359375,
"kl": 6.4609375,
"learning_rate": 2e-05,
"loss": -0.0239,
"reward": 0.3041294813156128,
"reward_std": 0.286144133657217,
"rewards/embodied_math": 0.04910714505240321,
"rewards/format_reward": 0.004464285913854837,
"rewards/tag_count_reward": 0.2505580447614193,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 440.2567138671875,
"epoch": 0.1035923141186299,
"grad_norm": 13.670504570007324,
"kl": 5.5625,
"learning_rate": 1.9999318037877998e-05,
"loss": -0.5693,
"reward": 0.203683041036129,
"reward_std": 0.2521365247666836,
"rewards/embodied_math": 0.06473214598372579,
"rewards/format_reward": 0.0022321429569274187,
"rewards/tag_count_reward": 0.13671875558793545,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 516.3727951049805,
"epoch": 0.10693400167084377,
"grad_norm": 7.831124782562256,
"kl": 2.88671875,
"learning_rate": 1.9997272244526454e-05,
"loss": -0.5291,
"reward": 0.2455357238650322,
"reward_std": 0.19646178930997849,
"rewards/embodied_math": 0.12276786006987095,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.12276786379516125,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 582.5781402587891,
"epoch": 0.11027568922305764,
"grad_norm": 589.973388671875,
"kl": 4.8671875,
"learning_rate": 1.9993862898976092e-05,
"loss": -0.5682,
"reward": 0.1043526828289032,
"reward_std": 0.15242009237408638,
"rewards/embodied_math": 0.0066964291036129,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0976562537252903,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 356.7455520629883,
"epoch": 0.1136173767752715,
"grad_norm": 17.74056053161621,
"kl": 3.6328125,
"learning_rate": 1.998909046623581e-05,
"loss": -0.5021,
"reward": 0.255022332072258,
"reward_std": 0.24578910320997238,
"rewards/embodied_math": 0.11160714784637094,
"rewards/format_reward": 0.017857143422588706,
"rewards/tag_count_reward": 0.125558041036129,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 410.4955520629883,
"epoch": 0.11695906432748537,
"grad_norm": 9.428912162780762,
"kl": 2.2265625,
"learning_rate": 1.9982955597229275e-05,
"loss": -0.6298,
"reward": 0.2382812574505806,
"reward_std": 0.22896768525242805,
"rewards/embodied_math": 0.10937500488944352,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.1110491119325161,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 548.084846496582,
"epoch": 0.12030075187969924,
"grad_norm": 7.926641941070557,
"kl": 1.931640625,
"learning_rate": 1.9975459128706155e-05,
"loss": -0.5518,
"reward": 0.24944196827709675,
"reward_std": 0.16391626000404358,
"rewards/embodied_math": 0.1428571492433548,
"rewards/format_reward": 0.004464285913854837,
"rewards/tag_count_reward": 0.102120541036129,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 605.2120819091797,
"epoch": 0.12364243943191311,
"grad_norm": 16.085020065307617,
"kl": 3.2109375,
"learning_rate": 1.996660208312796e-05,
"loss": -0.6493,
"reward": 0.08649553917348385,
"reward_std": 0.12165977619588375,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08649553917348385,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 624.4464569091797,
"epoch": 0.12698412698412698,
"grad_norm": 5.551716327667236,
"kl": 2.5703125,
"learning_rate": 1.9956385668528614e-05,
"loss": -0.466,
"reward": 0.0920758955180645,
"reward_std": 0.12959402054548264,
"rewards/embodied_math": 0.0022321429569274187,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08984375186264515,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 762.6674499511719,
"epoch": 0.13032581453634084,
"grad_norm": 6.6176934242248535,
"kl": 2.4609375,
"learning_rate": 1.9944811278349666e-05,
"loss": -0.2619,
"reward": 0.1785714402794838,
"reward_std": 0.1333499550819397,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1071428619325161,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 898.5669860839844,
"epoch": 0.1336675020885547,
"grad_norm": 7.996533393859863,
"kl": 3.1640625,
"learning_rate": 1.9931880491250263e-05,
"loss": -0.0767,
"reward": 0.08593750186264515,
"reward_std": 0.11663081869482994,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08593750186264515,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1162.091552734375,
"epoch": 0.13700918964076858,
"grad_norm": 4.013873100280762,
"kl": 3.97265625,
"learning_rate": 1.9917595070891796e-05,
"loss": 0.0529,
"reward": 0.1668526865541935,
"reward_std": 0.12196414358913898,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0954241119325161,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1291.6339721679688,
"epoch": 0.14035087719298245,
"grad_norm": 5.315943241119385,
"kl": 4.1796875,
"learning_rate": 1.9901956965697387e-05,
"loss": 0.1625,
"reward": 0.1400669701397419,
"reward_std": 0.12121919170022011,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.10435268469154835,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1297.1205444335938,
"epoch": 0.14369256474519632,
"grad_norm": 7.909899711608887,
"kl": 8.6875,
"learning_rate": 1.988496830858612e-05,
"loss": 0.194,
"reward": 0.11662947200238705,
"reward_std": 0.11725078709423542,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0809151828289032,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.14703425229741018,
"grad_norm": 10.900046348571777,
"kl": 8.90625,
"learning_rate": 1.986663141668212e-05,
"loss": 0.3553,
"reward": 0.1389508992433548,
"reward_std": 0.1102825254201889,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0675223246216774,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1297.1227722167969,
"epoch": 0.15037593984962405,
"grad_norm": 20.010757446289062,
"kl": 7.046875,
"learning_rate": 1.9846948790998532e-05,
"loss": 0.2807,
"reward": 0.10267857741564512,
"reward_std": 0.10911580175161362,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06696429010480642,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.15371762740183792,
"grad_norm": 6.929605960845947,
"kl": 4.0703125,
"learning_rate": 1.982592311609639e-05,
"loss": 0.1625,
"reward": 0.16015625931322575,
"reward_std": 0.11942839249968529,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08872768469154835,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.1570593149540518,
"grad_norm": 3.139974594116211,
"kl": 4.9453125,
"learning_rate": 1.9803557259718472e-05,
"loss": 0.1971,
"reward": 0.1434151865541935,
"reward_std": 0.12306286953389645,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1077008955180645,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.16040100250626566,
"grad_norm": 12.176130294799805,
"kl": 7.8125,
"learning_rate": 1.977985427239815e-05,
"loss": 0.311,
"reward": 0.3018973357975483,
"reward_std": 0.120529068633914,
"rewards/embodied_math": 0.1428571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1590401865541935,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.16374269005847952,
"grad_norm": 60.613792419433594,
"kl": 3.265625,
"learning_rate": 1.975481738704333e-05,
"loss": 0.1304,
"reward": 0.2890625149011612,
"reward_std": 0.10946565680205822,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1819196529686451,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.1670843776106934,
"grad_norm": 7.159120559692383,
"kl": 2.24609375,
"learning_rate": 1.9728450018495506e-05,
"loss": 0.0896,
"reward": 0.1780133992433548,
"reward_std": 0.13202833384275436,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1422991119325161,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.17042606516290726,
"grad_norm": 7.3434624671936035,
"kl": 1.193359375,
"learning_rate": 1.9700755763064e-05,
"loss": 0.0476,
"reward": 0.2114955484867096,
"reward_std": 0.10858343727886677,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2114955484867096,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.17376775271512113,
"grad_norm": 3.7159674167633057,
"kl": 1.43359375,
"learning_rate": 1.967173839803545e-05,
"loss": 0.0572,
"reward": 0.3286830522119999,
"reward_std": 0.09371168166399002,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2215401865541935,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.177109440267335,
"grad_norm": 3.8071751594543457,
"kl": 2.0390625,
"learning_rate": 1.9641401881158625e-05,
"loss": 0.0813,
"reward": 0.294642873108387,
"reward_std": 0.08163666725158691,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2232142947614193,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.18045112781954886,
"grad_norm": 73.75374603271484,
"kl": 15.703125,
"learning_rate": 1.960975035010461e-05,
"loss": 0.6256,
"reward": 0.2801339402794838,
"reward_std": 0.13134469836950302,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1729910783469677,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.18379281537176273,
"grad_norm": 23.148263931274414,
"kl": 6.6640625,
"learning_rate": 1.9576788121902457e-05,
"loss": 0.2659,
"reward": 0.1568080447614193,
"reward_std": 0.14204410836100578,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1210937537252903,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1297.4419860839844,
"epoch": 0.1871345029239766,
"grad_norm": 4.164386749267578,
"kl": 1.478515625,
"learning_rate": 1.954251969235039e-05,
"loss": 0.0545,
"reward": 0.2148437574505806,
"reward_std": 0.12886478379368782,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1768973283469677,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.19047619047619047,
"grad_norm": 5.979341506958008,
"kl": 1.724609375,
"learning_rate": 1.950694973540259e-05,
"loss": 0.0687,
"reward": 0.2181919775903225,
"reward_std": 0.12196704186499119,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1824776865541935,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1297.1919860839844,
"epoch": 0.19381787802840433,
"grad_norm": 10.468803405761719,
"kl": 5.8671875,
"learning_rate": 1.9470083102531724e-05,
"loss": 0.2288,
"reward": 0.1841517947614193,
"reward_std": 0.12486465089023113,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1841517947614193,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.1971595655806182,
"grad_norm": 4.8235697746276855,
"kl": 3.2734375,
"learning_rate": 1.943192482206723e-05,
"loss": 0.1307,
"reward": 0.258928582072258,
"reward_std": 0.10686362534761429,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875000111758709,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.20050125313283207,
"grad_norm": 4.113831520080566,
"kl": 1.681640625,
"learning_rate": 1.9392480098509488e-05,
"loss": 0.067,
"reward": 0.2282366193830967,
"reward_std": 0.106992082670331,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.192522332072258,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.20384294068504594,
"grad_norm": 30.52185821533203,
"kl": 1.46484375,
"learning_rate": 1.9351754311819978e-05,
"loss": 0.0584,
"reward": 0.3147321604192257,
"reward_std": 0.10511562786996365,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2075892947614193,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2071846282372598,
"grad_norm": 4.678550720214844,
"kl": 1.7421875,
"learning_rate": 1.9309753016687478e-05,
"loss": 0.0695,
"reward": 0.2410714440047741,
"reward_std": 0.09354828484356403,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2053571529686451,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.21052631578947367,
"grad_norm": 2.103879928588867,
"kl": 1.75390625,
"learning_rate": 1.9266481941770463e-05,
"loss": 0.0699,
"reward": 0.2539062611758709,
"reward_std": 0.08228430338203907,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2181919738650322,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.21386800334168754,
"grad_norm": 9.774397850036621,
"kl": 3.8046875,
"learning_rate": 1.9221946988915745e-05,
"loss": 0.1517,
"reward": 0.3058035857975483,
"reward_std": 0.09955826215445995,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1986607238650322,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1298.1830444335938,
"epoch": 0.2172096908939014,
"grad_norm": 13.15608024597168,
"kl": 6.14453125,
"learning_rate": 1.9176154232353513e-05,
"loss": 0.2418,
"reward": 0.1925223283469677,
"reward_std": 0.10169229097664356,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1925223283469677,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.22055137844611528,
"grad_norm": 13.001031875610352,
"kl": 1.91796875,
"learning_rate": 1.9129109917868863e-05,
"loss": 0.0765,
"reward": 0.2812500186264515,
"reward_std": 0.10722276195883751,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2098214402794838,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.22389306599832914,
"grad_norm": 20.15164566040039,
"kl": 5.0703125,
"learning_rate": 1.9080820461949886e-05,
"loss": 0.2023,
"reward": 0.2455357201397419,
"reward_std": 0.09997103177011013,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2098214365541935,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1297.8348388671875,
"epoch": 0.227234753550543,
"grad_norm": 69.2470703125,
"kl": 4.8671875,
"learning_rate": 1.9031292450912565e-05,
"loss": 0.19,
"reward": 0.1975446566939354,
"reward_std": 0.09716965816915035,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1975446566939354,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1112.46435546875,
"epoch": 0.23057644110275688,
"grad_norm": 1767.9007568359375,
"kl": 12.921875,
"learning_rate": 1.898053264000239e-05,
"loss": 0.5162,
"reward": 0.290736623108387,
"reward_std": 0.12393852323293686,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2550223395228386,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 1265.5714416503906,
"epoch": 0.23391812865497075,
"grad_norm": 457.6597595214844,
"kl": 10.125,
"learning_rate": 1.8928547952473037e-05,
"loss": 0.4033,
"reward": 0.2818080522119999,
"reward_std": 0.17049719020724297,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.243861623108387,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.23725981620718462,
"grad_norm": 23.20967674255371,
"kl": 8.6171875,
"learning_rate": 1.8875345478642067e-05,
"loss": 0.3434,
"reward": 0.2377232238650322,
"reward_std": 0.13999010622501373,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1662946492433548,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.24060150375939848,
"grad_norm": 4.324472904205322,
"kl": 3.1328125,
"learning_rate": 1.8820932474923874e-05,
"loss": 0.1251,
"reward": 0.2349330484867096,
"reward_std": 0.13926412537693977,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1635044664144516,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.24394319131161235,
"grad_norm": 15.31286334991455,
"kl": 1.8125,
"learning_rate": 1.8765316362839955e-05,
"loss": 0.0722,
"reward": 0.2338169738650322,
"reward_std": 0.14043857902288437,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1981026865541935,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.24728487886382622,
"grad_norm": 12.260504722595215,
"kl": 1.69921875,
"learning_rate": 1.8708504728006668e-05,
"loss": 0.0677,
"reward": 0.2544642984867096,
"reward_std": 0.11344457603991032,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2187500149011612,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2506265664160401,
"grad_norm": 8.76353645324707,
"kl": 2.29296875,
"learning_rate": 1.865050531910062e-05,
"loss": 0.0917,
"reward": 0.2287946566939354,
"reward_std": 0.134426174685359,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2287946566939354,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.25396825396825395,
"grad_norm": 11.521807670593262,
"kl": 7.5703125,
"learning_rate": 1.8591326046801813e-05,
"loss": 0.302,
"reward": 0.262834832072258,
"reward_std": 0.13520535081624985,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2271205484867096,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2573099415204678,
"grad_norm": 38.819496154785156,
"kl": 13.5625,
"learning_rate": 1.8530974982714667e-05,
"loss": 0.5417,
"reward": 0.2622768022119999,
"reward_std": 0.1606529802083969,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2265625111758709,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1298.7522583007812,
"epoch": 0.2606516290726817,
"grad_norm": 10.04819393157959,
"kl": 8.65625,
"learning_rate": 1.8469460358267127e-05,
"loss": 0.344,
"reward": 0.3052455522119999,
"reward_std": 0.17179123312234879,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2695312611758709,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.26399331662489556,
"grad_norm": 8.36678409576416,
"kl": 3.57421875,
"learning_rate": 1.8406790563587958e-05,
"loss": 0.1425,
"reward": 0.2154017984867096,
"reward_std": 0.12405722960829735,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2154017984867096,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2673350041771094,
"grad_norm": 11.640973091125488,
"kl": 2.359375,
"learning_rate": 1.8342974146362397e-05,
"loss": 0.094,
"reward": 0.2254464365541935,
"reward_std": 0.10200263559818268,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2254464365541935,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2706766917293233,
"grad_norm": 10.865341186523438,
"kl": 2.0625,
"learning_rate": 1.8278019810666295e-05,
"loss": 0.0823,
"reward": 0.2589285857975483,
"reward_std": 0.09501025639474392,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2232142947614193,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.27401837928153716,
"grad_norm": 3.9547715187072754,
"kl": 3.9140625,
"learning_rate": 1.8211936415778986e-05,
"loss": 0.156,
"reward": 0.2678571529686451,
"reward_std": 0.08793714456260204,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2321428656578064,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.27736006683375103,
"grad_norm": 11.76015567779541,
"kl": 5.71875,
"learning_rate": 1.8144732974974902e-05,
"loss": 0.2278,
"reward": 0.3203125186264515,
"reward_std": 0.09355076402425766,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2131696529686451,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2807017543859649,
"grad_norm": 15.147577285766602,
"kl": 6.8125,
"learning_rate": 1.8076418654294267e-05,
"loss": 0.2713,
"reward": 0.3175223395228386,
"reward_std": 0.09090832434594631,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2103794701397419,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.28404344193817876,
"grad_norm": 2.8102500438690186,
"kl": 1.943359375,
"learning_rate": 1.80070027712929e-05,
"loss": 0.0775,
"reward": 0.2193080447614193,
"reward_std": 0.06847428530454636,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2193080447614193,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.28738512949039263,
"grad_norm": 3.1324164867401123,
"kl": 2.29296875,
"learning_rate": 1.793649479377137e-05,
"loss": 0.0914,
"reward": 0.2165178656578064,
"reward_std": 0.07726325932890177,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2165178656578064,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.2907268170426065,
"grad_norm": 5.72972297668457,
"kl": 2.6796875,
"learning_rate": 1.7864904338483676e-05,
"loss": 0.1071,
"reward": 0.2907366268336773,
"reward_std": 0.07602266781032085,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2193080484867096,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1299.3995666503906,
"epoch": 0.29406850459482037,
"grad_norm": 7.760501861572266,
"kl": 2.681640625,
"learning_rate": 1.779224116982558e-05,
"loss": 0.1047,
"reward": 0.2968750074505806,
"reward_std": 0.06183902267366648,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2254464402794838,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.29741019214703424,
"grad_norm": 4.000362396240234,
"kl": 1.8857421875,
"learning_rate": 1.7718515198502816e-05,
"loss": 0.0753,
"reward": 0.298549123108387,
"reward_std": 0.05841092485934496,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2271205484867096,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.3007518796992481,
"grad_norm": 5.868449687957764,
"kl": 1.1416015625,
"learning_rate": 1.7643736480179353e-05,
"loss": 0.0455,
"reward": 0.3085937649011612,
"reward_std": 0.04876699857413769,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2371651902794838,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1294.4285888671875,
"epoch": 0.30409356725146197,
"grad_norm": 1.812473177909851,
"kl": 1.982421875,
"learning_rate": 1.7567915214105883e-05,
"loss": 0.0715,
"reward": 0.2689732238650322,
"reward_std": 0.04237840510904789,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2332589365541935,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1300.0,
"epoch": 0.30743525480367584,
"grad_norm": 2.7611286640167236,
"kl": 2.279296875,
"learning_rate": 1.7491061741728703e-05,
"loss": 0.0909,
"reward": 0.2795759029686451,
"reward_std": 0.02313897479325533,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2438616193830967,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1296.2187805175781,
"epoch": 0.3107769423558897,
"grad_norm": 8.670313835144043,
"kl": 3.568359375,
"learning_rate": 1.741318654527923e-05,
"loss": 0.1315,
"reward": 0.2745535895228386,
"reward_std": 0.03872372629120946,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2388392984867096,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1285.0982360839844,
"epoch": 0.3141186299081036,
"grad_norm": 2.591120481491089,
"kl": 2.2060546875,
"learning_rate": 1.7334300246344318e-05,
"loss": 0.0649,
"reward": 0.2360491156578064,
"reward_std": 0.05540083209052682,
"rewards/embodied_math": 0.004464285913854837,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2315848283469677,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1284.8661193847656,
"epoch": 0.31746031746031744,
"grad_norm": 3.4341654777526855,
"kl": 1.61328125,
"learning_rate": 1.725441360441752e-05,
"loss": 0.0342,
"reward": 0.2751116156578064,
"reward_std": 0.03906811494380236,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.239397332072258,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1272.2076416015625,
"epoch": 0.3208020050125313,
"grad_norm": 15.758692741394043,
"kl": 5.55078125,
"learning_rate": 1.7173537515431612e-05,
"loss": 0.1119,
"reward": 0.2343750111758709,
"reward_std": 0.040907643269747496,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2343750111758709,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1245.6607666015625,
"epoch": 0.3241436925647452,
"grad_norm": 1.1037966012954712,
"kl": 4.46484375,
"learning_rate": 1.7091683010272447e-05,
"loss": 0.0287,
"reward": 0.2606026902794838,
"reward_std": 0.07436614017933607,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2226562611758709,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1229.1741943359375,
"epoch": 0.32748538011695905,
"grad_norm": 10.468352317810059,
"kl": 6.5390625,
"learning_rate": 1.700886125327443e-05,
"loss": 0.0752,
"reward": 0.2907366156578064,
"reward_std": 0.08076621871441603,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2193080484867096,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1222.9174499511719,
"epoch": 0.3308270676691729,
"grad_norm": 2.7917568683624268,
"kl": 3.5390625,
"learning_rate": 1.692508354069779e-05,
"loss": -0.0414,
"reward": 0.3063616268336773,
"reward_std": 0.09452157653868198,
"rewards/embodied_math": 0.08482143143191934,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2215401902794838,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1230.76123046875,
"epoch": 0.3341687552213868,
"grad_norm": 8.454556465148926,
"kl": 6.4296875,
"learning_rate": 1.684036129918786e-05,
"loss": 0.03,
"reward": 0.2924107313156128,
"reward_std": 0.06796729285269976,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2209821566939354,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1260.3147888183594,
"epoch": 0.33751044277360065,
"grad_norm": 4.354994297027588,
"kl": 5.12890625,
"learning_rate": 1.6754706084216556e-05,
"loss": 0.0526,
"reward": 0.3370535857975483,
"reward_std": 0.054657368920743465,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2299107238650322,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1223.2879943847656,
"epoch": 0.3408521303258145,
"grad_norm": 86.24649047851562,
"kl": 896.05859375,
"learning_rate": 1.6668129578506315e-05,
"loss": 0.14,
"reward": 0.2600446529686451,
"reward_std": 0.06857628654688597,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2243303656578064,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1215.3170166015625,
"epoch": 0.3441938178780284,
"grad_norm": 8.539790153503418,
"kl": 2.80078125,
"learning_rate": 1.658064359043664e-05,
"loss": -0.1095,
"reward": 0.2901785857975483,
"reward_std": 0.0779828317463398,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2187500111758709,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1111.7098388671875,
"epoch": 0.34753550543024225,
"grad_norm": 3.1039435863494873,
"kl": 2.70703125,
"learning_rate": 1.6492260052433554e-05,
"loss": -0.2501,
"reward": 0.3147321604192257,
"reward_std": 0.0896985549479723,
"rewards/embodied_math": 0.10937500488944352,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2053571529686451,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1124.6875610351562,
"epoch": 0.3508771929824561,
"grad_norm": 2.739511013031006,
"kl": 4.7265625,
"learning_rate": 1.6402991019342073e-05,
"loss": -0.2298,
"reward": 0.2806919813156128,
"reward_std": 0.07699156645685434,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2092634029686451,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1193.6183471679688,
"epoch": 0.35421888053467,
"grad_norm": 4.411526203155518,
"kl": 7.28125,
"learning_rate": 1.631284866678205e-05,
"loss": -0.1137,
"reward": 0.3002232275903225,
"reward_std": 0.07899147737771273,
"rewards/embodied_math": 0.0758928619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2243303693830967,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1137.857177734375,
"epoch": 0.35756056808688386,
"grad_norm": 7.792470455169678,
"kl": 7.45703125,
"learning_rate": 1.6221845289487493e-05,
"loss": -0.2043,
"reward": 0.2220982275903225,
"reward_std": 0.081636568531394,
"rewards/embodied_math": 0.004464285913854837,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2176339402794838,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1163.8683776855469,
"epoch": 0.3609022556390977,
"grad_norm": 2.2603821754455566,
"kl": 6.671875,
"learning_rate": 1.6129993299629652e-05,
"loss": -0.1924,
"reward": 0.2890625074505806,
"reward_std": 0.0779200978577137,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2176339402794838,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1173.3482971191406,
"epoch": 0.3642439431913116,
"grad_norm": 2.22993540763855,
"kl": 3.044921875,
"learning_rate": 1.6037305225124122e-05,
"loss": -0.2054,
"reward": 0.2952008992433548,
"reward_std": 0.07529877964407206,
"rewards/embodied_math": 0.0736607164144516,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2215401865541935,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1138.9710540771484,
"epoch": 0.36758563074352546,
"grad_norm": 1.692765474319458,
"kl": 2.310546875,
"learning_rate": 1.5943793707922086e-05,
"loss": -0.2594,
"reward": 0.2555803656578064,
"reward_std": 0.08683113381266594,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2176339402794838,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1246.4576110839844,
"epoch": 0.37092731829573933,
"grad_norm": 1.1689916849136353,
"kl": 1.72265625,
"learning_rate": 1.5849471502286088e-05,
"loss": -0.0851,
"reward": 0.2455357275903225,
"reward_std": 0.06550503056496382,
"rewards/embodied_math": 0.008928572060540318,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2366071566939354,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1265.3482666015625,
"epoch": 0.3742690058479532,
"grad_norm": 0.7709049582481384,
"kl": 1.814453125,
"learning_rate": 1.5754351473050434e-05,
"loss": -0.0438,
"reward": 0.3197544701397419,
"reward_std": 0.047460266621783376,
"rewards/embodied_math": 0.07812500232830644,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2416294775903225,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1250.3304138183594,
"epoch": 0.37761069340016706,
"grad_norm": 16.40837287902832,
"kl": 2.6015625,
"learning_rate": 1.5658446593866517e-05,
"loss": -0.053,
"reward": 0.2823660857975483,
"reward_std": 0.06565927620977163,
"rewards/embodied_math": 0.04464285937137902,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2377232238650322,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1227.9532165527344,
"epoch": 0.38095238095238093,
"grad_norm": 1.9537498950958252,
"kl": 3.77734375,
"learning_rate": 1.5561769945433326e-05,
"loss": -0.0841,
"reward": 0.3549107313156128,
"reward_std": 0.07631831709295511,
"rewards/embodied_math": 0.10937500488944352,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2455357201397419,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1232.9554138183594,
"epoch": 0.3842940685045948,
"grad_norm": 5.672196865081787,
"kl": 3.42578125,
"learning_rate": 1.5464334713713312e-05,
"loss": -0.0807,
"reward": 0.2695312574505806,
"reward_std": 0.11467710882425308,
"rewards/embodied_math": 0.004464285913854837,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2650669738650322,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1029.2232513427734,
"epoch": 0.38763575605680867,
"grad_norm": 47.228187561035156,
"kl": 6.15625,
"learning_rate": 1.5366154188133962e-05,
"loss": -0.0135,
"reward": 0.4296875149011612,
"reward_std": 0.18271929770708084,
"rewards/embodied_math": 0.11160715157166123,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3180803656578064,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1071.0067443847656,
"epoch": 0.39097744360902253,
"grad_norm": 2.859192132949829,
"kl": 5.3359375,
"learning_rate": 1.526724175977518e-05,
"loss": -0.0756,
"reward": 0.415736623108387,
"reward_std": 0.20538460090756416,
"rewards/embodied_math": 0.05133928847499192,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3643973395228386,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1082.4531555175781,
"epoch": 0.3943191311612364,
"grad_norm": 5.876430988311768,
"kl": 4.078125,
"learning_rate": 1.5167610919542885e-05,
"loss": -0.2275,
"reward": 0.3699776977300644,
"reward_std": 0.20139999315142632,
"rewards/embodied_math": 0.0022321429569274187,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3677455559372902,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1152.7322082519531,
"epoch": 0.39766081871345027,
"grad_norm": 2.9599621295928955,
"kl": 4.1796875,
"learning_rate": 1.5067275256328913e-05,
"loss": -0.1624,
"reward": 0.423549123108387,
"reward_std": 0.18587969616055489,
"rewards/embodied_math": 0.04017857206054032,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.383370541036129,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 1142.29248046875,
"epoch": 0.40100250626566414,
"grad_norm": 3.511597156524658,
"kl": 8.5234375,
"learning_rate": 1.4966248455157622e-05,
"loss": -0.1474,
"reward": 0.4648437649011612,
"reward_std": 0.16743408516049385,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3934151902794838,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1083.3371276855469,
"epoch": 0.404344193817878,
"grad_norm": 5.298196315765381,
"kl": 9.5859375,
"learning_rate": 1.4864544295319357e-05,
"loss": -0.1957,
"reward": 0.479352705180645,
"reward_std": 0.17764294892549515,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3722098395228386,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1091.8326416015625,
"epoch": 0.4076858813700919,
"grad_norm": 1.1045624017715454,
"kl": 4.83984375,
"learning_rate": 1.4762176648491052e-05,
"loss": -0.2606,
"reward": 0.462611623108387,
"reward_std": 0.17855771258473396,
"rewards/embodied_math": 0.07366071757860482,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3889509066939354,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1094.7880249023438,
"epoch": 0.41102756892230574,
"grad_norm": 1.479848027229309,
"kl": 3.59765625,
"learning_rate": 1.4659159476844231e-05,
"loss": -0.2782,
"reward": 0.4408482313156128,
"reward_std": 0.1699872985482216,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4029018059372902,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1031.8750457763672,
"epoch": 0.4143692564745196,
"grad_norm": 0.9205887913703918,
"kl": 5.6875,
"learning_rate": 1.4555506831140698e-05,
"loss": -0.3515,
"reward": 0.3850446566939354,
"reward_std": 0.17519571259617805,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3850446566939354,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1061.9531860351562,
"epoch": 0.4177109440267335,
"grad_norm": 1.5503870248794556,
"kl": 7.1640625,
"learning_rate": 1.445123284881609e-05,
"loss": -0.3,
"reward": 0.4709821715950966,
"reward_std": 0.19400348514318466,
"rewards/embodied_math": 0.08035714644938707,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3906250149011612,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1092.4754943847656,
"epoch": 0.42105263157894735,
"grad_norm": 3.406398296356201,
"kl": 6.828125,
"learning_rate": 1.4346351752051663e-05,
"loss": -0.2414,
"reward": 0.4414062649011612,
"reward_std": 0.1831374131143093,
"rewards/embodied_math": 0.0424107164144516,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3989955559372902,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1072.4755249023438,
"epoch": 0.4243943191311612,
"grad_norm": 4.0440993309021,
"kl": 6.4140625,
"learning_rate": 1.4240877845834473e-05,
"loss": -0.0842,
"reward": 0.2572544775903225,
"reward_std": 0.22713791206479073,
"rewards/embodied_math": 0.044642857974395156,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2126116156578064,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1057.2053985595703,
"epoch": 0.4277360066833751,
"grad_norm": 17.8367862701416,
"kl": 6.6875,
"learning_rate": 1.4134825516006307e-05,
"loss": 0.0196,
"reward": 0.1813616119325161,
"reward_std": 0.13534531742334366,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1456473283469677,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 982.388427734375,
"epoch": 0.43107769423558895,
"grad_norm": 269.2532958984375,
"kl": 11.546875,
"learning_rate": 1.4028209227301534e-05,
"loss": 0.305,
"reward": 0.1852678693830967,
"reward_std": 0.11594511196017265,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1495535783469677,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1021.2768402099609,
"epoch": 0.4344193817878028,
"grad_norm": 13.602232933044434,
"kl": 5.3359375,
"learning_rate": 1.392104352137426e-05,
"loss": 0.019,
"reward": 0.152901791036129,
"reward_std": 0.12491585314273834,
"rewards/embodied_math": 0.0022321429569274187,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1506696492433548,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 555.147346496582,
"epoch": 0.4377610693400167,
"grad_norm": 8.347877502441406,
"kl": 8.546875,
"learning_rate": 1.3813343014814926e-05,
"loss": 0.3175,
"reward": 0.11941964738070965,
"reward_std": 0.12463105469942093,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.11941964738070965,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1125.8594055175781,
"epoch": 0.44110275689223055,
"grad_norm": 286.98577880859375,
"kl": 3.12109375,
"learning_rate": 1.3705122397156727e-05,
"loss": 0.0132,
"reward": 0.2070312611758709,
"reward_std": 0.18715298175811768,
"rewards/embodied_math": 0.011160714784637094,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1958705447614193,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1083.7411041259766,
"epoch": 0.4444444444444444,
"grad_norm": 19.709352493286133,
"kl": 1.904296875,
"learning_rate": 1.359639642887208e-05,
"loss": -0.1734,
"reward": 0.4101562649011612,
"reward_std": 0.23037764430046082,
"rewards/embodied_math": 0.113839291036129,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2963169813156128,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1081.1719055175781,
"epoch": 0.4477861319966583,
"grad_norm": 2.971442461013794,
"kl": 1.2763671875,
"learning_rate": 1.3487179939359394e-05,
"loss": -0.1503,
"reward": 0.4107143059372902,
"reward_std": 0.23758460581302643,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3035714402794838,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1124.0000305175781,
"epoch": 0.45112781954887216,
"grad_norm": 5.354092597961426,
"kl": 1.5625,
"learning_rate": 1.3377487824920459e-05,
"loss": -0.1025,
"reward": 0.3236607313156128,
"reward_std": 0.2569897249341011,
"rewards/embodied_math": 0.01785714365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.305803582072258,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1038.9911499023438,
"epoch": 0.454469507101086,
"grad_norm": 12.347515106201172,
"kl": 3.61328125,
"learning_rate": 1.32673350467287e-05,
"loss": -0.1563,
"reward": 0.459821455180645,
"reward_std": 0.28034432977437973,
"rewards/embodied_math": 0.15401787031441927,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.305803582072258,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1030.7902374267578,
"epoch": 0.4578111946532999,
"grad_norm": 3.499429702758789,
"kl": 3.16796875,
"learning_rate": 1.3156736628788585e-05,
"loss": -0.0581,
"reward": 0.4006696566939354,
"reward_std": 0.23839671164751053,
"rewards/embodied_math": 0.1116071492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2890625149011612,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 958.8616638183594,
"epoch": 0.46115288220551376,
"grad_norm": 5.829720973968506,
"kl": 3.53125,
"learning_rate": 1.304570765588648e-05,
"loss": -0.0615,
"reward": 0.2762276902794838,
"reward_std": 0.22271455451846123,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2762276902794838,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1053.7389068603516,
"epoch": 0.4644945697577276,
"grad_norm": 4.121999263763428,
"kl": 2.267578125,
"learning_rate": 1.293426327153317e-05,
"loss": -0.1534,
"reward": 0.306919664144516,
"reward_std": 0.21316596493124962,
"rewards/embodied_math": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.306919664144516,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1174.8036193847656,
"epoch": 0.4678362573099415,
"grad_norm": 3.6972813606262207,
"kl": 2.021484375,
"learning_rate": 1.2822418675898428e-05,
"loss": -0.0593,
"reward": 0.5362723469734192,
"reward_std": 0.215391855686903,
"rewards/embodied_math": 0.1473214328289032,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.388950914144516,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1115.7031860351562,
"epoch": 0.47117794486215536,
"grad_norm": 1.3065992593765259,
"kl": 2.73046875,
"learning_rate": 1.2710189123737804e-05,
"loss": -0.1325,
"reward": 0.4441964477300644,
"reward_std": 0.23979860544204712,
"rewards/embodied_math": 0.08035714412108064,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3638392984867096,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1066.138427734375,
"epoch": 0.47451963241436923,
"grad_norm": 2.3809707164764404,
"kl": 3.703125,
"learning_rate": 1.2597589922312009e-05,
"loss": -0.1879,
"reward": 0.5602678880095482,
"reward_std": 0.21912335231900215,
"rewards/embodied_math": 0.1808035783469677,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3794642984867096,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1052.279067993164,
"epoch": 0.4778613199665831,
"grad_norm": 2.187842607498169,
"kl": 4.671875,
"learning_rate": 1.2484636429299113e-05,
"loss": -0.1914,
"reward": 0.491071455180645,
"reward_std": 0.2304515242576599,
"rewards/embodied_math": 0.113839291036129,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.377232164144516,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1100.8661193847656,
"epoch": 0.48120300751879697,
"grad_norm": 4.84511661529541,
"kl": 3.40234375,
"learning_rate": 1.2371344050699872e-05,
"loss": -0.171,
"reward": 0.4760044887661934,
"reward_std": 0.21165388822555542,
"rewards/embodied_math": 0.0781250037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3978794813156128,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1072.1763916015625,
"epoch": 0.48454469507101083,
"grad_norm": 0.7366332411766052,
"kl": 3.59765625,
"learning_rate": 1.2257728238736468e-05,
"loss": -0.263,
"reward": 0.506696455180645,
"reward_std": 0.22405631840229034,
"rewards/embodied_math": 0.1183035783469677,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3883928805589676,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1110.9710388183594,
"epoch": 0.4878863826232247,
"grad_norm": 1.2310246229171753,
"kl": 3.564453125,
"learning_rate": 1.2143804489744941e-05,
"loss": -0.2307,
"reward": 0.5100446715950966,
"reward_std": 0.17644834145903587,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4029018059372902,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1189.0067443847656,
"epoch": 0.49122807017543857,
"grad_norm": 1.516117811203003,
"kl": 2.3984375,
"learning_rate": 1.2029588342061623e-05,
"loss": -0.1141,
"reward": 0.4481026902794838,
"reward_std": 0.1671721525490284,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4101562649011612,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1152.7366638183594,
"epoch": 0.49456975772765244,
"grad_norm": 1.7402369976043701,
"kl": 3.30078125,
"learning_rate": 1.1915095373903789e-05,
"loss": -0.1621,
"reward": 0.5251116380095482,
"reward_std": 0.1959940418601036,
"rewards/embodied_math": 0.145089291036129,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3800223469734192,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1159.7254943847656,
"epoch": 0.4979114452798663,
"grad_norm": 3.0580763816833496,
"kl": 3.0390625,
"learning_rate": 1.1800341201244954e-05,
"loss": -0.1601,
"reward": 0.424107164144516,
"reward_std": 0.18527107685804367,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.388392873108387,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1173.6719055175781,
"epoch": 0.5012531328320802,
"grad_norm": 4.800704479217529,
"kl": 2.416015625,
"learning_rate": 1.1685341475684935e-05,
"loss": -0.1069,
"reward": 0.412388414144516,
"reward_std": 0.18909326568245888,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.376674123108387,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1095.2187957763672,
"epoch": 0.504594820384294,
"grad_norm": 14.275434494018555,
"kl": 4.41796875,
"learning_rate": 1.15701118823151e-05,
"loss": -0.1452,
"reward": 0.4190848395228386,
"reward_std": 0.20743219926953316,
"rewards/embodied_math": 0.0736607164144516,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3454241156578064,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1133.5447082519531,
"epoch": 0.5079365079365079,
"grad_norm": 8.570171356201172,
"kl": 4.53125,
"learning_rate": 1.1454668137579059e-05,
"loss": -0.1404,
"reward": 0.483816996216774,
"reward_std": 0.20593848451972008,
"rewards/embodied_math": 0.10937500488944352,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3744419813156128,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1146.6652526855469,
"epoch": 0.5112781954887218,
"grad_norm": 23.04602813720703,
"kl": 2.85546875,
"learning_rate": 1.1339025987129033e-05,
"loss": -0.1318,
"reward": 0.4670759066939354,
"reward_std": 0.2194213978946209,
"rewards/embodied_math": 0.08482143143191934,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3822544813156128,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1122.2768249511719,
"epoch": 0.5146198830409356,
"grad_norm": 2.594608783721924,
"kl": 2.466796875,
"learning_rate": 1.1223201203678289e-05,
"loss": -0.2029,
"reward": 0.4525669887661934,
"reward_std": 0.1940429024398327,
"rewards/embodied_math": 0.0714285746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3811384066939354,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1132.1786499023438,
"epoch": 0.5179615705931495,
"grad_norm": 2.2763757705688477,
"kl": 2.4453125,
"learning_rate": 1.1107209584849845e-05,
"loss": -0.1703,
"reward": 0.4380580633878708,
"reward_std": 0.2103111855685711,
"rewards/embodied_math": 0.046875003492459655,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3911830484867096,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1131.6138610839844,
"epoch": 0.5213032581453634,
"grad_norm": 1.8114477396011353,
"kl": 2.7421875,
"learning_rate": 1.0991066951021802e-05,
"loss": -0.1733,
"reward": 0.5206473395228386,
"reward_std": 0.19661833345890045,
"rewards/embodied_math": 0.11607143143191934,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4045759066939354,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1162.8169860839844,
"epoch": 0.5246449456975772,
"grad_norm": 2.25466251373291,
"kl": 1.982421875,
"learning_rate": 1.0874789143169569e-05,
"loss": -0.1559,
"reward": 0.4871651902794838,
"reward_std": 0.20492572709918022,
"rewards/embodied_math": 0.08258928824216127,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4045759066939354,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1192.7478332519531,
"epoch": 0.5279866332497911,
"grad_norm": 1.1387535333633423,
"kl": 1.3837890625,
"learning_rate": 1.0758392020705258e-05,
"loss": -0.1423,
"reward": 0.5078125223517418,
"reward_std": 0.1970166452229023,
"rewards/embodied_math": 0.0937500037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4140625223517418,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1229.7813110351562,
"epoch": 0.531328320802005,
"grad_norm": 0.46632248163223267,
"kl": 1.669921875,
"learning_rate": 1.0641891459314598e-05,
"loss": -0.0904,
"reward": 0.4726562649011612,
"reward_std": 0.19845933839678764,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4369419887661934,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 1248.58935546875,
"epoch": 0.5346700083542189,
"grad_norm": 1.9050358533859253,
"kl": 1.359375,
"learning_rate": 1.0525303348791599e-05,
"loss": -0.063,
"reward": 0.5089285895228386,
"reward_std": 0.19919028133153915,
"rewards/embodied_math": 0.07142857555299997,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375000223517418,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1238.6652221679688,
"epoch": 0.5380116959064327,
"grad_norm": 39.550594329833984,
"kl": 13.736328125,
"learning_rate": 1.0408643590871312e-05,
"loss": -0.0245,
"reward": 0.5340401977300644,
"reward_std": 0.21040942147374153,
"rewards/embodied_math": 0.0937500037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4402901902794838,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1222.1406555175781,
"epoch": 0.5413533834586466,
"grad_norm": 2.6502327919006348,
"kl": 2.689697265625,
"learning_rate": 1.029192809706095e-05,
"loss": -0.0896,
"reward": 0.6010044887661934,
"reward_std": 0.18520404025912285,
"rewards/embodied_math": 0.1629464365541935,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4380580559372902,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1224.9911193847656,
"epoch": 0.5446950710108605,
"grad_norm": 5.2321457862854,
"kl": 2.138671875,
"learning_rate": 1.017517278646968e-05,
"loss": -0.0588,
"reward": 0.5859375298023224,
"reward_std": 0.2227453775703907,
"rewards/embodied_math": 0.12723214738070965,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4587053805589676,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1183.5982666015625,
"epoch": 0.5480367585630743,
"grad_norm": 1.016186237335205,
"kl": 2.61328125,
"learning_rate": 1.0058393583637376e-05,
"loss": -0.1255,
"reward": 0.490513414144516,
"reward_std": 0.23803862929344177,
"rewards/embodied_math": 0.049107146449387074,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4414062723517418,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1201.27685546875,
"epoch": 0.5513784461152882,
"grad_norm": 0.6124681830406189,
"kl": 2.61328125,
"learning_rate": 9.94160641636263e-06,
"loss": -0.1277,
"reward": 0.4882812723517418,
"reward_std": 0.18272774666547775,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4503348395228386,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1201.8638916015625,
"epoch": 0.5547201336675021,
"grad_norm": 0.4987063705921173,
"kl": 2.947265625,
"learning_rate": 9.824827213530323e-06,
"loss": -0.1437,
"reward": 0.5530134215950966,
"reward_std": 0.19792700558900833,
"rewards/embodied_math": 0.04017857206054032,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5128348544239998,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1198.6294860839844,
"epoch": 0.5580618212197159,
"grad_norm": 0.23080599308013916,
"kl": 2.048828125,
"learning_rate": 9.708071902939053e-06,
"loss": -0.1569,
"reward": 0.5262277126312256,
"reward_std": 0.22177628055214882,
"rewards/embodied_math": 0.008928572060540318,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5172991305589676,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1178.0647583007812,
"epoch": 0.5614035087719298,
"grad_norm": 0.2520909309387207,
"kl": 2.63671875,
"learning_rate": 9.591356409128691e-06,
"loss": -0.1652,
"reward": 0.581473246216774,
"reward_std": 0.27404002100229263,
"rewards/embodied_math": 0.0558035746216774,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.525669664144516,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1197.9822082519531,
"epoch": 0.5647451963241437,
"grad_norm": 0.18577782809734344,
"kl": 2.50390625,
"learning_rate": 9.474696651208406e-06,
"loss": -0.1337,
"reward": 0.612723246216774,
"reward_std": 0.2510472200810909,
"rewards/embodied_math": 0.06919643003493547,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5435268133878708,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 1176.0268249511719,
"epoch": 0.5680868838763575,
"grad_norm": 0.1762024164199829,
"kl": 2.5625,
"learning_rate": 9.358108540685406e-06,
"loss": -0.176,
"reward": 0.5552455633878708,
"reward_std": 0.2156192846596241,
"rewards/embodied_math": 0.03794643026776612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5172991305589676,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1211.8772583007812,
"epoch": 0.5714285714285714,
"grad_norm": 0.7674791216850281,
"kl": 2.2890625,
"learning_rate": 9.241607979294745e-06,
"loss": -0.11,
"reward": 0.6635044813156128,
"reward_std": 0.27448395639657974,
"rewards/embodied_math": 0.11830357694998384,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.545200914144516,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1206.4286193847656,
"epoch": 0.5747702589807853,
"grad_norm": 2.854034185409546,
"kl": 2.60546875,
"learning_rate": 9.125210856830433e-06,
"loss": -0.1176,
"reward": 0.6149553954601288,
"reward_std": 0.24333922192454338,
"rewards/embodied_math": 0.046875000931322575,
"rewards/format_reward": 0.0022321429569274187,
"rewards/tag_count_reward": 0.5658482313156128,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1170.6652526855469,
"epoch": 0.5781119465329991,
"grad_norm": 1.9103987216949463,
"kl": 3.7734375,
"learning_rate": 9.0089330489782e-06,
"loss": -0.1914,
"reward": 0.7047991454601288,
"reward_std": 0.22894323244690895,
"rewards/embodied_math": 0.1517857201397419,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.553013414144516,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1144.2031860351562,
"epoch": 0.581453634085213,
"grad_norm": 1.3213469982147217,
"kl": 3.146484375,
"learning_rate": 8.892790415150161e-06,
"loss": -0.1989,
"reward": 0.6216517984867096,
"reward_std": 0.25715912505984306,
"rewards/embodied_math": 0.08258928963914514,
"rewards/format_reward": 0.0022321429569274187,
"rewards/tag_count_reward": 0.5368303805589676,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1199.1563110351562,
"epoch": 0.5847953216374269,
"grad_norm": 1.474984884262085,
"kl": 2.3408203125,
"learning_rate": 8.776798796321715e-06,
"loss": -0.0847,
"reward": 0.711495578289032,
"reward_std": 0.29080621898174286,
"rewards/embodied_math": 0.12723214738070965,
"rewards/format_reward": 0.008928572060540318,
"rewards/tag_count_reward": 0.5753348469734192,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1154.560302734375,
"epoch": 0.5881370091896407,
"grad_norm": 0.5839706659317017,
"kl": 3.8046875,
"learning_rate": 8.66097401287097e-06,
"loss": -0.2043,
"reward": 0.7008928954601288,
"reward_std": 0.24622002243995667,
"rewards/embodied_math": 0.16741072200238705,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.533482164144516,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1153.7812805175781,
"epoch": 0.5914786967418546,
"grad_norm": 0.2968985438346863,
"kl": 2.97265625,
"learning_rate": 8.545331862420945e-06,
"loss": -0.1888,
"reward": 0.5625000149011612,
"reward_std": 0.26313546672463417,
"rewards/embodied_math": 0.07812500488944352,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4843750149011612,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1164.2590026855469,
"epoch": 0.5948203842940685,
"grad_norm": 0.29861223697662354,
"kl": 2.93359375,
"learning_rate": 8.429888117684904e-06,
"loss": -0.1956,
"reward": 0.5970982313156128,
"reward_std": 0.18818846344947815,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.489955373108387,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1210.9576721191406,
"epoch": 0.5981620718462823,
"grad_norm": 0.3574662506580353,
"kl": 1.658203125,
"learning_rate": 8.314658524315068e-06,
"loss": -0.1221,
"reward": 0.6149553805589676,
"reward_std": 0.22238216921687126,
"rewards/embodied_math": 0.12723214738070965,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4877232387661934,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1205.4754943847656,
"epoch": 0.6015037593984962,
"grad_norm": 0.3439498543739319,
"kl": 1.9462890625,
"learning_rate": 8.199658798755048e-06,
"loss": -0.1349,
"reward": 0.6132812798023224,
"reward_std": 0.23523807525634766,
"rewards/embodied_math": 0.09151786379516125,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5217634066939354,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1218.1607666015625,
"epoch": 0.6048454469507101,
"grad_norm": 0.32480064034461975,
"kl": 1.892578125,
"learning_rate": 8.084904626096211e-06,
"loss": -0.1013,
"reward": 0.5898437798023224,
"reward_std": 0.2543032169342041,
"rewards/embodied_math": 0.06250000186264515,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5273437798023224,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1199.7500305175781,
"epoch": 0.6081871345029239,
"grad_norm": 0.3220880925655365,
"kl": 2.478515625,
"learning_rate": 7.970411657938382e-06,
"loss": -0.1372,
"reward": 0.7087053805589676,
"reward_std": 0.27387310564517975,
"rewards/embodied_math": 0.12946429033763707,
"rewards/format_reward": 0.004464285913854837,
"rewards/tag_count_reward": 0.5747767984867096,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1236.7522888183594,
"epoch": 0.6115288220551378,
"grad_norm": 1.8340140581130981,
"kl": 9.578125,
"learning_rate": 7.856195510255059e-06,
"loss": -0.0849,
"reward": 0.7823660969734192,
"reward_std": 0.24205785244703293,
"rewards/embodied_math": 0.17633928917348385,
"rewards/format_reward": 0.0022321429569274187,
"rewards/tag_count_reward": 0.6037946492433548,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1228.3147888183594,
"epoch": 0.6148705096073517,
"grad_norm": 0.21496865153312683,
"kl": 1.609375,
"learning_rate": 7.742271761263537e-06,
"loss": -0.0914,
"reward": 0.7248884439468384,
"reward_std": 0.2794860415160656,
"rewards/embodied_math": 0.1272321455180645,
"rewards/format_reward": 0.011160715017467737,
"rewards/tag_count_reward": 0.5864955633878708,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1247.9107666015625,
"epoch": 0.6182121971595655,
"grad_norm": 0.20037777721881866,
"kl": 1.1376953125,
"learning_rate": 7.628655949300133e-06,
"loss": -0.0576,
"reward": 0.7533482611179352,
"reward_std": 0.2807689905166626,
"rewards/embodied_math": 0.14285714668221772,
"rewards/format_reward": 0.01562500069849193,
"rewards/tag_count_reward": 0.594866082072258,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1231.3705749511719,
"epoch": 0.6215538847117794,
"grad_norm": 2.5596811771392822,
"kl": 1.927734375,
"learning_rate": 7.51536357070089e-06,
"loss": -0.0724,
"reward": 0.6902902126312256,
"reward_std": 0.37761393934488297,
"rewards/embodied_math": 0.04017857415601611,
"rewards/format_reward": 0.05357143096625805,
"rewards/tag_count_reward": 0.5965402126312256,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1210.7098693847656,
"epoch": 0.6248955722639933,
"grad_norm": 0.7288307547569275,
"kl": 2.509765625,
"learning_rate": 7.402410077687994e-06,
"loss": -0.0863,
"reward": 0.9045759290456772,
"reward_std": 0.4523550197482109,
"rewards/embodied_math": 0.2187500111758709,
"rewards/format_reward": 0.08705357555299997,
"rewards/tag_count_reward": 0.5987723469734192,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 1234.4465026855469,
"epoch": 0.6282372598162071,
"grad_norm": 0.8198930025100708,
"kl": 2.033203125,
"learning_rate": 7.2898108762622e-06,
"loss": -0.0534,
"reward": 0.8792080730199814,
"reward_std": 0.5382220521569252,
"rewards/embodied_math": 0.023181250551715493,
"rewards/format_reward": 0.2165178693830967,
"rewards/tag_count_reward": 0.6395089477300644,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1267.7723999023438,
"epoch": 0.631578947368421,
"grad_norm": 0.2449118196964264,
"kl": 1.06103515625,
"learning_rate": 7.1775813241015755e-06,
"loss": -0.0225,
"reward": 1.1289062798023224,
"reward_std": 0.6006747037172318,
"rewards/embodied_math": 0.12053572200238705,
"rewards/format_reward": 0.3660714402794838,
"rewards/tag_count_reward": 0.6422991305589676,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 1262.41748046875,
"epoch": 0.6349206349206349,
"grad_norm": 0.24977703392505646,
"kl": 1.0390625,
"learning_rate": 7.065736728466832e-06,
"loss": -0.0361,
"reward": 1.2664072215557098,
"reward_std": 0.6163594722747803,
"rewards/embodied_math": 0.11852768177050166,
"rewards/format_reward": 0.5312500149011612,
"rewards/tag_count_reward": 0.616629496216774,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1245.0402221679688,
"epoch": 0.6382623224728488,
"grad_norm": 0.4997389018535614,
"kl": 0.80029296875,
"learning_rate": 6.9542923441135226e-06,
"loss": -0.0598,
"reward": 1.3069196939468384,
"reward_std": 0.595898911356926,
"rewards/embodied_math": 0.1361607201397419,
"rewards/format_reward": 0.5825893133878708,
"rewards/tag_count_reward": 0.5881696492433548,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1229.638427734375,
"epoch": 0.6416040100250626,
"grad_norm": 0.4335998594760895,
"kl": 1.01171875,
"learning_rate": 6.843263371211415e-06,
"loss": -0.0753,
"reward": 1.2984784245491028,
"reward_std": 0.6277011036872864,
"rewards/embodied_math": 0.07470602937974036,
"rewards/format_reward": 0.6294643133878708,
"rewards/tag_count_reward": 0.5943080633878708,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1256.2232666015625,
"epoch": 0.6449456975772765,
"grad_norm": 0.9794358015060425,
"kl": 0.58349609375,
"learning_rate": 6.732664953271305e-06,
"loss": -0.0549,
"reward": 1.3554688096046448,
"reward_std": 0.5859769731760025,
"rewards/embodied_math": 0.0379464291036129,
"rewards/format_reward": 0.6919643133878708,
"rewards/tag_count_reward": 0.6255580633878708,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1249.2857666015625,
"epoch": 0.6482873851294904,
"grad_norm": 0.6576189994812012,
"kl": 0.61962890625,
"learning_rate": 6.622512175079543e-06,
"loss": -0.0609,
"reward": 1.4308036267757416,
"reward_std": 0.6184276640415192,
"rewards/embodied_math": 0.04687500186264515,
"rewards/format_reward": 0.7254464477300644,
"rewards/tag_count_reward": 0.658482164144516,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1265.3482666015625,
"epoch": 0.6516290726817042,
"grad_norm": 0.4225464165210724,
"kl": 0.84716796875,
"learning_rate": 6.512820060640608e-06,
"loss": -0.0428,
"reward": 1.4782367050647736,
"reward_std": 0.5665149390697479,
"rewards/embodied_math": 0.049107145285233855,
"rewards/format_reward": 0.752232164144516,
"rewards/tag_count_reward": 0.6768973469734192,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1251.8259582519531,
"epoch": 0.6549707602339181,
"grad_norm": 0.6224950551986694,
"kl": 0.69775390625,
"learning_rate": 6.403603571127921e-06,
"loss": -0.0696,
"reward": 1.6000739634037018,
"reward_std": 0.5625407323241234,
"rewards/embodied_math": 0.1296497832518071,
"rewards/format_reward": 0.7924107611179352,
"rewards/tag_count_reward": 0.6780134290456772,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1253.7812805175781,
"epoch": 0.658312447786132,
"grad_norm": 0.3629626929759979,
"kl": 0.90234375,
"learning_rate": 6.294877602843276e-06,
"loss": -0.0731,
"reward": 1.6656273305416107,
"reward_std": 0.4725663438439369,
"rewards/embodied_math": 0.08750223537208512,
"rewards/format_reward": 0.8660714775323868,
"rewards/tag_count_reward": 0.7120536118745804,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1272.6205749511719,
"epoch": 0.6616541353383458,
"grad_norm": 0.336520254611969,
"kl": 1.02685546875,
"learning_rate": 6.186656985185078e-06,
"loss": -0.0326,
"reward": 1.6741072237491608,
"reward_std": 0.4543350860476494,
"rewards/embodied_math": 0.06919643003493547,
"rewards/format_reward": 0.8660714626312256,
"rewards/tag_count_reward": 0.7388393133878708,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1280.7210388183594,
"epoch": 0.6649958228905597,
"grad_norm": 0.1395803987979889,
"kl": 0.560546875,
"learning_rate": 6.078956478625743e-06,
"loss": -0.0183,
"reward": 1.7165179550647736,
"reward_std": 0.5158629938960075,
"rewards/embodied_math": 0.13392857648432255,
"rewards/format_reward": 0.8571428805589676,
"rewards/tag_count_reward": 0.7254464626312256,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 1254.1563110351562,
"epoch": 0.6683375104427736,
"grad_norm": 0.6481565833091736,
"kl": 1.6591796875,
"learning_rate": 5.971790772698467e-06,
"loss": -0.0423,
"reward": 1.717633992433548,
"reward_std": 0.47620728611946106,
"rewards/embodied_math": 0.1428571492433548,
"rewards/format_reward": 0.8526786118745804,
"rewards/tag_count_reward": 0.722098246216774,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1249.982177734375,
"epoch": 0.6716791979949874,
"grad_norm": 0.37232309579849243,
"kl": 1.18359375,
"learning_rate": 5.865174483993697e-06,
"loss": -0.0609,
"reward": 1.6735491752624512,
"reward_std": 0.6033786237239838,
"rewards/embodied_math": 0.13392857648432255,
"rewards/format_reward": 0.8058036118745804,
"rewards/tag_count_reward": 0.7338170111179352,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1251.3504943847656,
"epoch": 0.6750208855472013,
"grad_norm": 0.5929485559463501,
"kl": 1.576171875,
"learning_rate": 5.759122154165528e-06,
"loss": -0.0636,
"reward": 1.6389509439468384,
"reward_std": 0.5772095322608948,
"rewards/embodied_math": 0.10937500605359674,
"rewards/format_reward": 0.8169643431901932,
"rewards/tag_count_reward": 0.7126116454601288,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1251.3839721679688,
"epoch": 0.6783625730994152,
"grad_norm": 0.7571573853492737,
"kl": 3.2109375,
"learning_rate": 5.653648247948342e-06,
"loss": -0.0644,
"reward": 1.7477679550647736,
"reward_std": 0.589836597442627,
"rewards/embodied_math": 0.2031250074505806,
"rewards/format_reward": 0.8191964626312256,
"rewards/tag_count_reward": 0.7254464477300644,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1251.7701416015625,
"epoch": 0.681704260651629,
"grad_norm": 0.28568902611732483,
"kl": 1.71484375,
"learning_rate": 5.548767151183912e-06,
"loss": -0.0616,
"reward": 1.698102742433548,
"reward_std": 0.5798378437757492,
"rewards/embodied_math": 0.160714291036129,
"rewards/format_reward": 0.7991071790456772,
"rewards/tag_count_reward": 0.7382812798023224,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1244.33935546875,
"epoch": 0.6850459482038429,
"grad_norm": 0.21788516640663147,
"kl": 1.83984375,
"learning_rate": 5.444493168859304e-06,
"loss": -0.0808,
"reward": 1.5507813096046448,
"reward_std": 0.5835923999547958,
"rewards/embodied_math": 0.011160714784637094,
"rewards/format_reward": 0.7991071790456772,
"rewards/tag_count_reward": 0.7405134439468384,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1229.9822082519531,
"epoch": 0.6883876357560568,
"grad_norm": 0.21175533533096313,
"kl": 1.4609375,
"learning_rate": 5.340840523155769e-06,
"loss": -0.092,
"reward": 1.5630581080913544,
"reward_std": 0.6021421700716019,
"rewards/embodied_math": 0.0781250037252903,
"rewards/format_reward": 0.7723214775323868,
"rewards/tag_count_reward": 0.7126116305589676,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1232.7411193847656,
"epoch": 0.6917293233082706,
"grad_norm": 2.031179904937744,
"kl": 1.998046875,
"learning_rate": 5.237823351508953e-06,
"loss": -0.0844,
"reward": 1.6768973767757416,
"reward_std": 0.5545762553811073,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.8191964626312256,
"rewards/tag_count_reward": 0.750558078289032,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1244.7433776855469,
"epoch": 0.6950710108604845,
"grad_norm": 0.4047699570655823,
"kl": 1.185546875,
"learning_rate": 5.135455704680646e-06,
"loss": -0.0906,
"reward": 1.6255581378936768,
"reward_std": 0.5186707675457001,
"rewards/embodied_math": 0.026785715483129025,
"rewards/format_reward": 0.8504464626312256,
"rewards/tag_count_reward": 0.7483259290456772,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1263.3215026855469,
"epoch": 0.6984126984126984,
"grad_norm": 0.38765090703964233,
"kl": 1.130859375,
"learning_rate": 5.03375154484238e-06,
"loss": -0.0635,
"reward": 1.6350446939468384,
"reward_std": 0.5284169614315033,
"rewards/embodied_math": 0.0267857164144516,
"rewards/format_reward": 0.8526786118745804,
"rewards/tag_count_reward": 0.7555803954601288,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1262.1652221679688,
"epoch": 0.7017543859649122,
"grad_norm": 0.4771866798400879,
"kl": 1.1787109375,
"learning_rate": 4.932724743671089e-06,
"loss": -0.0582,
"reward": 1.6517857909202576,
"reward_std": 0.5431385114789009,
"rewards/embodied_math": 0.08482143259607255,
"rewards/format_reward": 0.8325893431901932,
"rewards/tag_count_reward": 0.7343750447034836,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1237.1361999511719,
"epoch": 0.7050960735171261,
"grad_norm": 0.4817124009132385,
"kl": 1.5126953125,
"learning_rate": 4.832389080457118e-06,
"loss": -0.1114,
"reward": 1.7349331080913544,
"reward_std": 0.5682315081357956,
"rewards/embodied_math": 0.1718750111758709,
"rewards/format_reward": 0.8459821790456772,
"rewards/tag_count_reward": 0.7170759290456772,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1250.9888916015625,
"epoch": 0.70843776106934,
"grad_norm": 0.2896386682987213,
"kl": 1.2412109375,
"learning_rate": 4.732758240224819e-06,
"loss": -0.0715,
"reward": 1.5792411267757416,
"reward_std": 0.5823845863342285,
"rewards/embodied_math": 0.03794642980210483,
"rewards/format_reward": 0.8415178954601288,
"rewards/tag_count_reward": 0.699776828289032,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1255.7433471679688,
"epoch": 0.7117794486215538,
"grad_norm": 0.2107439637184143,
"kl": 1.296875,
"learning_rate": 4.633845811866044e-06,
"loss": -0.0616,
"reward": 1.6333706080913544,
"reward_std": 0.49481259286403656,
"rewards/embodied_math": 0.0691964291036129,
"rewards/format_reward": 0.8482143431901932,
"rewards/tag_count_reward": 0.7159598469734192,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1255.4933471679688,
"epoch": 0.7151211361737677,
"grad_norm": 0.13716629147529602,
"kl": 1.3798828125,
"learning_rate": 4.535665286286691e-06,
"loss": -0.0621,
"reward": 1.5842634737491608,
"reward_std": 0.49601756781339645,
"rewards/embodied_math": 0.015625000931322575,
"rewards/format_reward": 0.8459821939468384,
"rewards/tag_count_reward": 0.7226562798023224,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1259.1473388671875,
"epoch": 0.7184628237259816,
"grad_norm": 0.2403624951839447,
"kl": 1.541015625,
"learning_rate": 4.438230054566678e-06,
"loss": -0.0537,
"reward": 1.6422991454601288,
"reward_std": 0.5113217607140541,
"rewards/embodied_math": 0.07589286053553224,
"rewards/format_reward": 0.8593750447034836,
"rewards/tag_count_reward": 0.7070312798023224,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1263.2745971679688,
"epoch": 0.7218045112781954,
"grad_norm": 0.29377609491348267,
"kl": 1.101806640625,
"learning_rate": 4.34155340613348e-06,
"loss": -0.0438,
"reward": 1.5970982909202576,
"reward_std": 0.5749376714229584,
"rewards/embodied_math": 0.07142857369035482,
"rewards/format_reward": 0.8370536118745804,
"rewards/tag_count_reward": 0.6886161118745804,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1258.6473693847656,
"epoch": 0.7251461988304093,
"grad_norm": 0.30173197388648987,
"kl": 1.80078125,
"learning_rate": 4.245648526949568e-06,
"loss": -0.0573,
"reward": 1.6188616752624512,
"reward_std": 0.4864480197429657,
"rewards/embodied_math": 0.0357142873108387,
"rewards/format_reward": 0.8638393133878708,
"rewards/tag_count_reward": 0.719308078289032,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1242.0201416015625,
"epoch": 0.7284878863826232,
"grad_norm": 0.1726818084716797,
"kl": 2.19921875,
"learning_rate": 4.150528497713911e-06,
"loss": -0.0906,
"reward": 1.6897322237491608,
"reward_std": 0.5109899789094925,
"rewards/embodied_math": 0.08705357578583062,
"rewards/format_reward": 0.8839286118745804,
"rewards/tag_count_reward": 0.7187500447034836,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1235.4732666015625,
"epoch": 0.731829573934837,
"grad_norm": 0.18735679984092712,
"kl": 1.8564453125,
"learning_rate": 4.056206292077916e-06,
"loss": -0.0943,
"reward": 1.797991156578064,
"reward_std": 0.48637502640485764,
"rewards/embodied_math": 0.2209821562282741,
"rewards/format_reward": 0.863839328289032,
"rewards/tag_count_reward": 0.7131696790456772,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 1232.6250610351562,
"epoch": 0.7351712614870509,
"grad_norm": 0.27614831924438477,
"kl": 2.23828125,
"learning_rate": 3.96269477487588e-06,
"loss": -0.0937,
"reward": 1.7008929252624512,
"reward_std": 0.5516977161169052,
"rewards/embodied_math": 0.1562500074505806,
"rewards/format_reward": 0.8415178954601288,
"rewards/tag_count_reward": 0.7031250298023224,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1269.6986999511719,
"epoch": 0.7385129490392648,
"grad_norm": 0.1375630497932434,
"kl": 0.9873046875,
"learning_rate": 3.870006700370348e-06,
"loss": -0.0571,
"reward": 1.7500000596046448,
"reward_std": 0.4832083433866501,
"rewards/embodied_math": 0.12500000605359674,
"rewards/format_reward": 0.8883928954601288,
"rewards/tag_count_reward": 0.7366071790456772,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1253.5558776855469,
"epoch": 0.7418546365914787,
"grad_norm": 0.1821809709072113,
"kl": 1.4970703125,
"learning_rate": 3.778154710512513e-06,
"loss": -0.0626,
"reward": 1.7126117050647736,
"reward_std": 0.5748995840549469,
"rewards/embodied_math": 0.12276786239817739,
"rewards/format_reward": 0.848214328289032,
"rewards/tag_count_reward": 0.7416294813156128,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1227.3995971679688,
"epoch": 0.7451963241436925,
"grad_norm": 0.4636569619178772,
"kl": 2.15625,
"learning_rate": 3.687151333217952e-06,
"loss": -0.108,
"reward": 1.6289063096046448,
"reward_std": 0.6172408014535904,
"rewards/embodied_math": 0.09821428824216127,
"rewards/format_reward": 0.81026791036129,
"rewards/tag_count_reward": 0.7204241305589676,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1259.0357666015625,
"epoch": 0.7485380116959064,
"grad_norm": 1.434766173362732,
"kl": 1.8427734375,
"learning_rate": 3.597008980657929e-06,
"loss": -0.0383,
"reward": 1.7098215222358704,
"reward_std": 0.5351722091436386,
"rewards/embodied_math": 0.0959821455180645,
"rewards/format_reward": 0.8660714775323868,
"rewards/tag_count_reward": 0.7477678954601288,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1248.3951721191406,
"epoch": 0.7518796992481203,
"grad_norm": 0.17362454533576965,
"kl": 1.62890625,
"learning_rate": 3.5077399475664474e-06,
"loss": -0.0778,
"reward": 1.7622768580913544,
"reward_std": 0.4962947890162468,
"rewards/embodied_math": 0.13392857648432255,
"rewards/format_reward": 0.85714291036129,
"rewards/tag_count_reward": 0.7712053954601288,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1278.1116638183594,
"epoch": 0.7552213868003341,
"grad_norm": 0.1242133304476738,
"kl": 0.9599609375,
"learning_rate": 3.419356409563361e-06,
"loss": -0.0309,
"reward": 1.774553656578064,
"reward_std": 0.5362864062190056,
"rewards/embodied_math": 0.1517857201397419,
"rewards/format_reward": 0.8504464626312256,
"rewards/tag_count_reward": 0.7723214775323868,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1245.2143249511719,
"epoch": 0.758563074352548,
"grad_norm": 0.2596234083175659,
"kl": 1.728515625,
"learning_rate": 3.331870421493688e-06,
"loss": -0.0695,
"reward": 1.6724331080913544,
"reward_std": 0.5726595818996429,
"rewards/embodied_math": 0.08035714598372579,
"rewards/format_reward": 0.8303571790456772,
"rewards/tag_count_reward": 0.7617187798023224,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1236.2857666015625,
"epoch": 0.7619047619047619,
"grad_norm": 0.6998271346092224,
"kl": 1.978515625,
"learning_rate": 3.245293915783444e-06,
"loss": -0.0877,
"reward": 1.684151828289032,
"reward_std": 0.6410565972328186,
"rewards/embodied_math": 0.11383929406292737,
"rewards/format_reward": 0.8125000298023224,
"rewards/tag_count_reward": 0.7578125149011612,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1249.9911499023438,
"epoch": 0.7652464494569757,
"grad_norm": 0.3117403984069824,
"kl": 1.6494140625,
"learning_rate": 3.1596387008121386e-06,
"loss": -0.0699,
"reward": 1.7248884737491608,
"reward_std": 0.6069164872169495,
"rewards/embodied_math": 0.11607143469154835,
"rewards/format_reward": 0.823660746216774,
"rewards/tag_count_reward": 0.7851562798023224,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 1263.8147888183594,
"epoch": 0.7685881370091896,
"grad_norm": 0.4184093177318573,
"kl": 1.0625,
"learning_rate": 3.074916459302211e-06,
"loss": -0.0566,
"reward": 1.8108259737491608,
"reward_std": 0.5378059893846512,
"rewards/embodied_math": 0.16517857927829027,
"rewards/format_reward": 0.848214328289032,
"rewards/tag_count_reward": 0.7974330633878708,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1239.7857666015625,
"epoch": 0.7719298245614035,
"grad_norm": 1.0474584102630615,
"kl": 2.328125,
"learning_rate": 2.9911387467255737e-06,
"loss": -0.0811,
"reward": 1.7176340222358704,
"reward_std": 0.5790911167860031,
"rewards/embodied_math": 0.10714286006987095,
"rewards/format_reward": 0.8169643133878708,
"rewards/tag_count_reward": 0.793526828289032,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1212.7254943847656,
"epoch": 0.7752715121136173,
"grad_norm": 0.3452969789505005,
"kl": 1.599609375,
"learning_rate": 2.9083169897275554e-06,
"loss": -0.117,
"reward": 1.6752232909202576,
"reward_std": 0.5666229277849197,
"rewards/embodied_math": 0.08035714668221772,
"rewards/format_reward": 0.8236607611179352,
"rewards/tag_count_reward": 0.7712053805589676,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1239.716552734375,
"epoch": 0.7786131996658312,
"grad_norm": 0.25167980790138245,
"kl": 1.430419921875,
"learning_rate": 2.82646248456839e-06,
"loss": -0.0755,
"reward": 1.72991082072258,
"reward_std": 0.6188212782144547,
"rewards/embodied_math": 0.12500000558793545,
"rewards/format_reward": 0.8169643133878708,
"rewards/tag_count_reward": 0.7879464775323868,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1248.3750610351562,
"epoch": 0.7819548872180451,
"grad_norm": 0.3067150413990021,
"kl": 1.31640625,
"learning_rate": 2.745586395582481e-06,
"loss": -0.0627,
"reward": 1.7762278020381927,
"reward_std": 0.5344242751598358,
"rewards/embodied_math": 0.10267857648432255,
"rewards/format_reward": 0.879464328289032,
"rewards/tag_count_reward": 0.7940848618745804,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1239.2902526855469,
"epoch": 0.7852965747702589,
"grad_norm": 0.3017171323299408,
"kl": 2.639892578125,
"learning_rate": 2.665699753655684e-06,
"loss": -0.0628,
"reward": 1.735491156578064,
"reward_std": 0.5408925563097,
"rewards/embodied_math": 0.07589285913854837,
"rewards/format_reward": 0.863839328289032,
"rewards/tag_count_reward": 0.7957589626312256,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1203.1183471679688,
"epoch": 0.7886382623224728,
"grad_norm": 1.1649154424667358,
"kl": 2.390625,
"learning_rate": 2.586813454720771e-06,
"loss": -0.1158,
"reward": 1.6802456080913544,
"reward_std": 0.5190064385533333,
"rewards/embodied_math": 0.03348214412108064,
"rewards/format_reward": 0.8683036118745804,
"rewards/tag_count_reward": 0.7784598469734192,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1228.2589721679688,
"epoch": 0.7919799498746867,
"grad_norm": 0.4925525188446045,
"kl": 2.134765625,
"learning_rate": 2.5089382582712995e-06,
"loss": -0.0895,
"reward": 1.7260045409202576,
"reward_std": 0.4922590032219887,
"rewards/embodied_math": 0.058035718742758036,
"rewards/format_reward": 0.8839286118745804,
"rewards/tag_count_reward": 0.7840402126312256,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1212.357177734375,
"epoch": 0.7953216374269005,
"grad_norm": 0.24148394167423248,
"kl": 2.8828125,
"learning_rate": 2.4320847858941167e-06,
"loss": -0.1478,
"reward": 1.7299107909202576,
"reward_std": 0.5158706456422806,
"rewards/embodied_math": 0.11160714668221772,
"rewards/format_reward": 0.870535746216774,
"rewards/tag_count_reward": 0.7477678954601288,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1228.7500305175781,
"epoch": 0.7986633249791144,
"grad_norm": 0.4909496307373047,
"kl": 2.439453125,
"learning_rate": 2.3562635198206476e-06,
"loss": -0.1085,
"reward": 1.739397406578064,
"reward_std": 0.53496253490448,
"rewards/embodied_math": 0.1004464328289032,
"rewards/format_reward": 0.8727678954601288,
"rewards/tag_count_reward": 0.766183078289032,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 1226.5000610351562,
"epoch": 0.8020050125313283,
"grad_norm": 0.1784716248512268,
"kl": 2.171875,
"learning_rate": 2.281484801497186e-06,
"loss": -0.1241,
"reward": 1.6556920111179352,
"reward_std": 0.5279371589422226,
"rewards/embodied_math": 0.013392857741564512,
"rewards/format_reward": 0.877232164144516,
"rewards/tag_count_reward": 0.7650670111179352,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 1210.8616638183594,
"epoch": 0.8053467000835421,
"grad_norm": 0.5043416619300842,
"kl": 3.140625,
"learning_rate": 2.2077588301744234e-06,
"loss": -0.1453,
"reward": 1.6947545409202576,
"reward_std": 0.551237165927887,
"rewards/embodied_math": 0.08928571827709675,
"rewards/format_reward": 0.8593750298023224,
"rewards/tag_count_reward": 0.7460937798023224,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 1213.3147583007812,
"epoch": 0.808688387635756,
"grad_norm": 0.8992521166801453,
"kl": 3.134765625,
"learning_rate": 2.1350956615163254e-06,
"loss": -0.1302,
"reward": 1.7220982909202576,
"reward_std": 0.5575926005840302,
"rewards/embodied_math": 0.12276786006987095,
"rewards/format_reward": 0.854910746216774,
"rewards/tag_count_reward": 0.744419664144516,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 1236.669677734375,
"epoch": 0.8120300751879699,
"grad_norm": 0.33429309725761414,
"kl": 1.98828125,
"learning_rate": 2.0635052062286323e-06,
"loss": -0.0843,
"reward": 1.7075893580913544,
"reward_std": 0.5474491640925407,
"rewards/embodied_math": 0.0915178619325161,
"rewards/format_reward": 0.8504464775323868,
"rewards/tag_count_reward": 0.7656250298023224,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 1238.6719360351562,
"epoch": 0.8153717627401837,
"grad_norm": 0.5671308040618896,
"kl": 2.73046875,
"learning_rate": 1.992997228707103e-06,
"loss": -0.0982,
"reward": 1.6908482611179352,
"reward_std": 0.5311977565288544,
"rewards/embodied_math": 0.05133928940631449,
"rewards/format_reward": 0.8660714775323868,
"rewards/tag_count_reward": 0.7734375298023224,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1212.2991333007812,
"epoch": 0.8187134502923976,
"grad_norm": 0.2072792798280716,
"kl": 2.033203125,
"learning_rate": 1.923581345705736e-06,
"loss": -0.1452,
"reward": 1.6718750894069672,
"reward_std": 0.5443079173564911,
"rewards/embodied_math": 0.0691964328289032,
"rewards/format_reward": 0.8392857611179352,
"rewards/tag_count_reward": 0.7633928954601288,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 1202.3460388183594,
"epoch": 0.8220551378446115,
"grad_norm": 0.17846164107322693,
"kl": 2.375,
"learning_rate": 1.8552670250251003e-06,
"loss": -0.144,
"reward": 1.8275670111179352,
"reward_std": 0.5471851527690887,
"rewards/embodied_math": 0.2209821492433548,
"rewards/format_reward": 0.839285746216774,
"rewards/tag_count_reward": 0.7672991454601288,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 1229.0536193847656,
"epoch": 0.8253968253968254,
"grad_norm": 0.1999313086271286,
"kl": 1.751953125,
"learning_rate": 1.788063584221017e-06,
"loss": -0.1177,
"reward": 1.7338170409202576,
"reward_std": 0.5468832030892372,
"rewards/embodied_math": 0.10491071827709675,
"rewards/format_reward": 0.8392857611179352,
"rewards/tag_count_reward": 0.7896205633878708,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1251.4620666503906,
"epoch": 0.8287385129490392,
"grad_norm": 0.4205353558063507,
"kl": 1.279296875,
"learning_rate": 1.7219801893337073e-06,
"loss": -0.0776,
"reward": 1.8007813096046448,
"reward_std": 0.5819149166345596,
"rewards/embodied_math": 0.1785714402794838,
"rewards/format_reward": 0.832589328289032,
"rewards/tag_count_reward": 0.789620578289032,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 1217.8080749511719,
"epoch": 0.8320802005012531,
"grad_norm": 0.24345842003822327,
"kl": 2.443359375,
"learning_rate": 1.6570258536376083e-06,
"loss": -0.1328,
"reward": 1.6685268878936768,
"reward_std": 0.6357107758522034,
"rewards/embodied_math": 0.08035714738070965,
"rewards/format_reward": 0.7946428805589676,
"rewards/tag_count_reward": 0.7935268133878708,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 1205.1920471191406,
"epoch": 0.835421888053467,
"grad_norm": 0.17488045990467072,
"kl": 2.93359375,
"learning_rate": 1.5932094364120453e-06,
"loss": -0.1389,
"reward": 1.7232143580913544,
"reward_std": 0.6385822296142578,
"rewards/embodied_math": 0.1696428693830967,
"rewards/format_reward": 0.7723214775323868,
"rewards/tag_count_reward": 0.7812500298023224,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1205.0938110351562,
"epoch": 0.8387635756056808,
"grad_norm": 0.4325239956378937,
"kl": 3.138671875,
"learning_rate": 1.5305396417328755e-06,
"loss": -0.1453,
"reward": 1.6110491752624512,
"reward_std": 0.6495798975229263,
"rewards/embodied_math": 0.0580357164144516,
"rewards/format_reward": 0.7745535969734192,
"rewards/tag_count_reward": 0.7784598469734192,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1207.5023193359375,
"epoch": 0.8421052631578947,
"grad_norm": 0.29781222343444824,
"kl": 2.95703125,
"learning_rate": 1.469025017285335e-06,
"loss": -0.122,
"reward": 1.702008992433548,
"reward_std": 0.6345908641815186,
"rewards/embodied_math": 0.1473214365541935,
"rewards/format_reward": 0.7656250298023224,
"rewards/tag_count_reward": 0.7890625298023224,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 1228.7054138183594,
"epoch": 0.8454469507101086,
"grad_norm": 0.20634357631206512,
"kl": 1.56640625,
"learning_rate": 1.4086739531981886e-06,
"loss": -0.1182,
"reward": 1.6674107909202576,
"reward_std": 0.6024844646453857,
"rewards/embodied_math": 0.0602678619325161,
"rewards/format_reward": 0.8035714775323868,
"rewards/tag_count_reward": 0.8035714626312256,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 1224.6339721679688,
"epoch": 0.8487886382623224,
"grad_norm": 0.46803462505340576,
"kl": 2.296875,
"learning_rate": 1.3494946808993804e-06,
"loss": -0.1075,
"reward": 1.6858259439468384,
"reward_std": 0.6125819832086563,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.7834821790456772,
"rewards/tag_count_reward": 0.7952009290456772,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 1195.5380249023438,
"epoch": 0.8521303258145363,
"grad_norm": 0.1840384304523468,
"kl": 2.791015625,
"learning_rate": 1.291495271993337e-06,
"loss": -0.1668,
"reward": 1.8063617050647736,
"reward_std": 0.6036971360445023,
"rewards/embodied_math": 0.19866072200238705,
"rewards/format_reward": 0.7946428805589676,
"rewards/tag_count_reward": 0.813058078289032,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1177.13623046875,
"epoch": 0.8554720133667502,
"grad_norm": 0.2672232389450073,
"kl": 3.494140625,
"learning_rate": 1.234683637160048e-06,
"loss": -0.1723,
"reward": 1.6679688394069672,
"reward_std": 0.6499809473752975,
"rewards/embodied_math": 0.10714285797439516,
"rewards/format_reward": 0.7790178954601288,
"rewards/tag_count_reward": 0.7818080633878708,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1220.99560546875,
"epoch": 0.858813700918964,
"grad_norm": 0.33298298716545105,
"kl": 2.24609375,
"learning_rate": 1.1790675250761263e-06,
"loss": -0.1364,
"reward": 1.7343750894069672,
"reward_std": 0.5940727889537811,
"rewards/embodied_math": 0.1250000037252903,
"rewards/format_reward": 0.8058035969734192,
"rewards/tag_count_reward": 0.8035714626312256,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1222.9643249511719,
"epoch": 0.8621553884711779,
"grad_norm": 0.2807648181915283,
"kl": 2.150390625,
"learning_rate": 1.124654521357934e-06,
"loss": -0.124,
"reward": 1.8018974363803864,
"reward_std": 0.6039710342884064,
"rewards/embodied_math": 0.15625000977888703,
"rewards/format_reward": 0.8281250596046448,
"rewards/tag_count_reward": 0.8175223618745804,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 1242.5736999511719,
"epoch": 0.8654970760233918,
"grad_norm": 0.2357352077960968,
"kl": 1.890625,
"learning_rate": 1.0714520475269653e-06,
"loss": -0.091,
"reward": 1.8046875894069672,
"reward_std": 0.528480052947998,
"rewards/embodied_math": 0.10714285913854837,
"rewards/format_reward": 0.8593750447034836,
"rewards/tag_count_reward": 0.8381696790456772,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 1218.3772583007812,
"epoch": 0.8688387635756056,
"grad_norm": 0.35705995559692383,
"kl": 2.6884765625,
"learning_rate": 1.0194673599976134e-06,
"loss": -0.1249,
"reward": 1.6947545111179352,
"reward_std": 0.615173727273941,
"rewards/embodied_math": 0.10491071827709675,
"rewards/format_reward": 0.8013393133878708,
"rewards/tag_count_reward": 0.7885045111179352,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 1250.2745971679688,
"epoch": 0.8721804511278195,
"grad_norm": 0.2711421847343445,
"kl": 1.5419921875,
"learning_rate": 9.687075490874376e-07,
"loss": -0.085,
"reward": 1.6858260035514832,
"reward_std": 0.5885833278298378,
"rewards/embodied_math": 0.06473214481957257,
"rewards/format_reward": 0.808035746216774,
"rewards/tag_count_reward": 0.8130580633878708,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 1219.0625305175781,
"epoch": 0.8755221386800334,
"grad_norm": 0.5203860998153687,
"kl": 2.400390625,
"learning_rate": 9.191795380501133e-07,
"loss": -0.1352,
"reward": 1.6562500596046448,
"reward_std": 0.6262349635362625,
"rewards/embodied_math": 0.0446428582072258,
"rewards/format_reward": 0.8236607760190964,
"rewards/tag_count_reward": 0.7879464477300644,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 1232.0000610351562,
"epoch": 0.8788638262322472,
"grad_norm": 0.3880462944507599,
"kl": 1.912109375,
"learning_rate": 8.708900821311405e-07,
"loss": -0.1179,
"reward": 1.7483260035514832,
"reward_std": 0.5779529809951782,
"rewards/embodied_math": 0.06696428917348385,
"rewards/format_reward": 0.850446492433548,
"rewards/tag_count_reward": 0.8309152126312256,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 1225.6027526855469,
"epoch": 0.8822055137844611,
"grad_norm": 0.276309609413147,
"kl": 2.359375,
"learning_rate": 8.238457676464873e-07,
"loss": -0.1309,
"reward": 1.8102679252624512,
"reward_std": 0.5943205058574677,
"rewards/embodied_math": 0.1450892947614193,
"rewards/format_reward": 0.8459821790456772,
"rewards/tag_count_reward": 0.8191964626312256,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 1237.4420166015625,
"epoch": 0.885547201336675,
"grad_norm": 0.30299896001815796,
"kl": 1.912109375,
"learning_rate": 7.780530110842566e-07,
"loss": -0.101,
"reward": 1.8069196939468384,
"reward_std": 0.5637443214654922,
"rewards/embodied_math": 0.15178572107106447,
"rewards/format_reward": 0.830357164144516,
"rewards/tag_count_reward": 0.824776828289032,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1217.5067443847656,
"epoch": 0.8888888888888888,
"grad_norm": 0.45059701800346375,
"kl": 3.25,
"learning_rate": 7.335180582295387e-07,
"loss": -0.1389,
"reward": 1.7516741752624512,
"reward_std": 0.5634035468101501,
"rewards/embodied_math": 0.06250000186264515,
"rewards/format_reward": 0.8526786118745804,
"rewards/tag_count_reward": 0.8364955633878708,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1246.4553833007812,
"epoch": 0.8922305764411027,
"grad_norm": 0.3325289189815521,
"kl": 2.2841796875,
"learning_rate": 6.902469833125236e-07,
"loss": -0.0868,
"reward": 1.83147332072258,
"reward_std": 0.478071965277195,
"rewards/embodied_math": 0.09821428847499192,
"rewards/format_reward": 0.8839286267757416,
"rewards/tag_count_reward": 0.8493303954601288,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1242.0826416015625,
"epoch": 0.8955722639933166,
"grad_norm": 0.17567971348762512,
"kl": 1.86669921875,
"learning_rate": 6.482456881800248e-07,
"loss": -0.0873,
"reward": 1.784040242433548,
"reward_std": 0.5592886134982109,
"rewards/embodied_math": 0.1004464328289032,
"rewards/format_reward": 0.8437500596046448,
"rewards/tag_count_reward": 0.8398437947034836,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 1242.1361999511719,
"epoch": 0.8989139515455304,
"grad_norm": 0.33967000246047974,
"kl": 2.61474609375,
"learning_rate": 6.075199014905153e-07,
"loss": -0.094,
"reward": 1.8325894176959991,
"reward_std": 0.5444860756397247,
"rewards/embodied_math": 0.11830357811413705,
"rewards/format_reward": 0.861607164144516,
"rewards/tag_count_reward": 0.8526786267757416,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1256.3304138183594,
"epoch": 0.9022556390977443,
"grad_norm": 0.2733778655529022,
"kl": 1.634765625,
"learning_rate": 5.680751779327742e-07,
"loss": -0.0806,
"reward": 1.7968750894069672,
"reward_std": 0.5211016461253166,
"rewards/embodied_math": 0.0647321455180645,
"rewards/format_reward": 0.8772321790456772,
"rewards/tag_count_reward": 0.854910746216774,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1230.5335388183594,
"epoch": 0.9055973266499582,
"grad_norm": 0.21413490176200867,
"kl": 2.7421875,
"learning_rate": 5.299168974682789e-07,
"loss": -0.1133,
"reward": 1.8024554550647736,
"reward_std": 0.5250123292207718,
"rewards/embodied_math": 0.0625000037252903,
"rewards/format_reward": 0.886160746216774,
"rewards/tag_count_reward": 0.8537946939468384,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1242.419677734375,
"epoch": 0.908939014202172,
"grad_norm": 0.1741914302110672,
"kl": 2.4306640625,
"learning_rate": 4.930502645974122e-07,
"loss": -0.0974,
"reward": 1.8108260035514832,
"reward_std": 0.5831947550177574,
"rewards/embodied_math": 0.13616071757860482,
"rewards/format_reward": 0.8504464626312256,
"rewards/tag_count_reward": 0.8242187649011612,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1224.7545166015625,
"epoch": 0.9122807017543859,
"grad_norm": 0.20169594883918762,
"kl": 2.310546875,
"learning_rate": 4.574803076496148e-07,
"loss": -0.1158,
"reward": 1.7460938394069672,
"reward_std": 0.6089013665914536,
"rewards/embodied_math": 0.10267857206054032,
"rewards/format_reward": 0.8303571939468384,
"rewards/tag_count_reward": 0.813058078289032,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1227.2879943847656,
"epoch": 0.9156223893065998,
"grad_norm": 0.1624419391155243,
"kl": 2.3359375,
"learning_rate": 4.232118780975447e-07,
"loss": -0.1202,
"reward": 1.8231027722358704,
"reward_std": 0.5556938722729683,
"rewards/embodied_math": 0.12053571827709675,
"rewards/format_reward": 0.8616071939468384,
"rewards/tag_count_reward": 0.8409598469734192,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1242.0625305175781,
"epoch": 0.9189640768588136,
"grad_norm": 0.2469264715909958,
"kl": 2.6015625,
"learning_rate": 3.9024964989539227e-07,
"loss": -0.1026,
"reward": 1.7650670409202576,
"reward_std": 0.5289107412099838,
"rewards/embodied_math": 0.0602678619325161,
"rewards/format_reward": 0.8660714775323868,
"rewards/tag_count_reward": 0.8387277126312256,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1248.2746276855469,
"epoch": 0.9223057644110275,
"grad_norm": 0.154992938041687,
"kl": 1.62109375,
"learning_rate": 3.585981188413767e-07,
"loss": -0.0986,
"reward": 1.848772406578064,
"reward_std": 0.43934302031993866,
"rewards/embodied_math": 0.08928571990691125,
"rewards/format_reward": 0.895089328289032,
"rewards/tag_count_reward": 0.8643973618745804,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 1241.7991638183594,
"epoch": 0.9256474519632414,
"grad_norm": 0.1895643174648285,
"kl": 2.521484375,
"learning_rate": 3.2826160196455124e-07,
"loss": -0.0872,
"reward": 1.8766741752624512,
"reward_std": 0.5750466883182526,
"rewards/embodied_math": 0.17633929336443543,
"rewards/format_reward": 0.8549107760190964,
"rewards/tag_count_reward": 0.8454241454601288,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 1223.7031555175781,
"epoch": 0.9289891395154553,
"grad_norm": 0.3093890845775604,
"kl": 2.98828125,
"learning_rate": 2.9924423693600157e-07,
"loss": -0.1305,
"reward": 1.7991071939468384,
"reward_std": 0.6020410656929016,
"rewards/embodied_math": 0.1205357201397419,
"rewards/format_reward": 0.8437500447034836,
"rewards/tag_count_reward": 0.8348214775323868,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1252.821533203125,
"epoch": 0.9323308270676691,
"grad_norm": 0.14048472046852112,
"kl": 2.017578125,
"learning_rate": 2.7154998150449643e-07,
"loss": -0.0813,
"reward": 1.7885045409202576,
"reward_std": 0.5473662465810776,
"rewards/embodied_math": 0.07366071827709675,
"rewards/format_reward": 0.8660714775323868,
"rewards/tag_count_reward": 0.8487723767757416,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 1257.71435546875,
"epoch": 0.935672514619883,
"grad_norm": 0.25984320044517517,
"kl": 1.62060546875,
"learning_rate": 2.4518261295667255e-07,
"loss": -0.0721,
"reward": 1.7974331080913544,
"reward_std": 0.5135258659720421,
"rewards/embodied_math": 0.06026785937137902,
"rewards/format_reward": 0.8816964626312256,
"rewards/tag_count_reward": 0.8554687798023224,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1249.5870971679688,
"epoch": 0.9390142021720969,
"grad_norm": 0.15457814931869507,
"kl": 2.15234375,
"learning_rate": 2.201457276018526e-07,
"loss": -0.0906,
"reward": 1.8141742050647736,
"reward_std": 0.5247047990560532,
"rewards/embodied_math": 0.09821428847499192,
"rewards/format_reward": 0.863839328289032,
"rewards/tag_count_reward": 0.852120578289032,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1263.9152221679688,
"epoch": 0.9423558897243107,
"grad_norm": 0.1510641723871231,
"kl": 1.44287109375,
"learning_rate": 1.9644274028152944e-07,
"loss": -0.0561,
"reward": 1.8599331080913544,
"reward_std": 0.49419236928224564,
"rewards/embodied_math": 0.09375000698491931,
"rewards/format_reward": 0.8906250298023224,
"rewards/tag_count_reward": 0.8755580931901932,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 1247.6763916015625,
"epoch": 0.9456975772765246,
"grad_norm": 0.19849680364131927,
"kl": 1.9423828125,
"learning_rate": 1.740768839036111e-07,
"loss": -0.1009,
"reward": 1.8638394176959991,
"reward_std": 0.5082411393523216,
"rewards/embodied_math": 0.12946428847499192,
"rewards/format_reward": 0.8816964626312256,
"rewards/tag_count_reward": 0.8526786118745804,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1232.4442749023438,
"epoch": 0.9490392648287385,
"grad_norm": 0.16909794509410858,
"kl": 2.779296875,
"learning_rate": 1.5305120900146908e-07,
"loss": -0.1205,
"reward": 1.8030134737491608,
"reward_std": 0.5760641321539879,
"rewards/embodied_math": 0.10491071734577417,
"rewards/format_reward": 0.8638393133878708,
"rewards/tag_count_reward": 0.8342634439468384,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1238.4888916015625,
"epoch": 0.9523809523809523,
"grad_norm": 0.15841911733150482,
"kl": 2.1328125,
"learning_rate": 1.3336858331787993e-07,
"loss": -0.1172,
"reward": 1.8030134737491608,
"reward_std": 0.5178222879767418,
"rewards/embodied_math": 0.066964291036129,
"rewards/format_reward": 0.8772321790456772,
"rewards/tag_count_reward": 0.8588170111179352,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1237.6495971679688,
"epoch": 0.9557226399331662,
"grad_norm": 0.26993128657341003,
"kl": 2.83203125,
"learning_rate": 1.1503169141388049e-07,
"loss": -0.1095,
"reward": 1.794084906578064,
"reward_std": 0.5182835385203362,
"rewards/embodied_math": 0.03348214412108064,
"rewards/format_reward": 0.8861607611179352,
"rewards/tag_count_reward": 0.8744420111179352,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1231.6451416015625,
"epoch": 0.9590643274853801,
"grad_norm": 0.15350092947483063,
"kl": 1.8896484375,
"learning_rate": 9.804303430261175e-08,
"loss": -0.1118,
"reward": 1.8800224363803864,
"reward_std": 0.5567026510834694,
"rewards/embodied_math": 0.15848215040750802,
"rewards/format_reward": 0.8660714626312256,
"rewards/tag_count_reward": 0.8554687947034836,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1261.5045166015625,
"epoch": 0.9624060150375939,
"grad_norm": 0.1839500516653061,
"kl": 1.720703125,
"learning_rate": 8.240492910820407e-08,
"loss": -0.0612,
"reward": 1.8571429550647736,
"reward_std": 0.49432309716939926,
"rewards/embodied_math": 0.09598214738070965,
"rewards/format_reward": 0.8906250298023224,
"rewards/tag_count_reward": 0.8705357611179352,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1249.1339721679688,
"epoch": 0.9657477025898078,
"grad_norm": 0.23754307627677917,
"kl": 1.796875,
"learning_rate": 6.811950874973994e-08,
"loss": -0.0933,
"reward": 1.8593750894069672,
"reward_std": 0.4672791361808777,
"rewards/embodied_math": 0.07589286123402417,
"rewards/format_reward": 0.9017857611179352,
"rewards/tag_count_reward": 0.8816964626312256,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1219.2768249511719,
"epoch": 0.9690893901420217,
"grad_norm": 0.20686741173267365,
"kl": 3.244140625,
"learning_rate": 5.518872165033329e-08,
"loss": -0.1338,
"reward": 1.8242188394069672,
"reward_std": 0.5806285068392754,
"rewards/embodied_math": 0.1071428619325161,
"rewards/format_reward": 0.8571428805589676,
"rewards/tag_count_reward": 0.859933078289032,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1250.810302734375,
"epoch": 0.9724310776942355,
"grad_norm": 0.35922443866729736,
"kl": 2.0673828125,
"learning_rate": 4.361433147138772e-08,
"loss": -0.0769,
"reward": 1.7885045409202576,
"reward_std": 0.5405867323279381,
"rewards/embodied_math": 0.06696428917348385,
"rewards/format_reward": 0.8683036118745804,
"rewards/tag_count_reward": 0.8532366454601288,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1233.82373046875,
"epoch": 0.9757727652464494,
"grad_norm": 0.3444741368293762,
"kl": 2.0546875,
"learning_rate": 3.339791687203997e-08,
"loss": -0.1137,
"reward": 1.854352742433548,
"reward_std": 0.524741031229496,
"rewards/embodied_math": 0.129464291036129,
"rewards/format_reward": 0.8750000447034836,
"rewards/tag_count_reward": 0.8498884439468384,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1226.1250610351562,
"epoch": 0.9791144527986633,
"grad_norm": 0.2046412229537964,
"kl": 2.8984375,
"learning_rate": 2.4540871293845526e-08,
"loss": -0.1185,
"reward": 1.9469866752624512,
"reward_std": 0.57990662753582,
"rewards/embodied_math": 0.207589291036129,
"rewards/format_reward": 0.879464328289032,
"rewards/tag_count_reward": 0.8599330633878708,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1216.9888916015625,
"epoch": 0.9824561403508771,
"grad_norm": 0.4618666172027588,
"kl": 3.0234375,
"learning_rate": 1.7044402770725055e-08,
"loss": -0.1354,
"reward": 1.782366156578064,
"reward_std": 0.5473798364400864,
"rewards/embodied_math": 0.09821428777649999,
"rewards/format_reward": 0.8437500447034836,
"rewards/tag_count_reward": 0.8404018133878708,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 1245.3147888183594,
"epoch": 0.985797827903091,
"grad_norm": 0.21296104788780212,
"kl": 1.7734375,
"learning_rate": 1.0909533764194013e-08,
"loss": -0.0758,
"reward": 1.7204241752624512,
"reward_std": 0.5121402740478516,
"rewards/embodied_math": 0.029017859371379018,
"rewards/format_reward": 0.8526786267757416,
"rewards/tag_count_reward": 0.8387277275323868,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1222.90185546875,
"epoch": 0.9891395154553049,
"grad_norm": 0.2440209835767746,
"kl": 2.74609375,
"learning_rate": 6.137101023910852e-09,
"loss": -0.1229,
"reward": 1.8281250894069672,
"reward_std": 0.5333529859781265,
"rewards/embodied_math": 0.08482143236324191,
"rewards/format_reward": 0.870535746216774,
"rewards/tag_count_reward": 0.8727678954601288,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1258.2143249511719,
"epoch": 0.9924812030075187,
"grad_norm": 0.14400802552700043,
"kl": 1.7626953125,
"learning_rate": 2.7277554735449797e-09,
"loss": -0.0644,
"reward": 1.8348215222358704,
"reward_std": 0.5516516491770744,
"rewards/embodied_math": 0.12723214738070965,
"rewards/format_reward": 0.8549107313156128,
"rewards/tag_count_reward": 0.8526786118745804,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1253.5580444335938,
"epoch": 0.9958228905597326,
"grad_norm": 0.4155780076980591,
"kl": 2.40234375,
"learning_rate": 6.819621220033323e-10,
"loss": -0.08,
"reward": 1.825334906578064,
"reward_std": 0.5505645722150803,
"rewards/embodied_math": 0.11383929220028222,
"rewards/format_reward": 0.8660714626312256,
"rewards/tag_count_reward": 0.8454241305589676,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1240.5057678222656,
"epoch": 0.9991645781119465,
"grad_norm": 0.3007453680038452,
"kl": 2.044921875,
"learning_rate": 0.0,
"loss": -0.1029,
"reward": 1.8593750894069672,
"reward_std": 0.5401175245642662,
"rewards/embodied_math": 0.13839286053553224,
"rewards/format_reward": 0.8816964626312256,
"rewards/tag_count_reward": 0.8392857611179352,
"step": 299
},
{
"epoch": 0.9991645781119465,
"step": 299,
"total_flos": 0.0,
"train_loss": -0.013552246318095758,
"train_runtime": 19984.1821,
"train_samples_per_second": 0.419,
"train_steps_per_second": 0.015
}
],
"logging_steps": 1,
"max_steps": 299,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}