| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9991645781119465, | |
| "eval_steps": 500, | |
| "global_step": 299, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.7366333007812, | |
| "epoch": 0.003341687552213868, | |
| "grad_norm": 0.32587897777557373, | |
| "kl": 0.00015604496002197266, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 0.037, | |
| "reward": 0.3007812649011612, | |
| "reward_std": 0.3548884987831116, | |
| "rewards/embodied_math": 0.12946429196745157, | |
| "rewards/format_reward": 0.0290178582072258, | |
| "rewards/tag_count_reward": 0.14229911379516125, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 679.2701110839844, | |
| "epoch": 0.006683375104427736, | |
| "grad_norm": 0.37410375475883484, | |
| "kl": 0.00015926361083984375, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": 0.0165, | |
| "reward": 0.2979910857975483, | |
| "reward_std": 0.4222180098295212, | |
| "rewards/embodied_math": 0.08258928963914514, | |
| "rewards/format_reward": 0.03794643096625805, | |
| "rewards/tag_count_reward": 0.1774553619325161, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 752.3594055175781, | |
| "epoch": 0.010025062656641603, | |
| "grad_norm": 0.3303394913673401, | |
| "kl": 0.00016188621520996094, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0331, | |
| "reward": 0.2845982275903225, | |
| "reward_std": 0.37162595987319946, | |
| "rewards/embodied_math": 0.0959821455180645, | |
| "rewards/format_reward": 0.029017858440056443, | |
| "rewards/tag_count_reward": 0.1595982238650322, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.1518096923828, | |
| "epoch": 0.013366750208855471, | |
| "grad_norm": 0.42782607674598694, | |
| "kl": 0.00015664100646972656, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": -0.0001, | |
| "reward": 0.27120537497103214, | |
| "reward_std": 0.3699168190360069, | |
| "rewards/embodied_math": 0.10044643399305642, | |
| "rewards/format_reward": 0.03571428684517741, | |
| "rewards/tag_count_reward": 0.13504465110599995, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 703.1094055175781, | |
| "epoch": 0.01670843776106934, | |
| "grad_norm": 0.392333447933197, | |
| "kl": 0.000461578369140625, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0218, | |
| "reward": 0.254464291036129, | |
| "reward_std": 0.32935116440057755, | |
| "rewards/embodied_math": 0.08035714668221772, | |
| "rewards/format_reward": 0.026785714784637094, | |
| "rewards/tag_count_reward": 0.1473214328289032, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.1674270629883, | |
| "epoch": 0.020050125313283207, | |
| "grad_norm": 0.4020984172821045, | |
| "kl": 0.004108428955078125, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0477, | |
| "reward": 0.5329241380095482, | |
| "reward_std": 0.495420403778553, | |
| "rewards/embodied_math": 0.15401786682195961, | |
| "rewards/format_reward": 0.08258929010480642, | |
| "rewards/tag_count_reward": 0.2963169813156128, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.1049346923828, | |
| "epoch": 0.023391812865497075, | |
| "grad_norm": 118.67343139648438, | |
| "kl": 2.298828125, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.1594, | |
| "reward": 0.7059152126312256, | |
| "reward_std": 0.659125566482544, | |
| "rewards/embodied_math": 0.07589285937137902, | |
| "rewards/format_reward": 0.2031250111758709, | |
| "rewards/tag_count_reward": 0.426897332072258, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.7879943847656, | |
| "epoch": 0.026733500417710943, | |
| "grad_norm": 12.420914649963379, | |
| "kl": 0.4365234375, | |
| "learning_rate": 5.333333333333334e-06, | |
| "loss": 0.064, | |
| "reward": 0.6316964626312256, | |
| "reward_std": 0.6864016056060791, | |
| "rewards/embodied_math": 0.06250000419095159, | |
| "rewards/format_reward": 0.1651785783469677, | |
| "rewards/tag_count_reward": 0.4040178805589676, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.732177734375, | |
| "epoch": 0.03007518796992481, | |
| "grad_norm": 1.7345542907714844, | |
| "kl": 0.1015625, | |
| "learning_rate": 6e-06, | |
| "loss": 0.0845, | |
| "reward": 1.0446428954601288, | |
| "reward_std": 0.7974795699119568, | |
| "rewards/embodied_math": 0.2031250111758709, | |
| "rewards/format_reward": 0.290178582072258, | |
| "rewards/tag_count_reward": 0.5513393133878708, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 667.1272735595703, | |
| "epoch": 0.03341687552213868, | |
| "grad_norm": 0.9705411195755005, | |
| "kl": 0.056121826171875, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.0447, | |
| "reward": 0.8666295111179352, | |
| "reward_std": 0.7798839062452316, | |
| "rewards/embodied_math": 0.0803571455180645, | |
| "rewards/format_reward": 0.2991071566939354, | |
| "rewards/tag_count_reward": 0.4871651977300644, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 662.5736846923828, | |
| "epoch": 0.036758563074352546, | |
| "grad_norm": 1.020012378692627, | |
| "kl": 0.024505615234375, | |
| "learning_rate": 7.333333333333333e-06, | |
| "loss": 0.0359, | |
| "reward": 1.0256696790456772, | |
| "reward_std": 0.8458178341388702, | |
| "rewards/embodied_math": 0.13392857951112092, | |
| "rewards/format_reward": 0.3482142984867096, | |
| "rewards/tag_count_reward": 0.5435268059372902, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 493.3817138671875, | |
| "epoch": 0.040100250626566414, | |
| "grad_norm": 0.5114466547966003, | |
| "kl": 0.02532958984375, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.0494, | |
| "reward": 1.0876116454601288, | |
| "reward_std": 0.8503101617097855, | |
| "rewards/embodied_math": 0.12723214644938707, | |
| "rewards/format_reward": 0.4017857313156128, | |
| "rewards/tag_count_reward": 0.5585937649011612, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.1384201049805, | |
| "epoch": 0.04344193817878028, | |
| "grad_norm": 0.4396233856678009, | |
| "kl": 0.023834228515625, | |
| "learning_rate": 8.666666666666668e-06, | |
| "loss": 0.0118, | |
| "reward": 1.002232164144516, | |
| "reward_std": 0.8543792814016342, | |
| "rewards/embodied_math": 0.03125000069849193, | |
| "rewards/format_reward": 0.408482164144516, | |
| "rewards/tag_count_reward": 0.5625000149011612, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.654052734375, | |
| "epoch": 0.04678362573099415, | |
| "grad_norm": 0.39523178339004517, | |
| "kl": 0.026153564453125, | |
| "learning_rate": 9.333333333333334e-06, | |
| "loss": 0.0537, | |
| "reward": 1.4285715222358704, | |
| "reward_std": 0.7692538350820541, | |
| "rewards/embodied_math": 0.0781250037252903, | |
| "rewards/format_reward": 0.587053582072258, | |
| "rewards/tag_count_reward": 0.76339291036129, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 642.9152221679688, | |
| "epoch": 0.05012531328320802, | |
| "grad_norm": 0.48618102073669434, | |
| "kl": 0.8505859375, | |
| "learning_rate": 1e-05, | |
| "loss": -0.0001, | |
| "reward": 1.5468750596046448, | |
| "reward_std": 0.7278469949960709, | |
| "rewards/embodied_math": 0.10044643399305642, | |
| "rewards/format_reward": 0.627232164144516, | |
| "rewards/tag_count_reward": 0.8191964626312256, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.107177734375, | |
| "epoch": 0.053467000835421885, | |
| "grad_norm": 15783.640625, | |
| "kl": 300.05426025390625, | |
| "learning_rate": 1.0666666666666667e-05, | |
| "loss": 9.6148, | |
| "reward": 1.5273438096046448, | |
| "reward_std": 0.7241310179233551, | |
| "rewards/embodied_math": 0.08258929196745157, | |
| "rewards/format_reward": 0.6138392984867096, | |
| "rewards/tag_count_reward": 0.8309152126312256, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.6786041259766, | |
| "epoch": 0.05680868838763575, | |
| "grad_norm": 0.5250495076179504, | |
| "kl": 0.0806884765625, | |
| "learning_rate": 1.1333333333333334e-05, | |
| "loss": 0.0244, | |
| "reward": 1.5290178954601288, | |
| "reward_std": 0.6853032112121582, | |
| "rewards/embodied_math": 0.11383929033763707, | |
| "rewards/format_reward": 0.5781250149011612, | |
| "rewards/tag_count_reward": 0.8370536118745804, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.9754638671875, | |
| "epoch": 0.06015037593984962, | |
| "grad_norm": 0.6404133439064026, | |
| "kl": 0.1341552734375, | |
| "learning_rate": 1.2e-05, | |
| "loss": -0.016, | |
| "reward": 1.8063616454601288, | |
| "reward_std": 0.5558229237794876, | |
| "rewards/embodied_math": 0.1361607201397419, | |
| "rewards/format_reward": 0.7566964626312256, | |
| "rewards/tag_count_reward": 0.913504496216774, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 672.0781555175781, | |
| "epoch": 0.06349206349206349, | |
| "grad_norm": 3.5720558166503906, | |
| "kl": 0.458251953125, | |
| "learning_rate": 1.2666666666666667e-05, | |
| "loss": 0.064, | |
| "reward": 1.786272406578064, | |
| "reward_std": 0.5581437945365906, | |
| "rewards/embodied_math": 0.11383929220028222, | |
| "rewards/format_reward": 0.7678571790456772, | |
| "rewards/tag_count_reward": 0.9045759439468384, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 748.0268249511719, | |
| "epoch": 0.06683375104427736, | |
| "grad_norm": 0.5579437017440796, | |
| "kl": 0.0953369140625, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.0603, | |
| "reward": 1.7711087763309479, | |
| "reward_std": 0.6120292246341705, | |
| "rewards/embodied_math": 0.14722478203475475, | |
| "rewards/format_reward": 0.738839328289032, | |
| "rewards/tag_count_reward": 0.8850446790456772, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 756.9888610839844, | |
| "epoch": 0.07017543859649122, | |
| "grad_norm": 0.6377071142196655, | |
| "kl": 0.06231689453125, | |
| "learning_rate": 1.4e-05, | |
| "loss": 0.035, | |
| "reward": 1.7229181826114655, | |
| "reward_std": 0.6206964701414108, | |
| "rewards/embodied_math": 0.06834219070151448, | |
| "rewards/format_reward": 0.76339291036129, | |
| "rewards/tag_count_reward": 0.891183078289032, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.357177734375, | |
| "epoch": 0.07351712614870509, | |
| "grad_norm": 1.8024694919586182, | |
| "kl": 0.150146484375, | |
| "learning_rate": 1.4666666666666666e-05, | |
| "loss": 0.0766, | |
| "reward": 1.697922170162201, | |
| "reward_std": 0.6154973953962326, | |
| "rewards/embodied_math": 0.1008238852955401, | |
| "rewards/format_reward": 0.7633928805589676, | |
| "rewards/tag_count_reward": 0.8337053954601288, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 871.7009124755859, | |
| "epoch": 0.07685881370091896, | |
| "grad_norm": 0.9224417209625244, | |
| "kl": 0.120849609375, | |
| "learning_rate": 1.5333333333333334e-05, | |
| "loss": 0.1467, | |
| "reward": 1.4809691905975342, | |
| "reward_std": 0.7514003068208694, | |
| "rewards/embodied_math": 0.11880402453243732, | |
| "rewards/format_reward": 0.651785746216774, | |
| "rewards/tag_count_reward": 0.710379496216774, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.9911193847656, | |
| "epoch": 0.08020050125313283, | |
| "grad_norm": 2.6176247596740723, | |
| "kl": 0.22021484375, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.1362, | |
| "reward": 1.5853315591812134, | |
| "reward_std": 0.7496855407953262, | |
| "rewards/embodied_math": 0.13611273211427033, | |
| "rewards/format_reward": 0.6808036118745804, | |
| "rewards/tag_count_reward": 0.7684152126312256, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 881.4955596923828, | |
| "epoch": 0.0835421888053467, | |
| "grad_norm": 2947.326171875, | |
| "kl": 18.1572265625, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 1.0788, | |
| "reward": 1.377666562795639, | |
| "reward_std": 0.8003540188074112, | |
| "rewards/embodied_math": 0.1455236654728651, | |
| "rewards/format_reward": 0.5357142984867096, | |
| "rewards/tag_count_reward": 0.6964286118745804, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 879.4531707763672, | |
| "epoch": 0.08688387635756056, | |
| "grad_norm": 8.099024772644043, | |
| "kl": 1.2607421875, | |
| "learning_rate": 1.7333333333333336e-05, | |
| "loss": 0.1733, | |
| "reward": 1.0582136511802673, | |
| "reward_std": 0.7152388989925385, | |
| "rewards/embodied_math": 0.05765558429993689, | |
| "rewards/format_reward": 0.3370535895228386, | |
| "rewards/tag_count_reward": 0.6635045111179352, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 907.2210235595703, | |
| "epoch": 0.09022556390977443, | |
| "grad_norm": 7.911270618438721, | |
| "kl": 0.85986328125, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.1583, | |
| "reward": 0.9548228234052658, | |
| "reward_std": 0.6883680373430252, | |
| "rewards/embodied_math": 0.15459956042468548, | |
| "rewards/format_reward": 0.2165178656578064, | |
| "rewards/tag_count_reward": 0.5837053805589676, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 947.0982666015625, | |
| "epoch": 0.0935672514619883, | |
| "grad_norm": 5.554101467132568, | |
| "kl": 1.0263671875, | |
| "learning_rate": 1.866666666666667e-05, | |
| "loss": 0.1178, | |
| "reward": 0.704320564866066, | |
| "reward_std": 0.4823942184448242, | |
| "rewards/embodied_math": 0.18478929996490479, | |
| "rewards/format_reward": 0.07812500465661287, | |
| "rewards/tag_count_reward": 0.4414062649011612, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 791.7611999511719, | |
| "epoch": 0.09690893901420217, | |
| "grad_norm": 13.062017440795898, | |
| "kl": 3.9375, | |
| "learning_rate": 1.9333333333333333e-05, | |
| "loss": -0.0151, | |
| "reward": 0.3755580484867096, | |
| "reward_std": 0.33308642357587814, | |
| "rewards/embodied_math": 0.051339289639145136, | |
| "rewards/format_reward": 0.015625000931322575, | |
| "rewards/tag_count_reward": 0.3085937649011612, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.7611846923828, | |
| "epoch": 0.10025062656641603, | |
| "grad_norm": 864.9710693359375, | |
| "kl": 6.4609375, | |
| "learning_rate": 2e-05, | |
| "loss": -0.0239, | |
| "reward": 0.3041294813156128, | |
| "reward_std": 0.286144133657217, | |
| "rewards/embodied_math": 0.04910714505240321, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "rewards/tag_count_reward": 0.2505580447614193, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 440.2567138671875, | |
| "epoch": 0.1035923141186299, | |
| "grad_norm": 13.670504570007324, | |
| "kl": 5.5625, | |
| "learning_rate": 1.9999318037877998e-05, | |
| "loss": -0.5693, | |
| "reward": 0.203683041036129, | |
| "reward_std": 0.2521365247666836, | |
| "rewards/embodied_math": 0.06473214598372579, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.13671875558793545, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 516.3727951049805, | |
| "epoch": 0.10693400167084377, | |
| "grad_norm": 7.831124782562256, | |
| "kl": 2.88671875, | |
| "learning_rate": 1.9997272244526454e-05, | |
| "loss": -0.5291, | |
| "reward": 0.2455357238650322, | |
| "reward_std": 0.19646178930997849, | |
| "rewards/embodied_math": 0.12276786006987095, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.12276786379516125, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.5781402587891, | |
| "epoch": 0.11027568922305764, | |
| "grad_norm": 589.973388671875, | |
| "kl": 4.8671875, | |
| "learning_rate": 1.9993862898976092e-05, | |
| "loss": -0.5682, | |
| "reward": 0.1043526828289032, | |
| "reward_std": 0.15242009237408638, | |
| "rewards/embodied_math": 0.0066964291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.0976562537252903, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 356.7455520629883, | |
| "epoch": 0.1136173767752715, | |
| "grad_norm": 17.74056053161621, | |
| "kl": 3.6328125, | |
| "learning_rate": 1.998909046623581e-05, | |
| "loss": -0.5021, | |
| "reward": 0.255022332072258, | |
| "reward_std": 0.24578910320997238, | |
| "rewards/embodied_math": 0.11160714784637094, | |
| "rewards/format_reward": 0.017857143422588706, | |
| "rewards/tag_count_reward": 0.125558041036129, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 410.4955520629883, | |
| "epoch": 0.11695906432748537, | |
| "grad_norm": 9.428912162780762, | |
| "kl": 2.2265625, | |
| "learning_rate": 1.9982955597229275e-05, | |
| "loss": -0.6298, | |
| "reward": 0.2382812574505806, | |
| "reward_std": 0.22896768525242805, | |
| "rewards/embodied_math": 0.10937500488944352, | |
| "rewards/format_reward": 0.01785714365541935, | |
| "rewards/tag_count_reward": 0.1110491119325161, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.084846496582, | |
| "epoch": 0.12030075187969924, | |
| "grad_norm": 7.926641941070557, | |
| "kl": 1.931640625, | |
| "learning_rate": 1.9975459128706155e-05, | |
| "loss": -0.5518, | |
| "reward": 0.24944196827709675, | |
| "reward_std": 0.16391626000404358, | |
| "rewards/embodied_math": 0.1428571492433548, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "rewards/tag_count_reward": 0.102120541036129, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.2120819091797, | |
| "epoch": 0.12364243943191311, | |
| "grad_norm": 16.085020065307617, | |
| "kl": 3.2109375, | |
| "learning_rate": 1.996660208312796e-05, | |
| "loss": -0.6493, | |
| "reward": 0.08649553917348385, | |
| "reward_std": 0.12165977619588375, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.08649553917348385, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.4464569091797, | |
| "epoch": 0.12698412698412698, | |
| "grad_norm": 5.551716327667236, | |
| "kl": 2.5703125, | |
| "learning_rate": 1.9956385668528614e-05, | |
| "loss": -0.466, | |
| "reward": 0.0920758955180645, | |
| "reward_std": 0.12959402054548264, | |
| "rewards/embodied_math": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.08984375186264515, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.6674499511719, | |
| "epoch": 0.13032581453634084, | |
| "grad_norm": 6.6176934242248535, | |
| "kl": 2.4609375, | |
| "learning_rate": 1.9944811278349666e-05, | |
| "loss": -0.2619, | |
| "reward": 0.1785714402794838, | |
| "reward_std": 0.1333499550819397, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1071428619325161, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 898.5669860839844, | |
| "epoch": 0.1336675020885547, | |
| "grad_norm": 7.996533393859863, | |
| "kl": 3.1640625, | |
| "learning_rate": 1.9931880491250263e-05, | |
| "loss": -0.0767, | |
| "reward": 0.08593750186264515, | |
| "reward_std": 0.11663081869482994, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.08593750186264515, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.091552734375, | |
| "epoch": 0.13700918964076858, | |
| "grad_norm": 4.013873100280762, | |
| "kl": 3.97265625, | |
| "learning_rate": 1.9917595070891796e-05, | |
| "loss": 0.0529, | |
| "reward": 0.1668526865541935, | |
| "reward_std": 0.12196414358913898, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.0954241119325161, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1291.6339721679688, | |
| "epoch": 0.14035087719298245, | |
| "grad_norm": 5.315943241119385, | |
| "kl": 4.1796875, | |
| "learning_rate": 1.9901956965697387e-05, | |
| "loss": 0.1625, | |
| "reward": 0.1400669701397419, | |
| "reward_std": 0.12121919170022011, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.10435268469154835, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.1205444335938, | |
| "epoch": 0.14369256474519632, | |
| "grad_norm": 7.909899711608887, | |
| "kl": 8.6875, | |
| "learning_rate": 1.988496830858612e-05, | |
| "loss": 0.194, | |
| "reward": 0.11662947200238705, | |
| "reward_std": 0.11725078709423542, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.0809151828289032, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.14703425229741018, | |
| "grad_norm": 10.900046348571777, | |
| "kl": 8.90625, | |
| "learning_rate": 1.986663141668212e-05, | |
| "loss": 0.3553, | |
| "reward": 0.1389508992433548, | |
| "reward_std": 0.1102825254201889, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.0675223246216774, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.1227722167969, | |
| "epoch": 0.15037593984962405, | |
| "grad_norm": 20.010757446289062, | |
| "kl": 7.046875, | |
| "learning_rate": 1.9846948790998532e-05, | |
| "loss": 0.2807, | |
| "reward": 0.10267857741564512, | |
| "reward_std": 0.10911580175161362, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.06696429010480642, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.15371762740183792, | |
| "grad_norm": 6.929605960845947, | |
| "kl": 4.0703125, | |
| "learning_rate": 1.982592311609639e-05, | |
| "loss": 0.1625, | |
| "reward": 0.16015625931322575, | |
| "reward_std": 0.11942839249968529, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.08872768469154835, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.1570593149540518, | |
| "grad_norm": 3.139974594116211, | |
| "kl": 4.9453125, | |
| "learning_rate": 1.9803557259718472e-05, | |
| "loss": 0.1971, | |
| "reward": 0.1434151865541935, | |
| "reward_std": 0.12306286953389645, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1077008955180645, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.16040100250626566, | |
| "grad_norm": 12.176130294799805, | |
| "kl": 7.8125, | |
| "learning_rate": 1.977985427239815e-05, | |
| "loss": 0.311, | |
| "reward": 0.3018973357975483, | |
| "reward_std": 0.120529068633914, | |
| "rewards/embodied_math": 0.1428571492433548, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1590401865541935, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.16374269005847952, | |
| "grad_norm": 60.613792419433594, | |
| "kl": 3.265625, | |
| "learning_rate": 1.975481738704333e-05, | |
| "loss": 0.1304, | |
| "reward": 0.2890625149011612, | |
| "reward_std": 0.10946565680205822, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1819196529686451, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.1670843776106934, | |
| "grad_norm": 7.159120559692383, | |
| "kl": 2.24609375, | |
| "learning_rate": 1.9728450018495506e-05, | |
| "loss": 0.0896, | |
| "reward": 0.1780133992433548, | |
| "reward_std": 0.13202833384275436, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1422991119325161, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.17042606516290726, | |
| "grad_norm": 7.3434624671936035, | |
| "kl": 1.193359375, | |
| "learning_rate": 1.9700755763064e-05, | |
| "loss": 0.0476, | |
| "reward": 0.2114955484867096, | |
| "reward_std": 0.10858343727886677, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2114955484867096, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.17376775271512113, | |
| "grad_norm": 3.7159674167633057, | |
| "kl": 1.43359375, | |
| "learning_rate": 1.967173839803545e-05, | |
| "loss": 0.0572, | |
| "reward": 0.3286830522119999, | |
| "reward_std": 0.09371168166399002, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2215401865541935, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.177109440267335, | |
| "grad_norm": 3.8071751594543457, | |
| "kl": 2.0390625, | |
| "learning_rate": 1.9641401881158625e-05, | |
| "loss": 0.0813, | |
| "reward": 0.294642873108387, | |
| "reward_std": 0.08163666725158691, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2232142947614193, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.18045112781954886, | |
| "grad_norm": 73.75374603271484, | |
| "kl": 15.703125, | |
| "learning_rate": 1.960975035010461e-05, | |
| "loss": 0.6256, | |
| "reward": 0.2801339402794838, | |
| "reward_std": 0.13134469836950302, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1729910783469677, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.18379281537176273, | |
| "grad_norm": 23.148263931274414, | |
| "kl": 6.6640625, | |
| "learning_rate": 1.9576788121902457e-05, | |
| "loss": 0.2659, | |
| "reward": 0.1568080447614193, | |
| "reward_std": 0.14204410836100578, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1210937537252903, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.4419860839844, | |
| "epoch": 0.1871345029239766, | |
| "grad_norm": 4.164386749267578, | |
| "kl": 1.478515625, | |
| "learning_rate": 1.954251969235039e-05, | |
| "loss": 0.0545, | |
| "reward": 0.2148437574505806, | |
| "reward_std": 0.12886478379368782, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1768973283469677, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 5.979341506958008, | |
| "kl": 1.724609375, | |
| "learning_rate": 1.950694973540259e-05, | |
| "loss": 0.0687, | |
| "reward": 0.2181919775903225, | |
| "reward_std": 0.12196704186499119, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1824776865541935, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.1919860839844, | |
| "epoch": 0.19381787802840433, | |
| "grad_norm": 10.468803405761719, | |
| "kl": 5.8671875, | |
| "learning_rate": 1.9470083102531724e-05, | |
| "loss": 0.2288, | |
| "reward": 0.1841517947614193, | |
| "reward_std": 0.12486465089023113, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1841517947614193, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.1971595655806182, | |
| "grad_norm": 4.8235697746276855, | |
| "kl": 3.2734375, | |
| "learning_rate": 1.943192482206723e-05, | |
| "loss": 0.1307, | |
| "reward": 0.258928582072258, | |
| "reward_std": 0.10686362534761429, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1875000111758709, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.20050125313283207, | |
| "grad_norm": 4.113831520080566, | |
| "kl": 1.681640625, | |
| "learning_rate": 1.9392480098509488e-05, | |
| "loss": 0.067, | |
| "reward": 0.2282366193830967, | |
| "reward_std": 0.106992082670331, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.192522332072258, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.20384294068504594, | |
| "grad_norm": 30.52185821533203, | |
| "kl": 1.46484375, | |
| "learning_rate": 1.9351754311819978e-05, | |
| "loss": 0.0584, | |
| "reward": 0.3147321604192257, | |
| "reward_std": 0.10511562786996365, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2075892947614193, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2071846282372598, | |
| "grad_norm": 4.678550720214844, | |
| "kl": 1.7421875, | |
| "learning_rate": 1.9309753016687478e-05, | |
| "loss": 0.0695, | |
| "reward": 0.2410714440047741, | |
| "reward_std": 0.09354828484356403, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2053571529686451, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 2.103879928588867, | |
| "kl": 1.75390625, | |
| "learning_rate": 1.9266481941770463e-05, | |
| "loss": 0.0699, | |
| "reward": 0.2539062611758709, | |
| "reward_std": 0.08228430338203907, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2181919738650322, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.21386800334168754, | |
| "grad_norm": 9.774397850036621, | |
| "kl": 3.8046875, | |
| "learning_rate": 1.9221946988915745e-05, | |
| "loss": 0.1517, | |
| "reward": 0.3058035857975483, | |
| "reward_std": 0.09955826215445995, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1986607238650322, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1298.1830444335938, | |
| "epoch": 0.2172096908939014, | |
| "grad_norm": 13.15608024597168, | |
| "kl": 6.14453125, | |
| "learning_rate": 1.9176154232353513e-05, | |
| "loss": 0.2418, | |
| "reward": 0.1925223283469677, | |
| "reward_std": 0.10169229097664356, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1925223283469677, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.22055137844611528, | |
| "grad_norm": 13.001031875610352, | |
| "kl": 1.91796875, | |
| "learning_rate": 1.9129109917868863e-05, | |
| "loss": 0.0765, | |
| "reward": 0.2812500186264515, | |
| "reward_std": 0.10722276195883751, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2098214402794838, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.22389306599832914, | |
| "grad_norm": 20.15164566040039, | |
| "kl": 5.0703125, | |
| "learning_rate": 1.9080820461949886e-05, | |
| "loss": 0.2023, | |
| "reward": 0.2455357201397419, | |
| "reward_std": 0.09997103177011013, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2098214365541935, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.8348388671875, | |
| "epoch": 0.227234753550543, | |
| "grad_norm": 69.2470703125, | |
| "kl": 4.8671875, | |
| "learning_rate": 1.9031292450912565e-05, | |
| "loss": 0.19, | |
| "reward": 0.1975446566939354, | |
| "reward_std": 0.09716965816915035, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1975446566939354, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1112.46435546875, | |
| "epoch": 0.23057644110275688, | |
| "grad_norm": 1767.9007568359375, | |
| "kl": 12.921875, | |
| "learning_rate": 1.898053264000239e-05, | |
| "loss": 0.5162, | |
| "reward": 0.290736623108387, | |
| "reward_std": 0.12393852323293686, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2550223395228386, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1265.5714416503906, | |
| "epoch": 0.23391812865497075, | |
| "grad_norm": 457.6597595214844, | |
| "kl": 10.125, | |
| "learning_rate": 1.8928547952473037e-05, | |
| "loss": 0.4033, | |
| "reward": 0.2818080522119999, | |
| "reward_std": 0.17049719020724297, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.243861623108387, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.23725981620718462, | |
| "grad_norm": 23.20967674255371, | |
| "kl": 8.6171875, | |
| "learning_rate": 1.8875345478642067e-05, | |
| "loss": 0.3434, | |
| "reward": 0.2377232238650322, | |
| "reward_std": 0.13999010622501373, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1662946492433548, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.24060150375939848, | |
| "grad_norm": 4.324472904205322, | |
| "kl": 3.1328125, | |
| "learning_rate": 1.8820932474923874e-05, | |
| "loss": 0.1251, | |
| "reward": 0.2349330484867096, | |
| "reward_std": 0.13926412537693977, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1635044664144516, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.24394319131161235, | |
| "grad_norm": 15.31286334991455, | |
| "kl": 1.8125, | |
| "learning_rate": 1.8765316362839955e-05, | |
| "loss": 0.0722, | |
| "reward": 0.2338169738650322, | |
| "reward_std": 0.14043857902288437, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1981026865541935, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.24728487886382622, | |
| "grad_norm": 12.260504722595215, | |
| "kl": 1.69921875, | |
| "learning_rate": 1.8708504728006668e-05, | |
| "loss": 0.0677, | |
| "reward": 0.2544642984867096, | |
| "reward_std": 0.11344457603991032, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2187500149011612, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2506265664160401, | |
| "grad_norm": 8.76353645324707, | |
| "kl": 2.29296875, | |
| "learning_rate": 1.865050531910062e-05, | |
| "loss": 0.0917, | |
| "reward": 0.2287946566939354, | |
| "reward_std": 0.134426174685359, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2287946566939354, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.25396825396825395, | |
| "grad_norm": 11.521807670593262, | |
| "kl": 7.5703125, | |
| "learning_rate": 1.8591326046801813e-05, | |
| "loss": 0.302, | |
| "reward": 0.262834832072258, | |
| "reward_std": 0.13520535081624985, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2271205484867096, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2573099415204678, | |
| "grad_norm": 38.819496154785156, | |
| "kl": 13.5625, | |
| "learning_rate": 1.8530974982714667e-05, | |
| "loss": 0.5417, | |
| "reward": 0.2622768022119999, | |
| "reward_std": 0.1606529802083969, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2265625111758709, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1298.7522583007812, | |
| "epoch": 0.2606516290726817, | |
| "grad_norm": 10.04819393157959, | |
| "kl": 8.65625, | |
| "learning_rate": 1.8469460358267127e-05, | |
| "loss": 0.344, | |
| "reward": 0.3052455522119999, | |
| "reward_std": 0.17179123312234879, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2695312611758709, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.26399331662489556, | |
| "grad_norm": 8.36678409576416, | |
| "kl": 3.57421875, | |
| "learning_rate": 1.8406790563587958e-05, | |
| "loss": 0.1425, | |
| "reward": 0.2154017984867096, | |
| "reward_std": 0.12405722960829735, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2154017984867096, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2673350041771094, | |
| "grad_norm": 11.640973091125488, | |
| "kl": 2.359375, | |
| "learning_rate": 1.8342974146362397e-05, | |
| "loss": 0.094, | |
| "reward": 0.2254464365541935, | |
| "reward_std": 0.10200263559818268, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2254464365541935, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2706766917293233, | |
| "grad_norm": 10.865341186523438, | |
| "kl": 2.0625, | |
| "learning_rate": 1.8278019810666295e-05, | |
| "loss": 0.0823, | |
| "reward": 0.2589285857975483, | |
| "reward_std": 0.09501025639474392, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2232142947614193, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.27401837928153716, | |
| "grad_norm": 3.9547715187072754, | |
| "kl": 3.9140625, | |
| "learning_rate": 1.8211936415778986e-05, | |
| "loss": 0.156, | |
| "reward": 0.2678571529686451, | |
| "reward_std": 0.08793714456260204, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2321428656578064, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.27736006683375103, | |
| "grad_norm": 11.76015567779541, | |
| "kl": 5.71875, | |
| "learning_rate": 1.8144732974974902e-05, | |
| "loss": 0.2278, | |
| "reward": 0.3203125186264515, | |
| "reward_std": 0.09355076402425766, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2131696529686451, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2807017543859649, | |
| "grad_norm": 15.147577285766602, | |
| "kl": 6.8125, | |
| "learning_rate": 1.8076418654294267e-05, | |
| "loss": 0.2713, | |
| "reward": 0.3175223395228386, | |
| "reward_std": 0.09090832434594631, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2103794701397419, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.28404344193817876, | |
| "grad_norm": 2.8102500438690186, | |
| "kl": 1.943359375, | |
| "learning_rate": 1.80070027712929e-05, | |
| "loss": 0.0775, | |
| "reward": 0.2193080447614193, | |
| "reward_std": 0.06847428530454636, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2193080447614193, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.28738512949039263, | |
| "grad_norm": 3.1324164867401123, | |
| "kl": 2.29296875, | |
| "learning_rate": 1.793649479377137e-05, | |
| "loss": 0.0914, | |
| "reward": 0.2165178656578064, | |
| "reward_std": 0.07726325932890177, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2165178656578064, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.2907268170426065, | |
| "grad_norm": 5.72972297668457, | |
| "kl": 2.6796875, | |
| "learning_rate": 1.7864904338483676e-05, | |
| "loss": 0.1071, | |
| "reward": 0.2907366268336773, | |
| "reward_std": 0.07602266781032085, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2193080484867096, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1299.3995666503906, | |
| "epoch": 0.29406850459482037, | |
| "grad_norm": 7.760501861572266, | |
| "kl": 2.681640625, | |
| "learning_rate": 1.779224116982558e-05, | |
| "loss": 0.1047, | |
| "reward": 0.2968750074505806, | |
| "reward_std": 0.06183902267366648, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2254464402794838, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.29741019214703424, | |
| "grad_norm": 4.000362396240234, | |
| "kl": 1.8857421875, | |
| "learning_rate": 1.7718515198502816e-05, | |
| "loss": 0.0753, | |
| "reward": 0.298549123108387, | |
| "reward_std": 0.05841092485934496, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2271205484867096, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 5.868449687957764, | |
| "kl": 1.1416015625, | |
| "learning_rate": 1.7643736480179353e-05, | |
| "loss": 0.0455, | |
| "reward": 0.3085937649011612, | |
| "reward_std": 0.04876699857413769, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2371651902794838, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1294.4285888671875, | |
| "epoch": 0.30409356725146197, | |
| "grad_norm": 1.812473177909851, | |
| "kl": 1.982421875, | |
| "learning_rate": 1.7567915214105883e-05, | |
| "loss": 0.0715, | |
| "reward": 0.2689732238650322, | |
| "reward_std": 0.04237840510904789, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2332589365541935, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.0, | |
| "epoch": 0.30743525480367584, | |
| "grad_norm": 2.7611286640167236, | |
| "kl": 2.279296875, | |
| "learning_rate": 1.7491061741728703e-05, | |
| "loss": 0.0909, | |
| "reward": 0.2795759029686451, | |
| "reward_std": 0.02313897479325533, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2438616193830967, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1296.2187805175781, | |
| "epoch": 0.3107769423558897, | |
| "grad_norm": 8.670313835144043, | |
| "kl": 3.568359375, | |
| "learning_rate": 1.741318654527923e-05, | |
| "loss": 0.1315, | |
| "reward": 0.2745535895228386, | |
| "reward_std": 0.03872372629120946, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2388392984867096, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1285.0982360839844, | |
| "epoch": 0.3141186299081036, | |
| "grad_norm": 2.591120481491089, | |
| "kl": 2.2060546875, | |
| "learning_rate": 1.7334300246344318e-05, | |
| "loss": 0.0649, | |
| "reward": 0.2360491156578064, | |
| "reward_std": 0.05540083209052682, | |
| "rewards/embodied_math": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2315848283469677, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1284.8661193847656, | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 3.4341654777526855, | |
| "kl": 1.61328125, | |
| "learning_rate": 1.725441360441752e-05, | |
| "loss": 0.0342, | |
| "reward": 0.2751116156578064, | |
| "reward_std": 0.03906811494380236, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.239397332072258, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1272.2076416015625, | |
| "epoch": 0.3208020050125313, | |
| "grad_norm": 15.758692741394043, | |
| "kl": 5.55078125, | |
| "learning_rate": 1.7173537515431612e-05, | |
| "loss": 0.1119, | |
| "reward": 0.2343750111758709, | |
| "reward_std": 0.040907643269747496, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2343750111758709, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1245.6607666015625, | |
| "epoch": 0.3241436925647452, | |
| "grad_norm": 1.1037966012954712, | |
| "kl": 4.46484375, | |
| "learning_rate": 1.7091683010272447e-05, | |
| "loss": 0.0287, | |
| "reward": 0.2606026902794838, | |
| "reward_std": 0.07436614017933607, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2226562611758709, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1229.1741943359375, | |
| "epoch": 0.32748538011695905, | |
| "grad_norm": 10.468352317810059, | |
| "kl": 6.5390625, | |
| "learning_rate": 1.700886125327443e-05, | |
| "loss": 0.0752, | |
| "reward": 0.2907366156578064, | |
| "reward_std": 0.08076621871441603, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2193080484867096, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.9174499511719, | |
| "epoch": 0.3308270676691729, | |
| "grad_norm": 2.7917568683624268, | |
| "kl": 3.5390625, | |
| "learning_rate": 1.692508354069779e-05, | |
| "loss": -0.0414, | |
| "reward": 0.3063616268336773, | |
| "reward_std": 0.09452157653868198, | |
| "rewards/embodied_math": 0.08482143143191934, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2215401902794838, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1230.76123046875, | |
| "epoch": 0.3341687552213868, | |
| "grad_norm": 8.454556465148926, | |
| "kl": 6.4296875, | |
| "learning_rate": 1.684036129918786e-05, | |
| "loss": 0.03, | |
| "reward": 0.2924107313156128, | |
| "reward_std": 0.06796729285269976, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2209821566939354, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1260.3147888183594, | |
| "epoch": 0.33751044277360065, | |
| "grad_norm": 4.354994297027588, | |
| "kl": 5.12890625, | |
| "learning_rate": 1.6754706084216556e-05, | |
| "loss": 0.0526, | |
| "reward": 0.3370535857975483, | |
| "reward_std": 0.054657368920743465, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2299107238650322, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1223.2879943847656, | |
| "epoch": 0.3408521303258145, | |
| "grad_norm": 86.24649047851562, | |
| "kl": 896.05859375, | |
| "learning_rate": 1.6668129578506315e-05, | |
| "loss": 0.14, | |
| "reward": 0.2600446529686451, | |
| "reward_std": 0.06857628654688597, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2243303656578064, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1215.3170166015625, | |
| "epoch": 0.3441938178780284, | |
| "grad_norm": 8.539790153503418, | |
| "kl": 2.80078125, | |
| "learning_rate": 1.658064359043664e-05, | |
| "loss": -0.1095, | |
| "reward": 0.2901785857975483, | |
| "reward_std": 0.0779828317463398, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2187500111758709, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1111.7098388671875, | |
| "epoch": 0.34753550543024225, | |
| "grad_norm": 3.1039435863494873, | |
| "kl": 2.70703125, | |
| "learning_rate": 1.6492260052433554e-05, | |
| "loss": -0.2501, | |
| "reward": 0.3147321604192257, | |
| "reward_std": 0.0896985549479723, | |
| "rewards/embodied_math": 0.10937500488944352, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2053571529686451, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1124.6875610351562, | |
| "epoch": 0.3508771929824561, | |
| "grad_norm": 2.739511013031006, | |
| "kl": 4.7265625, | |
| "learning_rate": 1.6402991019342073e-05, | |
| "loss": -0.2298, | |
| "reward": 0.2806919813156128, | |
| "reward_std": 0.07699156645685434, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2092634029686451, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1193.6183471679688, | |
| "epoch": 0.35421888053467, | |
| "grad_norm": 4.411526203155518, | |
| "kl": 7.28125, | |
| "learning_rate": 1.631284866678205e-05, | |
| "loss": -0.1137, | |
| "reward": 0.3002232275903225, | |
| "reward_std": 0.07899147737771273, | |
| "rewards/embodied_math": 0.0758928619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2243303693830967, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1137.857177734375, | |
| "epoch": 0.35756056808688386, | |
| "grad_norm": 7.792470455169678, | |
| "kl": 7.45703125, | |
| "learning_rate": 1.6221845289487493e-05, | |
| "loss": -0.2043, | |
| "reward": 0.2220982275903225, | |
| "reward_std": 0.081636568531394, | |
| "rewards/embodied_math": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2176339402794838, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1163.8683776855469, | |
| "epoch": 0.3609022556390977, | |
| "grad_norm": 2.2603821754455566, | |
| "kl": 6.671875, | |
| "learning_rate": 1.6129993299629652e-05, | |
| "loss": -0.1924, | |
| "reward": 0.2890625074505806, | |
| "reward_std": 0.0779200978577137, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2176339402794838, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1173.3482971191406, | |
| "epoch": 0.3642439431913116, | |
| "grad_norm": 2.22993540763855, | |
| "kl": 3.044921875, | |
| "learning_rate": 1.6037305225124122e-05, | |
| "loss": -0.2054, | |
| "reward": 0.2952008992433548, | |
| "reward_std": 0.07529877964407206, | |
| "rewards/embodied_math": 0.0736607164144516, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2215401865541935, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1138.9710540771484, | |
| "epoch": 0.36758563074352546, | |
| "grad_norm": 1.692765474319458, | |
| "kl": 2.310546875, | |
| "learning_rate": 1.5943793707922086e-05, | |
| "loss": -0.2594, | |
| "reward": 0.2555803656578064, | |
| "reward_std": 0.08683113381266594, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2176339402794838, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1246.4576110839844, | |
| "epoch": 0.37092731829573933, | |
| "grad_norm": 1.1689916849136353, | |
| "kl": 1.72265625, | |
| "learning_rate": 1.5849471502286088e-05, | |
| "loss": -0.0851, | |
| "reward": 0.2455357275903225, | |
| "reward_std": 0.06550503056496382, | |
| "rewards/embodied_math": 0.008928572060540318, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2366071566939354, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1265.3482666015625, | |
| "epoch": 0.3742690058479532, | |
| "grad_norm": 0.7709049582481384, | |
| "kl": 1.814453125, | |
| "learning_rate": 1.5754351473050434e-05, | |
| "loss": -0.0438, | |
| "reward": 0.3197544701397419, | |
| "reward_std": 0.047460266621783376, | |
| "rewards/embodied_math": 0.07812500232830644, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2416294775903225, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.3304138183594, | |
| "epoch": 0.37761069340016706, | |
| "grad_norm": 16.40837287902832, | |
| "kl": 2.6015625, | |
| "learning_rate": 1.5658446593866517e-05, | |
| "loss": -0.053, | |
| "reward": 0.2823660857975483, | |
| "reward_std": 0.06565927620977163, | |
| "rewards/embodied_math": 0.04464285937137902, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2377232238650322, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.9532165527344, | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 1.9537498950958252, | |
| "kl": 3.77734375, | |
| "learning_rate": 1.5561769945433326e-05, | |
| "loss": -0.0841, | |
| "reward": 0.3549107313156128, | |
| "reward_std": 0.07631831709295511, | |
| "rewards/embodied_math": 0.10937500488944352, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2455357201397419, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.9554138183594, | |
| "epoch": 0.3842940685045948, | |
| "grad_norm": 5.672196865081787, | |
| "kl": 3.42578125, | |
| "learning_rate": 1.5464334713713312e-05, | |
| "loss": -0.0807, | |
| "reward": 0.2695312574505806, | |
| "reward_std": 0.11467710882425308, | |
| "rewards/embodied_math": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2650669738650322, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1029.2232513427734, | |
| "epoch": 0.38763575605680867, | |
| "grad_norm": 47.228187561035156, | |
| "kl": 6.15625, | |
| "learning_rate": 1.5366154188133962e-05, | |
| "loss": -0.0135, | |
| "reward": 0.4296875149011612, | |
| "reward_std": 0.18271929770708084, | |
| "rewards/embodied_math": 0.11160715157166123, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3180803656578064, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.0067443847656, | |
| "epoch": 0.39097744360902253, | |
| "grad_norm": 2.859192132949829, | |
| "kl": 5.3359375, | |
| "learning_rate": 1.526724175977518e-05, | |
| "loss": -0.0756, | |
| "reward": 0.415736623108387, | |
| "reward_std": 0.20538460090756416, | |
| "rewards/embodied_math": 0.05133928847499192, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3643973395228386, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1082.4531555175781, | |
| "epoch": 0.3943191311612364, | |
| "grad_norm": 5.876430988311768, | |
| "kl": 4.078125, | |
| "learning_rate": 1.5167610919542885e-05, | |
| "loss": -0.2275, | |
| "reward": 0.3699776977300644, | |
| "reward_std": 0.20139999315142632, | |
| "rewards/embodied_math": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3677455559372902, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1152.7322082519531, | |
| "epoch": 0.39766081871345027, | |
| "grad_norm": 2.9599621295928955, | |
| "kl": 4.1796875, | |
| "learning_rate": 1.5067275256328913e-05, | |
| "loss": -0.1624, | |
| "reward": 0.423549123108387, | |
| "reward_std": 0.18587969616055489, | |
| "rewards/embodied_math": 0.04017857206054032, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.383370541036129, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1142.29248046875, | |
| "epoch": 0.40100250626566414, | |
| "grad_norm": 3.511597156524658, | |
| "kl": 8.5234375, | |
| "learning_rate": 1.4966248455157622e-05, | |
| "loss": -0.1474, | |
| "reward": 0.4648437649011612, | |
| "reward_std": 0.16743408516049385, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3934151902794838, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1083.3371276855469, | |
| "epoch": 0.404344193817878, | |
| "grad_norm": 5.298196315765381, | |
| "kl": 9.5859375, | |
| "learning_rate": 1.4864544295319357e-05, | |
| "loss": -0.1957, | |
| "reward": 0.479352705180645, | |
| "reward_std": 0.17764294892549515, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3722098395228386, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1091.8326416015625, | |
| "epoch": 0.4076858813700919, | |
| "grad_norm": 1.1045624017715454, | |
| "kl": 4.83984375, | |
| "learning_rate": 1.4762176648491052e-05, | |
| "loss": -0.2606, | |
| "reward": 0.462611623108387, | |
| "reward_std": 0.17855771258473396, | |
| "rewards/embodied_math": 0.07366071757860482, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3889509066939354, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1094.7880249023438, | |
| "epoch": 0.41102756892230574, | |
| "grad_norm": 1.479848027229309, | |
| "kl": 3.59765625, | |
| "learning_rate": 1.4659159476844231e-05, | |
| "loss": -0.2782, | |
| "reward": 0.4408482313156128, | |
| "reward_std": 0.1699872985482216, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4029018059372902, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1031.8750457763672, | |
| "epoch": 0.4143692564745196, | |
| "grad_norm": 0.9205887913703918, | |
| "kl": 5.6875, | |
| "learning_rate": 1.4555506831140698e-05, | |
| "loss": -0.3515, | |
| "reward": 0.3850446566939354, | |
| "reward_std": 0.17519571259617805, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3850446566939354, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1061.9531860351562, | |
| "epoch": 0.4177109440267335, | |
| "grad_norm": 1.5503870248794556, | |
| "kl": 7.1640625, | |
| "learning_rate": 1.445123284881609e-05, | |
| "loss": -0.3, | |
| "reward": 0.4709821715950966, | |
| "reward_std": 0.19400348514318466, | |
| "rewards/embodied_math": 0.08035714644938707, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250149011612, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1092.4754943847656, | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 3.406398296356201, | |
| "kl": 6.828125, | |
| "learning_rate": 1.4346351752051663e-05, | |
| "loss": -0.2414, | |
| "reward": 0.4414062649011612, | |
| "reward_std": 0.1831374131143093, | |
| "rewards/embodied_math": 0.0424107164144516, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3989955559372902, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1072.4755249023438, | |
| "epoch": 0.4243943191311612, | |
| "grad_norm": 4.0440993309021, | |
| "kl": 6.4140625, | |
| "learning_rate": 1.4240877845834473e-05, | |
| "loss": -0.0842, | |
| "reward": 0.2572544775903225, | |
| "reward_std": 0.22713791206479073, | |
| "rewards/embodied_math": 0.044642857974395156, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2126116156578064, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1057.2053985595703, | |
| "epoch": 0.4277360066833751, | |
| "grad_norm": 17.8367862701416, | |
| "kl": 6.6875, | |
| "learning_rate": 1.4134825516006307e-05, | |
| "loss": 0.0196, | |
| "reward": 0.1813616119325161, | |
| "reward_std": 0.13534531742334366, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1456473283469677, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 982.388427734375, | |
| "epoch": 0.43107769423558895, | |
| "grad_norm": 269.2532958984375, | |
| "kl": 11.546875, | |
| "learning_rate": 1.4028209227301534e-05, | |
| "loss": 0.305, | |
| "reward": 0.1852678693830967, | |
| "reward_std": 0.11594511196017265, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1495535783469677, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.2768402099609, | |
| "epoch": 0.4344193817878028, | |
| "grad_norm": 13.602232933044434, | |
| "kl": 5.3359375, | |
| "learning_rate": 1.392104352137426e-05, | |
| "loss": 0.019, | |
| "reward": 0.152901791036129, | |
| "reward_std": 0.12491585314273834, | |
| "rewards/embodied_math": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1506696492433548, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.147346496582, | |
| "epoch": 0.4377610693400167, | |
| "grad_norm": 8.347877502441406, | |
| "kl": 8.546875, | |
| "learning_rate": 1.3813343014814926e-05, | |
| "loss": 0.3175, | |
| "reward": 0.11941964738070965, | |
| "reward_std": 0.12463105469942093, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.11941964738070965, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1125.8594055175781, | |
| "epoch": 0.44110275689223055, | |
| "grad_norm": 286.98577880859375, | |
| "kl": 3.12109375, | |
| "learning_rate": 1.3705122397156727e-05, | |
| "loss": 0.0132, | |
| "reward": 0.2070312611758709, | |
| "reward_std": 0.18715298175811768, | |
| "rewards/embodied_math": 0.011160714784637094, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1958705447614193, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1083.7411041259766, | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 19.709352493286133, | |
| "kl": 1.904296875, | |
| "learning_rate": 1.359639642887208e-05, | |
| "loss": -0.1734, | |
| "reward": 0.4101562649011612, | |
| "reward_std": 0.23037764430046082, | |
| "rewards/embodied_math": 0.113839291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2963169813156128, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1081.1719055175781, | |
| "epoch": 0.4477861319966583, | |
| "grad_norm": 2.971442461013794, | |
| "kl": 1.2763671875, | |
| "learning_rate": 1.3487179939359394e-05, | |
| "loss": -0.1503, | |
| "reward": 0.4107143059372902, | |
| "reward_std": 0.23758460581302643, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3035714402794838, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1124.0000305175781, | |
| "epoch": 0.45112781954887216, | |
| "grad_norm": 5.354092597961426, | |
| "kl": 1.5625, | |
| "learning_rate": 1.3377487824920459e-05, | |
| "loss": -0.1025, | |
| "reward": 0.3236607313156128, | |
| "reward_std": 0.2569897249341011, | |
| "rewards/embodied_math": 0.01785714365541935, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.305803582072258, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1038.9911499023438, | |
| "epoch": 0.454469507101086, | |
| "grad_norm": 12.347515106201172, | |
| "kl": 3.61328125, | |
| "learning_rate": 1.32673350467287e-05, | |
| "loss": -0.1563, | |
| "reward": 0.459821455180645, | |
| "reward_std": 0.28034432977437973, | |
| "rewards/embodied_math": 0.15401787031441927, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.305803582072258, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1030.7902374267578, | |
| "epoch": 0.4578111946532999, | |
| "grad_norm": 3.499429702758789, | |
| "kl": 3.16796875, | |
| "learning_rate": 1.3156736628788585e-05, | |
| "loss": -0.0581, | |
| "reward": 0.4006696566939354, | |
| "reward_std": 0.23839671164751053, | |
| "rewards/embodied_math": 0.1116071492433548, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2890625149011612, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 958.8616638183594, | |
| "epoch": 0.46115288220551376, | |
| "grad_norm": 5.829720973968506, | |
| "kl": 3.53125, | |
| "learning_rate": 1.304570765588648e-05, | |
| "loss": -0.0615, | |
| "reward": 0.2762276902794838, | |
| "reward_std": 0.22271455451846123, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2762276902794838, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1053.7389068603516, | |
| "epoch": 0.4644945697577276, | |
| "grad_norm": 4.121999263763428, | |
| "kl": 2.267578125, | |
| "learning_rate": 1.293426327153317e-05, | |
| "loss": -0.1534, | |
| "reward": 0.306919664144516, | |
| "reward_std": 0.21316596493124962, | |
| "rewards/embodied_math": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.306919664144516, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1174.8036193847656, | |
| "epoch": 0.4678362573099415, | |
| "grad_norm": 3.6972813606262207, | |
| "kl": 2.021484375, | |
| "learning_rate": 1.2822418675898428e-05, | |
| "loss": -0.0593, | |
| "reward": 0.5362723469734192, | |
| "reward_std": 0.215391855686903, | |
| "rewards/embodied_math": 0.1473214328289032, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.388950914144516, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1115.7031860351562, | |
| "epoch": 0.47117794486215536, | |
| "grad_norm": 1.3065992593765259, | |
| "kl": 2.73046875, | |
| "learning_rate": 1.2710189123737804e-05, | |
| "loss": -0.1325, | |
| "reward": 0.4441964477300644, | |
| "reward_std": 0.23979860544204712, | |
| "rewards/embodied_math": 0.08035714412108064, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3638392984867096, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1066.138427734375, | |
| "epoch": 0.47451963241436923, | |
| "grad_norm": 2.3809707164764404, | |
| "kl": 3.703125, | |
| "learning_rate": 1.2597589922312009e-05, | |
| "loss": -0.1879, | |
| "reward": 0.5602678880095482, | |
| "reward_std": 0.21912335231900215, | |
| "rewards/embodied_math": 0.1808035783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3794642984867096, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1052.279067993164, | |
| "epoch": 0.4778613199665831, | |
| "grad_norm": 2.187842607498169, | |
| "kl": 4.671875, | |
| "learning_rate": 1.2484636429299113e-05, | |
| "loss": -0.1914, | |
| "reward": 0.491071455180645, | |
| "reward_std": 0.2304515242576599, | |
| "rewards/embodied_math": 0.113839291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.377232164144516, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1100.8661193847656, | |
| "epoch": 0.48120300751879697, | |
| "grad_norm": 4.84511661529541, | |
| "kl": 3.40234375, | |
| "learning_rate": 1.2371344050699872e-05, | |
| "loss": -0.171, | |
| "reward": 0.4760044887661934, | |
| "reward_std": 0.21165388822555542, | |
| "rewards/embodied_math": 0.0781250037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3978794813156128, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1072.1763916015625, | |
| "epoch": 0.48454469507101083, | |
| "grad_norm": 0.7366332411766052, | |
| "kl": 3.59765625, | |
| "learning_rate": 1.2257728238736468e-05, | |
| "loss": -0.263, | |
| "reward": 0.506696455180645, | |
| "reward_std": 0.22405631840229034, | |
| "rewards/embodied_math": 0.1183035783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3883928805589676, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1110.9710388183594, | |
| "epoch": 0.4878863826232247, | |
| "grad_norm": 1.2310246229171753, | |
| "kl": 3.564453125, | |
| "learning_rate": 1.2143804489744941e-05, | |
| "loss": -0.2307, | |
| "reward": 0.5100446715950966, | |
| "reward_std": 0.17644834145903587, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4029018059372902, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1189.0067443847656, | |
| "epoch": 0.49122807017543857, | |
| "grad_norm": 1.516117811203003, | |
| "kl": 2.3984375, | |
| "learning_rate": 1.2029588342061623e-05, | |
| "loss": -0.1141, | |
| "reward": 0.4481026902794838, | |
| "reward_std": 0.1671721525490284, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4101562649011612, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1152.7366638183594, | |
| "epoch": 0.49456975772765244, | |
| "grad_norm": 1.7402369976043701, | |
| "kl": 3.30078125, | |
| "learning_rate": 1.1915095373903789e-05, | |
| "loss": -0.1621, | |
| "reward": 0.5251116380095482, | |
| "reward_std": 0.1959940418601036, | |
| "rewards/embodied_math": 0.145089291036129, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3800223469734192, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1159.7254943847656, | |
| "epoch": 0.4979114452798663, | |
| "grad_norm": 3.0580763816833496, | |
| "kl": 3.0390625, | |
| "learning_rate": 1.1800341201244954e-05, | |
| "loss": -0.1601, | |
| "reward": 0.424107164144516, | |
| "reward_std": 0.18527107685804367, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.388392873108387, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1173.6719055175781, | |
| "epoch": 0.5012531328320802, | |
| "grad_norm": 4.800704479217529, | |
| "kl": 2.416015625, | |
| "learning_rate": 1.1685341475684935e-05, | |
| "loss": -0.1069, | |
| "reward": 0.412388414144516, | |
| "reward_std": 0.18909326568245888, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.376674123108387, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1095.2187957763672, | |
| "epoch": 0.504594820384294, | |
| "grad_norm": 14.275434494018555, | |
| "kl": 4.41796875, | |
| "learning_rate": 1.15701118823151e-05, | |
| "loss": -0.1452, | |
| "reward": 0.4190848395228386, | |
| "reward_std": 0.20743219926953316, | |
| "rewards/embodied_math": 0.0736607164144516, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3454241156578064, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1133.5447082519531, | |
| "epoch": 0.5079365079365079, | |
| "grad_norm": 8.570171356201172, | |
| "kl": 4.53125, | |
| "learning_rate": 1.1454668137579059e-05, | |
| "loss": -0.1404, | |
| "reward": 0.483816996216774, | |
| "reward_std": 0.20593848451972008, | |
| "rewards/embodied_math": 0.10937500488944352, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3744419813156128, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1146.6652526855469, | |
| "epoch": 0.5112781954887218, | |
| "grad_norm": 23.04602813720703, | |
| "kl": 2.85546875, | |
| "learning_rate": 1.1339025987129033e-05, | |
| "loss": -0.1318, | |
| "reward": 0.4670759066939354, | |
| "reward_std": 0.2194213978946209, | |
| "rewards/embodied_math": 0.08482143143191934, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3822544813156128, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1122.2768249511719, | |
| "epoch": 0.5146198830409356, | |
| "grad_norm": 2.594608783721924, | |
| "kl": 2.466796875, | |
| "learning_rate": 1.1223201203678289e-05, | |
| "loss": -0.2029, | |
| "reward": 0.4525669887661934, | |
| "reward_std": 0.1940429024398327, | |
| "rewards/embodied_math": 0.0714285746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3811384066939354, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1132.1786499023438, | |
| "epoch": 0.5179615705931495, | |
| "grad_norm": 2.2763757705688477, | |
| "kl": 2.4453125, | |
| "learning_rate": 1.1107209584849845e-05, | |
| "loss": -0.1703, | |
| "reward": 0.4380580633878708, | |
| "reward_std": 0.2103111855685711, | |
| "rewards/embodied_math": 0.046875003492459655, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3911830484867096, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1131.6138610839844, | |
| "epoch": 0.5213032581453634, | |
| "grad_norm": 1.8114477396011353, | |
| "kl": 2.7421875, | |
| "learning_rate": 1.0991066951021802e-05, | |
| "loss": -0.1733, | |
| "reward": 0.5206473395228386, | |
| "reward_std": 0.19661833345890045, | |
| "rewards/embodied_math": 0.11607143143191934, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4045759066939354, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.8169860839844, | |
| "epoch": 0.5246449456975772, | |
| "grad_norm": 2.25466251373291, | |
| "kl": 1.982421875, | |
| "learning_rate": 1.0874789143169569e-05, | |
| "loss": -0.1559, | |
| "reward": 0.4871651902794838, | |
| "reward_std": 0.20492572709918022, | |
| "rewards/embodied_math": 0.08258928824216127, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4045759066939354, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1192.7478332519531, | |
| "epoch": 0.5279866332497911, | |
| "grad_norm": 1.1387535333633423, | |
| "kl": 1.3837890625, | |
| "learning_rate": 1.0758392020705258e-05, | |
| "loss": -0.1423, | |
| "reward": 0.5078125223517418, | |
| "reward_std": 0.1970166452229023, | |
| "rewards/embodied_math": 0.0937500037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4140625223517418, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1229.7813110351562, | |
| "epoch": 0.531328320802005, | |
| "grad_norm": 0.46632248163223267, | |
| "kl": 1.669921875, | |
| "learning_rate": 1.0641891459314598e-05, | |
| "loss": -0.0904, | |
| "reward": 0.4726562649011612, | |
| "reward_std": 0.19845933839678764, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4369419887661934, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1248.58935546875, | |
| "epoch": 0.5346700083542189, | |
| "grad_norm": 1.9050358533859253, | |
| "kl": 1.359375, | |
| "learning_rate": 1.0525303348791599e-05, | |
| "loss": -0.063, | |
| "reward": 0.5089285895228386, | |
| "reward_std": 0.19919028133153915, | |
| "rewards/embodied_math": 0.07142857555299997, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375000223517418, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.6652221679688, | |
| "epoch": 0.5380116959064327, | |
| "grad_norm": 39.550594329833984, | |
| "kl": 13.736328125, | |
| "learning_rate": 1.0408643590871312e-05, | |
| "loss": -0.0245, | |
| "reward": 0.5340401977300644, | |
| "reward_std": 0.21040942147374153, | |
| "rewards/embodied_math": 0.0937500037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4402901902794838, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.1406555175781, | |
| "epoch": 0.5413533834586466, | |
| "grad_norm": 2.6502327919006348, | |
| "kl": 2.689697265625, | |
| "learning_rate": 1.029192809706095e-05, | |
| "loss": -0.0896, | |
| "reward": 0.6010044887661934, | |
| "reward_std": 0.18520404025912285, | |
| "rewards/embodied_math": 0.1629464365541935, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4380580559372902, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1224.9911193847656, | |
| "epoch": 0.5446950710108605, | |
| "grad_norm": 5.2321457862854, | |
| "kl": 2.138671875, | |
| "learning_rate": 1.017517278646968e-05, | |
| "loss": -0.0588, | |
| "reward": 0.5859375298023224, | |
| "reward_std": 0.2227453775703907, | |
| "rewards/embodied_math": 0.12723214738070965, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4587053805589676, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1183.5982666015625, | |
| "epoch": 0.5480367585630743, | |
| "grad_norm": 1.016186237335205, | |
| "kl": 2.61328125, | |
| "learning_rate": 1.0058393583637376e-05, | |
| "loss": -0.1255, | |
| "reward": 0.490513414144516, | |
| "reward_std": 0.23803862929344177, | |
| "rewards/embodied_math": 0.049107146449387074, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4414062723517418, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.27685546875, | |
| "epoch": 0.5513784461152882, | |
| "grad_norm": 0.6124681830406189, | |
| "kl": 2.61328125, | |
| "learning_rate": 9.94160641636263e-06, | |
| "loss": -0.1277, | |
| "reward": 0.4882812723517418, | |
| "reward_std": 0.18272774666547775, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4503348395228386, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.8638916015625, | |
| "epoch": 0.5547201336675021, | |
| "grad_norm": 0.4987063705921173, | |
| "kl": 2.947265625, | |
| "learning_rate": 9.824827213530323e-06, | |
| "loss": -0.1437, | |
| "reward": 0.5530134215950966, | |
| "reward_std": 0.19792700558900833, | |
| "rewards/embodied_math": 0.04017857206054032, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5128348544239998, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1198.6294860839844, | |
| "epoch": 0.5580618212197159, | |
| "grad_norm": 0.23080599308013916, | |
| "kl": 2.048828125, | |
| "learning_rate": 9.708071902939053e-06, | |
| "loss": -0.1569, | |
| "reward": 0.5262277126312256, | |
| "reward_std": 0.22177628055214882, | |
| "rewards/embodied_math": 0.008928572060540318, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5172991305589676, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1178.0647583007812, | |
| "epoch": 0.5614035087719298, | |
| "grad_norm": 0.2520909309387207, | |
| "kl": 2.63671875, | |
| "learning_rate": 9.591356409128691e-06, | |
| "loss": -0.1652, | |
| "reward": 0.581473246216774, | |
| "reward_std": 0.27404002100229263, | |
| "rewards/embodied_math": 0.0558035746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.525669664144516, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1197.9822082519531, | |
| "epoch": 0.5647451963241437, | |
| "grad_norm": 0.18577782809734344, | |
| "kl": 2.50390625, | |
| "learning_rate": 9.474696651208406e-06, | |
| "loss": -0.1337, | |
| "reward": 0.612723246216774, | |
| "reward_std": 0.2510472200810909, | |
| "rewards/embodied_math": 0.06919643003493547, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5435268133878708, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1176.0268249511719, | |
| "epoch": 0.5680868838763575, | |
| "grad_norm": 0.1762024164199829, | |
| "kl": 2.5625, | |
| "learning_rate": 9.358108540685406e-06, | |
| "loss": -0.176, | |
| "reward": 0.5552455633878708, | |
| "reward_std": 0.2156192846596241, | |
| "rewards/embodied_math": 0.03794643026776612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5172991305589676, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1211.8772583007812, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.7674791216850281, | |
| "kl": 2.2890625, | |
| "learning_rate": 9.241607979294745e-06, | |
| "loss": -0.11, | |
| "reward": 0.6635044813156128, | |
| "reward_std": 0.27448395639657974, | |
| "rewards/embodied_math": 0.11830357694998384, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.545200914144516, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1206.4286193847656, | |
| "epoch": 0.5747702589807853, | |
| "grad_norm": 2.854034185409546, | |
| "kl": 2.60546875, | |
| "learning_rate": 9.125210856830433e-06, | |
| "loss": -0.1176, | |
| "reward": 0.6149553954601288, | |
| "reward_std": 0.24333922192454338, | |
| "rewards/embodied_math": 0.046875000931322575, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.5658482313156128, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1170.6652526855469, | |
| "epoch": 0.5781119465329991, | |
| "grad_norm": 1.9103987216949463, | |
| "kl": 3.7734375, | |
| "learning_rate": 9.0089330489782e-06, | |
| "loss": -0.1914, | |
| "reward": 0.7047991454601288, | |
| "reward_std": 0.22894323244690895, | |
| "rewards/embodied_math": 0.1517857201397419, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.553013414144516, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1144.2031860351562, | |
| "epoch": 0.581453634085213, | |
| "grad_norm": 1.3213469982147217, | |
| "kl": 3.146484375, | |
| "learning_rate": 8.892790415150161e-06, | |
| "loss": -0.1989, | |
| "reward": 0.6216517984867096, | |
| "reward_std": 0.25715912505984306, | |
| "rewards/embodied_math": 0.08258928963914514, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.5368303805589676, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1199.1563110351562, | |
| "epoch": 0.5847953216374269, | |
| "grad_norm": 1.474984884262085, | |
| "kl": 2.3408203125, | |
| "learning_rate": 8.776798796321715e-06, | |
| "loss": -0.0847, | |
| "reward": 0.711495578289032, | |
| "reward_std": 0.29080621898174286, | |
| "rewards/embodied_math": 0.12723214738070965, | |
| "rewards/format_reward": 0.008928572060540318, | |
| "rewards/tag_count_reward": 0.5753348469734192, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1154.560302734375, | |
| "epoch": 0.5881370091896407, | |
| "grad_norm": 0.5839706659317017, | |
| "kl": 3.8046875, | |
| "learning_rate": 8.66097401287097e-06, | |
| "loss": -0.2043, | |
| "reward": 0.7008928954601288, | |
| "reward_std": 0.24622002243995667, | |
| "rewards/embodied_math": 0.16741072200238705, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.533482164144516, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1153.7812805175781, | |
| "epoch": 0.5914786967418546, | |
| "grad_norm": 0.2968985438346863, | |
| "kl": 2.97265625, | |
| "learning_rate": 8.545331862420945e-06, | |
| "loss": -0.1888, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.26313546672463417, | |
| "rewards/embodied_math": 0.07812500488944352, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4843750149011612, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1164.2590026855469, | |
| "epoch": 0.5948203842940685, | |
| "grad_norm": 0.29861223697662354, | |
| "kl": 2.93359375, | |
| "learning_rate": 8.429888117684904e-06, | |
| "loss": -0.1956, | |
| "reward": 0.5970982313156128, | |
| "reward_std": 0.18818846344947815, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.489955373108387, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1210.9576721191406, | |
| "epoch": 0.5981620718462823, | |
| "grad_norm": 0.3574662506580353, | |
| "kl": 1.658203125, | |
| "learning_rate": 8.314658524315068e-06, | |
| "loss": -0.1221, | |
| "reward": 0.6149553805589676, | |
| "reward_std": 0.22238216921687126, | |
| "rewards/embodied_math": 0.12723214738070965, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4877232387661934, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1205.4754943847656, | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 0.3439498543739319, | |
| "kl": 1.9462890625, | |
| "learning_rate": 8.199658798755048e-06, | |
| "loss": -0.1349, | |
| "reward": 0.6132812798023224, | |
| "reward_std": 0.23523807525634766, | |
| "rewards/embodied_math": 0.09151786379516125, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5217634066939354, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1218.1607666015625, | |
| "epoch": 0.6048454469507101, | |
| "grad_norm": 0.32480064034461975, | |
| "kl": 1.892578125, | |
| "learning_rate": 8.084904626096211e-06, | |
| "loss": -0.1013, | |
| "reward": 0.5898437798023224, | |
| "reward_std": 0.2543032169342041, | |
| "rewards/embodied_math": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5273437798023224, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1199.7500305175781, | |
| "epoch": 0.6081871345029239, | |
| "grad_norm": 0.3220880925655365, | |
| "kl": 2.478515625, | |
| "learning_rate": 7.970411657938382e-06, | |
| "loss": -0.1372, | |
| "reward": 0.7087053805589676, | |
| "reward_std": 0.27387310564517975, | |
| "rewards/embodied_math": 0.12946429033763707, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "rewards/tag_count_reward": 0.5747767984867096, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1236.7522888183594, | |
| "epoch": 0.6115288220551378, | |
| "grad_norm": 1.8340140581130981, | |
| "kl": 9.578125, | |
| "learning_rate": 7.856195510255059e-06, | |
| "loss": -0.0849, | |
| "reward": 0.7823660969734192, | |
| "reward_std": 0.24205785244703293, | |
| "rewards/embodied_math": 0.17633928917348385, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.6037946492433548, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.3147888183594, | |
| "epoch": 0.6148705096073517, | |
| "grad_norm": 0.21496865153312683, | |
| "kl": 1.609375, | |
| "learning_rate": 7.742271761263537e-06, | |
| "loss": -0.0914, | |
| "reward": 0.7248884439468384, | |
| "reward_std": 0.2794860415160656, | |
| "rewards/embodied_math": 0.1272321455180645, | |
| "rewards/format_reward": 0.011160715017467737, | |
| "rewards/tag_count_reward": 0.5864955633878708, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1247.9107666015625, | |
| "epoch": 0.6182121971595655, | |
| "grad_norm": 0.20037777721881866, | |
| "kl": 1.1376953125, | |
| "learning_rate": 7.628655949300133e-06, | |
| "loss": -0.0576, | |
| "reward": 0.7533482611179352, | |
| "reward_std": 0.2807689905166626, | |
| "rewards/embodied_math": 0.14285714668221772, | |
| "rewards/format_reward": 0.01562500069849193, | |
| "rewards/tag_count_reward": 0.594866082072258, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1231.3705749511719, | |
| "epoch": 0.6215538847117794, | |
| "grad_norm": 2.5596811771392822, | |
| "kl": 1.927734375, | |
| "learning_rate": 7.51536357070089e-06, | |
| "loss": -0.0724, | |
| "reward": 0.6902902126312256, | |
| "reward_std": 0.37761393934488297, | |
| "rewards/embodied_math": 0.04017857415601611, | |
| "rewards/format_reward": 0.05357143096625805, | |
| "rewards/tag_count_reward": 0.5965402126312256, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1210.7098693847656, | |
| "epoch": 0.6248955722639933, | |
| "grad_norm": 0.7288307547569275, | |
| "kl": 2.509765625, | |
| "learning_rate": 7.402410077687994e-06, | |
| "loss": -0.0863, | |
| "reward": 0.9045759290456772, | |
| "reward_std": 0.4523550197482109, | |
| "rewards/embodied_math": 0.2187500111758709, | |
| "rewards/format_reward": 0.08705357555299997, | |
| "rewards/tag_count_reward": 0.5987723469734192, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1234.4465026855469, | |
| "epoch": 0.6282372598162071, | |
| "grad_norm": 0.8198930025100708, | |
| "kl": 2.033203125, | |
| "learning_rate": 7.2898108762622e-06, | |
| "loss": -0.0534, | |
| "reward": 0.8792080730199814, | |
| "reward_std": 0.5382220521569252, | |
| "rewards/embodied_math": 0.023181250551715493, | |
| "rewards/format_reward": 0.2165178693830967, | |
| "rewards/tag_count_reward": 0.6395089477300644, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1267.7723999023438, | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.2449118196964264, | |
| "kl": 1.06103515625, | |
| "learning_rate": 7.1775813241015755e-06, | |
| "loss": -0.0225, | |
| "reward": 1.1289062798023224, | |
| "reward_std": 0.6006747037172318, | |
| "rewards/embodied_math": 0.12053572200238705, | |
| "rewards/format_reward": 0.3660714402794838, | |
| "rewards/tag_count_reward": 0.6422991305589676, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1262.41748046875, | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 0.24977703392505646, | |
| "kl": 1.0390625, | |
| "learning_rate": 7.065736728466832e-06, | |
| "loss": -0.0361, | |
| "reward": 1.2664072215557098, | |
| "reward_std": 0.6163594722747803, | |
| "rewards/embodied_math": 0.11852768177050166, | |
| "rewards/format_reward": 0.5312500149011612, | |
| "rewards/tag_count_reward": 0.616629496216774, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1245.0402221679688, | |
| "epoch": 0.6382623224728488, | |
| "grad_norm": 0.4997389018535614, | |
| "kl": 0.80029296875, | |
| "learning_rate": 6.9542923441135226e-06, | |
| "loss": -0.0598, | |
| "reward": 1.3069196939468384, | |
| "reward_std": 0.595898911356926, | |
| "rewards/embodied_math": 0.1361607201397419, | |
| "rewards/format_reward": 0.5825893133878708, | |
| "rewards/tag_count_reward": 0.5881696492433548, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1229.638427734375, | |
| "epoch": 0.6416040100250626, | |
| "grad_norm": 0.4335998594760895, | |
| "kl": 1.01171875, | |
| "learning_rate": 6.843263371211415e-06, | |
| "loss": -0.0753, | |
| "reward": 1.2984784245491028, | |
| "reward_std": 0.6277011036872864, | |
| "rewards/embodied_math": 0.07470602937974036, | |
| "rewards/format_reward": 0.6294643133878708, | |
| "rewards/tag_count_reward": 0.5943080633878708, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1256.2232666015625, | |
| "epoch": 0.6449456975772765, | |
| "grad_norm": 0.9794358015060425, | |
| "kl": 0.58349609375, | |
| "learning_rate": 6.732664953271305e-06, | |
| "loss": -0.0549, | |
| "reward": 1.3554688096046448, | |
| "reward_std": 0.5859769731760025, | |
| "rewards/embodied_math": 0.0379464291036129, | |
| "rewards/format_reward": 0.6919643133878708, | |
| "rewards/tag_count_reward": 0.6255580633878708, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.2857666015625, | |
| "epoch": 0.6482873851294904, | |
| "grad_norm": 0.6576189994812012, | |
| "kl": 0.61962890625, | |
| "learning_rate": 6.622512175079543e-06, | |
| "loss": -0.0609, | |
| "reward": 1.4308036267757416, | |
| "reward_std": 0.6184276640415192, | |
| "rewards/embodied_math": 0.04687500186264515, | |
| "rewards/format_reward": 0.7254464477300644, | |
| "rewards/tag_count_reward": 0.658482164144516, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1265.3482666015625, | |
| "epoch": 0.6516290726817042, | |
| "grad_norm": 0.4225464165210724, | |
| "kl": 0.84716796875, | |
| "learning_rate": 6.512820060640608e-06, | |
| "loss": -0.0428, | |
| "reward": 1.4782367050647736, | |
| "reward_std": 0.5665149390697479, | |
| "rewards/embodied_math": 0.049107145285233855, | |
| "rewards/format_reward": 0.752232164144516, | |
| "rewards/tag_count_reward": 0.6768973469734192, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.8259582519531, | |
| "epoch": 0.6549707602339181, | |
| "grad_norm": 0.6224950551986694, | |
| "kl": 0.69775390625, | |
| "learning_rate": 6.403603571127921e-06, | |
| "loss": -0.0696, | |
| "reward": 1.6000739634037018, | |
| "reward_std": 0.5625407323241234, | |
| "rewards/embodied_math": 0.1296497832518071, | |
| "rewards/format_reward": 0.7924107611179352, | |
| "rewards/tag_count_reward": 0.6780134290456772, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1253.7812805175781, | |
| "epoch": 0.658312447786132, | |
| "grad_norm": 0.3629626929759979, | |
| "kl": 0.90234375, | |
| "learning_rate": 6.294877602843276e-06, | |
| "loss": -0.0731, | |
| "reward": 1.6656273305416107, | |
| "reward_std": 0.4725663438439369, | |
| "rewards/embodied_math": 0.08750223537208512, | |
| "rewards/format_reward": 0.8660714775323868, | |
| "rewards/tag_count_reward": 0.7120536118745804, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1272.6205749511719, | |
| "epoch": 0.6616541353383458, | |
| "grad_norm": 0.336520254611969, | |
| "kl": 1.02685546875, | |
| "learning_rate": 6.186656985185078e-06, | |
| "loss": -0.0326, | |
| "reward": 1.6741072237491608, | |
| "reward_std": 0.4543350860476494, | |
| "rewards/embodied_math": 0.06919643003493547, | |
| "rewards/format_reward": 0.8660714626312256, | |
| "rewards/tag_count_reward": 0.7388393133878708, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1280.7210388183594, | |
| "epoch": 0.6649958228905597, | |
| "grad_norm": 0.1395803987979889, | |
| "kl": 0.560546875, | |
| "learning_rate": 6.078956478625743e-06, | |
| "loss": -0.0183, | |
| "reward": 1.7165179550647736, | |
| "reward_std": 0.5158629938960075, | |
| "rewards/embodied_math": 0.13392857648432255, | |
| "rewards/format_reward": 0.8571428805589676, | |
| "rewards/tag_count_reward": 0.7254464626312256, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.1563110351562, | |
| "epoch": 0.6683375104427736, | |
| "grad_norm": 0.6481565833091736, | |
| "kl": 1.6591796875, | |
| "learning_rate": 5.971790772698467e-06, | |
| "loss": -0.0423, | |
| "reward": 1.717633992433548, | |
| "reward_std": 0.47620728611946106, | |
| "rewards/embodied_math": 0.1428571492433548, | |
| "rewards/format_reward": 0.8526786118745804, | |
| "rewards/tag_count_reward": 0.722098246216774, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.982177734375, | |
| "epoch": 0.6716791979949874, | |
| "grad_norm": 0.37232309579849243, | |
| "kl": 1.18359375, | |
| "learning_rate": 5.865174483993697e-06, | |
| "loss": -0.0609, | |
| "reward": 1.6735491752624512, | |
| "reward_std": 0.6033786237239838, | |
| "rewards/embodied_math": 0.13392857648432255, | |
| "rewards/format_reward": 0.8058036118745804, | |
| "rewards/tag_count_reward": 0.7338170111179352, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.3504943847656, | |
| "epoch": 0.6750208855472013, | |
| "grad_norm": 0.5929485559463501, | |
| "kl": 1.576171875, | |
| "learning_rate": 5.759122154165528e-06, | |
| "loss": -0.0636, | |
| "reward": 1.6389509439468384, | |
| "reward_std": 0.5772095322608948, | |
| "rewards/embodied_math": 0.10937500605359674, | |
| "rewards/format_reward": 0.8169643431901932, | |
| "rewards/tag_count_reward": 0.7126116454601288, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.3839721679688, | |
| "epoch": 0.6783625730994152, | |
| "grad_norm": 0.7571573853492737, | |
| "kl": 3.2109375, | |
| "learning_rate": 5.653648247948342e-06, | |
| "loss": -0.0644, | |
| "reward": 1.7477679550647736, | |
| "reward_std": 0.589836597442627, | |
| "rewards/embodied_math": 0.2031250074505806, | |
| "rewards/format_reward": 0.8191964626312256, | |
| "rewards/tag_count_reward": 0.7254464477300644, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.7701416015625, | |
| "epoch": 0.681704260651629, | |
| "grad_norm": 0.28568902611732483, | |
| "kl": 1.71484375, | |
| "learning_rate": 5.548767151183912e-06, | |
| "loss": -0.0616, | |
| "reward": 1.698102742433548, | |
| "reward_std": 0.5798378437757492, | |
| "rewards/embodied_math": 0.160714291036129, | |
| "rewards/format_reward": 0.7991071790456772, | |
| "rewards/tag_count_reward": 0.7382812798023224, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1244.33935546875, | |
| "epoch": 0.6850459482038429, | |
| "grad_norm": 0.21788516640663147, | |
| "kl": 1.83984375, | |
| "learning_rate": 5.444493168859304e-06, | |
| "loss": -0.0808, | |
| "reward": 1.5507813096046448, | |
| "reward_std": 0.5835923999547958, | |
| "rewards/embodied_math": 0.011160714784637094, | |
| "rewards/format_reward": 0.7991071790456772, | |
| "rewards/tag_count_reward": 0.7405134439468384, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1229.9822082519531, | |
| "epoch": 0.6883876357560568, | |
| "grad_norm": 0.21175533533096313, | |
| "kl": 1.4609375, | |
| "learning_rate": 5.340840523155769e-06, | |
| "loss": -0.092, | |
| "reward": 1.5630581080913544, | |
| "reward_std": 0.6021421700716019, | |
| "rewards/embodied_math": 0.0781250037252903, | |
| "rewards/format_reward": 0.7723214775323868, | |
| "rewards/tag_count_reward": 0.7126116305589676, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.7411193847656, | |
| "epoch": 0.6917293233082706, | |
| "grad_norm": 2.031179904937744, | |
| "kl": 1.998046875, | |
| "learning_rate": 5.237823351508953e-06, | |
| "loss": -0.0844, | |
| "reward": 1.6768973767757416, | |
| "reward_std": 0.5545762553811073, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.8191964626312256, | |
| "rewards/tag_count_reward": 0.750558078289032, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1244.7433776855469, | |
| "epoch": 0.6950710108604845, | |
| "grad_norm": 0.4047699570655823, | |
| "kl": 1.185546875, | |
| "learning_rate": 5.135455704680646e-06, | |
| "loss": -0.0906, | |
| "reward": 1.6255581378936768, | |
| "reward_std": 0.5186707675457001, | |
| "rewards/embodied_math": 0.026785715483129025, | |
| "rewards/format_reward": 0.8504464626312256, | |
| "rewards/tag_count_reward": 0.7483259290456772, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.3215026855469, | |
| "epoch": 0.6984126984126984, | |
| "grad_norm": 0.38765090703964233, | |
| "kl": 1.130859375, | |
| "learning_rate": 5.03375154484238e-06, | |
| "loss": -0.0635, | |
| "reward": 1.6350446939468384, | |
| "reward_std": 0.5284169614315033, | |
| "rewards/embodied_math": 0.0267857164144516, | |
| "rewards/format_reward": 0.8526786118745804, | |
| "rewards/tag_count_reward": 0.7555803954601288, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1262.1652221679688, | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 0.4771866798400879, | |
| "kl": 1.1787109375, | |
| "learning_rate": 4.932724743671089e-06, | |
| "loss": -0.0582, | |
| "reward": 1.6517857909202576, | |
| "reward_std": 0.5431385114789009, | |
| "rewards/embodied_math": 0.08482143259607255, | |
| "rewards/format_reward": 0.8325893431901932, | |
| "rewards/tag_count_reward": 0.7343750447034836, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1237.1361999511719, | |
| "epoch": 0.7050960735171261, | |
| "grad_norm": 0.4817124009132385, | |
| "kl": 1.5126953125, | |
| "learning_rate": 4.832389080457118e-06, | |
| "loss": -0.1114, | |
| "reward": 1.7349331080913544, | |
| "reward_std": 0.5682315081357956, | |
| "rewards/embodied_math": 0.1718750111758709, | |
| "rewards/format_reward": 0.8459821790456772, | |
| "rewards/tag_count_reward": 0.7170759290456772, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.9888916015625, | |
| "epoch": 0.70843776106934, | |
| "grad_norm": 0.2896386682987213, | |
| "kl": 1.2412109375, | |
| "learning_rate": 4.732758240224819e-06, | |
| "loss": -0.0715, | |
| "reward": 1.5792411267757416, | |
| "reward_std": 0.5823845863342285, | |
| "rewards/embodied_math": 0.03794642980210483, | |
| "rewards/format_reward": 0.8415178954601288, | |
| "rewards/tag_count_reward": 0.699776828289032, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1255.7433471679688, | |
| "epoch": 0.7117794486215538, | |
| "grad_norm": 0.2107439637184143, | |
| "kl": 1.296875, | |
| "learning_rate": 4.633845811866044e-06, | |
| "loss": -0.0616, | |
| "reward": 1.6333706080913544, | |
| "reward_std": 0.49481259286403656, | |
| "rewards/embodied_math": 0.0691964291036129, | |
| "rewards/format_reward": 0.8482143431901932, | |
| "rewards/tag_count_reward": 0.7159598469734192, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1255.4933471679688, | |
| "epoch": 0.7151211361737677, | |
| "grad_norm": 0.13716629147529602, | |
| "kl": 1.3798828125, | |
| "learning_rate": 4.535665286286691e-06, | |
| "loss": -0.0621, | |
| "reward": 1.5842634737491608, | |
| "reward_std": 0.49601756781339645, | |
| "rewards/embodied_math": 0.015625000931322575, | |
| "rewards/format_reward": 0.8459821939468384, | |
| "rewards/tag_count_reward": 0.7226562798023224, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1259.1473388671875, | |
| "epoch": 0.7184628237259816, | |
| "grad_norm": 0.2403624951839447, | |
| "kl": 1.541015625, | |
| "learning_rate": 4.438230054566678e-06, | |
| "loss": -0.0537, | |
| "reward": 1.6422991454601288, | |
| "reward_std": 0.5113217607140541, | |
| "rewards/embodied_math": 0.07589286053553224, | |
| "rewards/format_reward": 0.8593750447034836, | |
| "rewards/tag_count_reward": 0.7070312798023224, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.2745971679688, | |
| "epoch": 0.7218045112781954, | |
| "grad_norm": 0.29377609491348267, | |
| "kl": 1.101806640625, | |
| "learning_rate": 4.34155340613348e-06, | |
| "loss": -0.0438, | |
| "reward": 1.5970982909202576, | |
| "reward_std": 0.5749376714229584, | |
| "rewards/embodied_math": 0.07142857369035482, | |
| "rewards/format_reward": 0.8370536118745804, | |
| "rewards/tag_count_reward": 0.6886161118745804, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1258.6473693847656, | |
| "epoch": 0.7251461988304093, | |
| "grad_norm": 0.30173197388648987, | |
| "kl": 1.80078125, | |
| "learning_rate": 4.245648526949568e-06, | |
| "loss": -0.0573, | |
| "reward": 1.6188616752624512, | |
| "reward_std": 0.4864480197429657, | |
| "rewards/embodied_math": 0.0357142873108387, | |
| "rewards/format_reward": 0.8638393133878708, | |
| "rewards/tag_count_reward": 0.719308078289032, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.0201416015625, | |
| "epoch": 0.7284878863826232, | |
| "grad_norm": 0.1726818084716797, | |
| "kl": 2.19921875, | |
| "learning_rate": 4.150528497713911e-06, | |
| "loss": -0.0906, | |
| "reward": 1.6897322237491608, | |
| "reward_std": 0.5109899789094925, | |
| "rewards/embodied_math": 0.08705357578583062, | |
| "rewards/format_reward": 0.8839286118745804, | |
| "rewards/tag_count_reward": 0.7187500447034836, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1235.4732666015625, | |
| "epoch": 0.731829573934837, | |
| "grad_norm": 0.18735679984092712, | |
| "kl": 1.8564453125, | |
| "learning_rate": 4.056206292077916e-06, | |
| "loss": -0.0943, | |
| "reward": 1.797991156578064, | |
| "reward_std": 0.48637502640485764, | |
| "rewards/embodied_math": 0.2209821562282741, | |
| "rewards/format_reward": 0.863839328289032, | |
| "rewards/tag_count_reward": 0.7131696790456772, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.6250610351562, | |
| "epoch": 0.7351712614870509, | |
| "grad_norm": 0.27614831924438477, | |
| "kl": 2.23828125, | |
| "learning_rate": 3.96269477487588e-06, | |
| "loss": -0.0937, | |
| "reward": 1.7008929252624512, | |
| "reward_std": 0.5516977161169052, | |
| "rewards/embodied_math": 0.1562500074505806, | |
| "rewards/format_reward": 0.8415178954601288, | |
| "rewards/tag_count_reward": 0.7031250298023224, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1269.6986999511719, | |
| "epoch": 0.7385129490392648, | |
| "grad_norm": 0.1375630497932434, | |
| "kl": 0.9873046875, | |
| "learning_rate": 3.870006700370348e-06, | |
| "loss": -0.0571, | |
| "reward": 1.7500000596046448, | |
| "reward_std": 0.4832083433866501, | |
| "rewards/embodied_math": 0.12500000605359674, | |
| "rewards/format_reward": 0.8883928954601288, | |
| "rewards/tag_count_reward": 0.7366071790456772, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1253.5558776855469, | |
| "epoch": 0.7418546365914787, | |
| "grad_norm": 0.1821809709072113, | |
| "kl": 1.4970703125, | |
| "learning_rate": 3.778154710512513e-06, | |
| "loss": -0.0626, | |
| "reward": 1.7126117050647736, | |
| "reward_std": 0.5748995840549469, | |
| "rewards/embodied_math": 0.12276786239817739, | |
| "rewards/format_reward": 0.848214328289032, | |
| "rewards/tag_count_reward": 0.7416294813156128, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.3995971679688, | |
| "epoch": 0.7451963241436925, | |
| "grad_norm": 0.4636569619178772, | |
| "kl": 2.15625, | |
| "learning_rate": 3.687151333217952e-06, | |
| "loss": -0.108, | |
| "reward": 1.6289063096046448, | |
| "reward_std": 0.6172408014535904, | |
| "rewards/embodied_math": 0.09821428824216127, | |
| "rewards/format_reward": 0.81026791036129, | |
| "rewards/tag_count_reward": 0.7204241305589676, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1259.0357666015625, | |
| "epoch": 0.7485380116959064, | |
| "grad_norm": 1.434766173362732, | |
| "kl": 1.8427734375, | |
| "learning_rate": 3.597008980657929e-06, | |
| "loss": -0.0383, | |
| "reward": 1.7098215222358704, | |
| "reward_std": 0.5351722091436386, | |
| "rewards/embodied_math": 0.0959821455180645, | |
| "rewards/format_reward": 0.8660714775323868, | |
| "rewards/tag_count_reward": 0.7477678954601288, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1248.3951721191406, | |
| "epoch": 0.7518796992481203, | |
| "grad_norm": 0.17362454533576965, | |
| "kl": 1.62890625, | |
| "learning_rate": 3.5077399475664474e-06, | |
| "loss": -0.0778, | |
| "reward": 1.7622768580913544, | |
| "reward_std": 0.4962947890162468, | |
| "rewards/embodied_math": 0.13392857648432255, | |
| "rewards/format_reward": 0.85714291036129, | |
| "rewards/tag_count_reward": 0.7712053954601288, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1278.1116638183594, | |
| "epoch": 0.7552213868003341, | |
| "grad_norm": 0.1242133304476738, | |
| "kl": 0.9599609375, | |
| "learning_rate": 3.419356409563361e-06, | |
| "loss": -0.0309, | |
| "reward": 1.774553656578064, | |
| "reward_std": 0.5362864062190056, | |
| "rewards/embodied_math": 0.1517857201397419, | |
| "rewards/format_reward": 0.8504464626312256, | |
| "rewards/tag_count_reward": 0.7723214775323868, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1245.2143249511719, | |
| "epoch": 0.758563074352548, | |
| "grad_norm": 0.2596234083175659, | |
| "kl": 1.728515625, | |
| "learning_rate": 3.331870421493688e-06, | |
| "loss": -0.0695, | |
| "reward": 1.6724331080913544, | |
| "reward_std": 0.5726595818996429, | |
| "rewards/embodied_math": 0.08035714598372579, | |
| "rewards/format_reward": 0.8303571790456772, | |
| "rewards/tag_count_reward": 0.7617187798023224, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1236.2857666015625, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.6998271346092224, | |
| "kl": 1.978515625, | |
| "learning_rate": 3.245293915783444e-06, | |
| "loss": -0.0877, | |
| "reward": 1.684151828289032, | |
| "reward_std": 0.6410565972328186, | |
| "rewards/embodied_math": 0.11383929406292737, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "rewards/tag_count_reward": 0.7578125149011612, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.9911499023438, | |
| "epoch": 0.7652464494569757, | |
| "grad_norm": 0.3117403984069824, | |
| "kl": 1.6494140625, | |
| "learning_rate": 3.1596387008121386e-06, | |
| "loss": -0.0699, | |
| "reward": 1.7248884737491608, | |
| "reward_std": 0.6069164872169495, | |
| "rewards/embodied_math": 0.11607143469154835, | |
| "rewards/format_reward": 0.823660746216774, | |
| "rewards/tag_count_reward": 0.7851562798023224, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.8147888183594, | |
| "epoch": 0.7685881370091896, | |
| "grad_norm": 0.4184093177318573, | |
| "kl": 1.0625, | |
| "learning_rate": 3.074916459302211e-06, | |
| "loss": -0.0566, | |
| "reward": 1.8108259737491608, | |
| "reward_std": 0.5378059893846512, | |
| "rewards/embodied_math": 0.16517857927829027, | |
| "rewards/format_reward": 0.848214328289032, | |
| "rewards/tag_count_reward": 0.7974330633878708, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1239.7857666015625, | |
| "epoch": 0.7719298245614035, | |
| "grad_norm": 1.0474584102630615, | |
| "kl": 2.328125, | |
| "learning_rate": 2.9911387467255737e-06, | |
| "loss": -0.0811, | |
| "reward": 1.7176340222358704, | |
| "reward_std": 0.5790911167860031, | |
| "rewards/embodied_math": 0.10714286006987095, | |
| "rewards/format_reward": 0.8169643133878708, | |
| "rewards/tag_count_reward": 0.793526828289032, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1212.7254943847656, | |
| "epoch": 0.7752715121136173, | |
| "grad_norm": 0.3452969789505005, | |
| "kl": 1.599609375, | |
| "learning_rate": 2.9083169897275554e-06, | |
| "loss": -0.117, | |
| "reward": 1.6752232909202576, | |
| "reward_std": 0.5666229277849197, | |
| "rewards/embodied_math": 0.08035714668221772, | |
| "rewards/format_reward": 0.8236607611179352, | |
| "rewards/tag_count_reward": 0.7712053805589676, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1239.716552734375, | |
| "epoch": 0.7786131996658312, | |
| "grad_norm": 0.25167980790138245, | |
| "kl": 1.430419921875, | |
| "learning_rate": 2.82646248456839e-06, | |
| "loss": -0.0755, | |
| "reward": 1.72991082072258, | |
| "reward_std": 0.6188212782144547, | |
| "rewards/embodied_math": 0.12500000558793545, | |
| "rewards/format_reward": 0.8169643133878708, | |
| "rewards/tag_count_reward": 0.7879464775323868, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1248.3750610351562, | |
| "epoch": 0.7819548872180451, | |
| "grad_norm": 0.3067150413990021, | |
| "kl": 1.31640625, | |
| "learning_rate": 2.745586395582481e-06, | |
| "loss": -0.0627, | |
| "reward": 1.7762278020381927, | |
| "reward_std": 0.5344242751598358, | |
| "rewards/embodied_math": 0.10267857648432255, | |
| "rewards/format_reward": 0.879464328289032, | |
| "rewards/tag_count_reward": 0.7940848618745804, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1239.2902526855469, | |
| "epoch": 0.7852965747702589, | |
| "grad_norm": 0.3017171323299408, | |
| "kl": 2.639892578125, | |
| "learning_rate": 2.665699753655684e-06, | |
| "loss": -0.0628, | |
| "reward": 1.735491156578064, | |
| "reward_std": 0.5408925563097, | |
| "rewards/embodied_math": 0.07589285913854837, | |
| "rewards/format_reward": 0.863839328289032, | |
| "rewards/tag_count_reward": 0.7957589626312256, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1203.1183471679688, | |
| "epoch": 0.7886382623224728, | |
| "grad_norm": 1.1649154424667358, | |
| "kl": 2.390625, | |
| "learning_rate": 2.586813454720771e-06, | |
| "loss": -0.1158, | |
| "reward": 1.6802456080913544, | |
| "reward_std": 0.5190064385533333, | |
| "rewards/embodied_math": 0.03348214412108064, | |
| "rewards/format_reward": 0.8683036118745804, | |
| "rewards/tag_count_reward": 0.7784598469734192, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.2589721679688, | |
| "epoch": 0.7919799498746867, | |
| "grad_norm": 0.4925525188446045, | |
| "kl": 2.134765625, | |
| "learning_rate": 2.5089382582712995e-06, | |
| "loss": -0.0895, | |
| "reward": 1.7260045409202576, | |
| "reward_std": 0.4922590032219887, | |
| "rewards/embodied_math": 0.058035718742758036, | |
| "rewards/format_reward": 0.8839286118745804, | |
| "rewards/tag_count_reward": 0.7840402126312256, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1212.357177734375, | |
| "epoch": 0.7953216374269005, | |
| "grad_norm": 0.24148394167423248, | |
| "kl": 2.8828125, | |
| "learning_rate": 2.4320847858941167e-06, | |
| "loss": -0.1478, | |
| "reward": 1.7299107909202576, | |
| "reward_std": 0.5158706456422806, | |
| "rewards/embodied_math": 0.11160714668221772, | |
| "rewards/format_reward": 0.870535746216774, | |
| "rewards/tag_count_reward": 0.7477678954601288, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.7500305175781, | |
| "epoch": 0.7986633249791144, | |
| "grad_norm": 0.4909496307373047, | |
| "kl": 2.439453125, | |
| "learning_rate": 2.3562635198206476e-06, | |
| "loss": -0.1085, | |
| "reward": 1.739397406578064, | |
| "reward_std": 0.53496253490448, | |
| "rewards/embodied_math": 0.1004464328289032, | |
| "rewards/format_reward": 0.8727678954601288, | |
| "rewards/tag_count_reward": 0.766183078289032, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1226.5000610351562, | |
| "epoch": 0.8020050125313283, | |
| "grad_norm": 0.1784716248512268, | |
| "kl": 2.171875, | |
| "learning_rate": 2.281484801497186e-06, | |
| "loss": -0.1241, | |
| "reward": 1.6556920111179352, | |
| "reward_std": 0.5279371589422226, | |
| "rewards/embodied_math": 0.013392857741564512, | |
| "rewards/format_reward": 0.877232164144516, | |
| "rewards/tag_count_reward": 0.7650670111179352, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1210.8616638183594, | |
| "epoch": 0.8053467000835421, | |
| "grad_norm": 0.5043416619300842, | |
| "kl": 3.140625, | |
| "learning_rate": 2.2077588301744234e-06, | |
| "loss": -0.1453, | |
| "reward": 1.6947545409202576, | |
| "reward_std": 0.551237165927887, | |
| "rewards/embodied_math": 0.08928571827709675, | |
| "rewards/format_reward": 0.8593750298023224, | |
| "rewards/tag_count_reward": 0.7460937798023224, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1213.3147583007812, | |
| "epoch": 0.808688387635756, | |
| "grad_norm": 0.8992521166801453, | |
| "kl": 3.134765625, | |
| "learning_rate": 2.1350956615163254e-06, | |
| "loss": -0.1302, | |
| "reward": 1.7220982909202576, | |
| "reward_std": 0.5575926005840302, | |
| "rewards/embodied_math": 0.12276786006987095, | |
| "rewards/format_reward": 0.854910746216774, | |
| "rewards/tag_count_reward": 0.744419664144516, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1236.669677734375, | |
| "epoch": 0.8120300751879699, | |
| "grad_norm": 0.33429309725761414, | |
| "kl": 1.98828125, | |
| "learning_rate": 2.0635052062286323e-06, | |
| "loss": -0.0843, | |
| "reward": 1.7075893580913544, | |
| "reward_std": 0.5474491640925407, | |
| "rewards/embodied_math": 0.0915178619325161, | |
| "rewards/format_reward": 0.8504464775323868, | |
| "rewards/tag_count_reward": 0.7656250298023224, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.6719360351562, | |
| "epoch": 0.8153717627401837, | |
| "grad_norm": 0.5671308040618896, | |
| "kl": 2.73046875, | |
| "learning_rate": 1.992997228707103e-06, | |
| "loss": -0.0982, | |
| "reward": 1.6908482611179352, | |
| "reward_std": 0.5311977565288544, | |
| "rewards/embodied_math": 0.05133928940631449, | |
| "rewards/format_reward": 0.8660714775323868, | |
| "rewards/tag_count_reward": 0.7734375298023224, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1212.2991333007812, | |
| "epoch": 0.8187134502923976, | |
| "grad_norm": 0.2072792798280716, | |
| "kl": 2.033203125, | |
| "learning_rate": 1.923581345705736e-06, | |
| "loss": -0.1452, | |
| "reward": 1.6718750894069672, | |
| "reward_std": 0.5443079173564911, | |
| "rewards/embodied_math": 0.0691964328289032, | |
| "rewards/format_reward": 0.8392857611179352, | |
| "rewards/tag_count_reward": 0.7633928954601288, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1202.3460388183594, | |
| "epoch": 0.8220551378446115, | |
| "grad_norm": 0.17846164107322693, | |
| "kl": 2.375, | |
| "learning_rate": 1.8552670250251003e-06, | |
| "loss": -0.144, | |
| "reward": 1.8275670111179352, | |
| "reward_std": 0.5471851527690887, | |
| "rewards/embodied_math": 0.2209821492433548, | |
| "rewards/format_reward": 0.839285746216774, | |
| "rewards/tag_count_reward": 0.7672991454601288, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1229.0536193847656, | |
| "epoch": 0.8253968253968254, | |
| "grad_norm": 0.1999313086271286, | |
| "kl": 1.751953125, | |
| "learning_rate": 1.788063584221017e-06, | |
| "loss": -0.1177, | |
| "reward": 1.7338170409202576, | |
| "reward_std": 0.5468832030892372, | |
| "rewards/embodied_math": 0.10491071827709675, | |
| "rewards/format_reward": 0.8392857611179352, | |
| "rewards/tag_count_reward": 0.7896205633878708, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.4620666503906, | |
| "epoch": 0.8287385129490392, | |
| "grad_norm": 0.4205353558063507, | |
| "kl": 1.279296875, | |
| "learning_rate": 1.7219801893337073e-06, | |
| "loss": -0.0776, | |
| "reward": 1.8007813096046448, | |
| "reward_std": 0.5819149166345596, | |
| "rewards/embodied_math": 0.1785714402794838, | |
| "rewards/format_reward": 0.832589328289032, | |
| "rewards/tag_count_reward": 0.789620578289032, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1217.8080749511719, | |
| "epoch": 0.8320802005012531, | |
| "grad_norm": 0.24345842003822327, | |
| "kl": 2.443359375, | |
| "learning_rate": 1.6570258536376083e-06, | |
| "loss": -0.1328, | |
| "reward": 1.6685268878936768, | |
| "reward_std": 0.6357107758522034, | |
| "rewards/embodied_math": 0.08035714738070965, | |
| "rewards/format_reward": 0.7946428805589676, | |
| "rewards/tag_count_reward": 0.7935268133878708, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1205.1920471191406, | |
| "epoch": 0.835421888053467, | |
| "grad_norm": 0.17488045990467072, | |
| "kl": 2.93359375, | |
| "learning_rate": 1.5932094364120453e-06, | |
| "loss": -0.1389, | |
| "reward": 1.7232143580913544, | |
| "reward_std": 0.6385822296142578, | |
| "rewards/embodied_math": 0.1696428693830967, | |
| "rewards/format_reward": 0.7723214775323868, | |
| "rewards/tag_count_reward": 0.7812500298023224, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1205.0938110351562, | |
| "epoch": 0.8387635756056808, | |
| "grad_norm": 0.4325239956378937, | |
| "kl": 3.138671875, | |
| "learning_rate": 1.5305396417328755e-06, | |
| "loss": -0.1453, | |
| "reward": 1.6110491752624512, | |
| "reward_std": 0.6495798975229263, | |
| "rewards/embodied_math": 0.0580357164144516, | |
| "rewards/format_reward": 0.7745535969734192, | |
| "rewards/tag_count_reward": 0.7784598469734192, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1207.5023193359375, | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.29781222343444824, | |
| "kl": 2.95703125, | |
| "learning_rate": 1.469025017285335e-06, | |
| "loss": -0.122, | |
| "reward": 1.702008992433548, | |
| "reward_std": 0.6345908641815186, | |
| "rewards/embodied_math": 0.1473214365541935, | |
| "rewards/format_reward": 0.7656250298023224, | |
| "rewards/tag_count_reward": 0.7890625298023224, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.7054138183594, | |
| "epoch": 0.8454469507101086, | |
| "grad_norm": 0.20634357631206512, | |
| "kl": 1.56640625, | |
| "learning_rate": 1.4086739531981886e-06, | |
| "loss": -0.1182, | |
| "reward": 1.6674107909202576, | |
| "reward_std": 0.6024844646453857, | |
| "rewards/embodied_math": 0.0602678619325161, | |
| "rewards/format_reward": 0.8035714775323868, | |
| "rewards/tag_count_reward": 0.8035714626312256, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1224.6339721679688, | |
| "epoch": 0.8487886382623224, | |
| "grad_norm": 0.46803462505340576, | |
| "kl": 2.296875, | |
| "learning_rate": 1.3494946808993804e-06, | |
| "loss": -0.1075, | |
| "reward": 1.6858259439468384, | |
| "reward_std": 0.6125819832086563, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.7834821790456772, | |
| "rewards/tag_count_reward": 0.7952009290456772, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1195.5380249023438, | |
| "epoch": 0.8521303258145363, | |
| "grad_norm": 0.1840384304523468, | |
| "kl": 2.791015625, | |
| "learning_rate": 1.291495271993337e-06, | |
| "loss": -0.1668, | |
| "reward": 1.8063617050647736, | |
| "reward_std": 0.6036971360445023, | |
| "rewards/embodied_math": 0.19866072200238705, | |
| "rewards/format_reward": 0.7946428805589676, | |
| "rewards/tag_count_reward": 0.813058078289032, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1177.13623046875, | |
| "epoch": 0.8554720133667502, | |
| "grad_norm": 0.2672232389450073, | |
| "kl": 3.494140625, | |
| "learning_rate": 1.234683637160048e-06, | |
| "loss": -0.1723, | |
| "reward": 1.6679688394069672, | |
| "reward_std": 0.6499809473752975, | |
| "rewards/embodied_math": 0.10714285797439516, | |
| "rewards/format_reward": 0.7790178954601288, | |
| "rewards/tag_count_reward": 0.7818080633878708, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1220.99560546875, | |
| "epoch": 0.858813700918964, | |
| "grad_norm": 0.33298298716545105, | |
| "kl": 2.24609375, | |
| "learning_rate": 1.1790675250761263e-06, | |
| "loss": -0.1364, | |
| "reward": 1.7343750894069672, | |
| "reward_std": 0.5940727889537811, | |
| "rewards/embodied_math": 0.1250000037252903, | |
| "rewards/format_reward": 0.8058035969734192, | |
| "rewards/tag_count_reward": 0.8035714626312256, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.9643249511719, | |
| "epoch": 0.8621553884711779, | |
| "grad_norm": 0.2807648181915283, | |
| "kl": 2.150390625, | |
| "learning_rate": 1.124654521357934e-06, | |
| "loss": -0.124, | |
| "reward": 1.8018974363803864, | |
| "reward_std": 0.6039710342884064, | |
| "rewards/embodied_math": 0.15625000977888703, | |
| "rewards/format_reward": 0.8281250596046448, | |
| "rewards/tag_count_reward": 0.8175223618745804, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.5736999511719, | |
| "epoch": 0.8654970760233918, | |
| "grad_norm": 0.2357352077960968, | |
| "kl": 1.890625, | |
| "learning_rate": 1.0714520475269653e-06, | |
| "loss": -0.091, | |
| "reward": 1.8046875894069672, | |
| "reward_std": 0.528480052947998, | |
| "rewards/embodied_math": 0.10714285913854837, | |
| "rewards/format_reward": 0.8593750447034836, | |
| "rewards/tag_count_reward": 0.8381696790456772, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1218.3772583007812, | |
| "epoch": 0.8688387635756056, | |
| "grad_norm": 0.35705995559692383, | |
| "kl": 2.6884765625, | |
| "learning_rate": 1.0194673599976134e-06, | |
| "loss": -0.1249, | |
| "reward": 1.6947545111179352, | |
| "reward_std": 0.615173727273941, | |
| "rewards/embodied_math": 0.10491071827709675, | |
| "rewards/format_reward": 0.8013393133878708, | |
| "rewards/tag_count_reward": 0.7885045111179352, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.2745971679688, | |
| "epoch": 0.8721804511278195, | |
| "grad_norm": 0.2711421847343445, | |
| "kl": 1.5419921875, | |
| "learning_rate": 9.687075490874376e-07, | |
| "loss": -0.085, | |
| "reward": 1.6858260035514832, | |
| "reward_std": 0.5885833278298378, | |
| "rewards/embodied_math": 0.06473214481957257, | |
| "rewards/format_reward": 0.808035746216774, | |
| "rewards/tag_count_reward": 0.8130580633878708, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1219.0625305175781, | |
| "epoch": 0.8755221386800334, | |
| "grad_norm": 0.5203860998153687, | |
| "kl": 2.400390625, | |
| "learning_rate": 9.191795380501133e-07, | |
| "loss": -0.1352, | |
| "reward": 1.6562500596046448, | |
| "reward_std": 0.6262349635362625, | |
| "rewards/embodied_math": 0.0446428582072258, | |
| "rewards/format_reward": 0.8236607760190964, | |
| "rewards/tag_count_reward": 0.7879464477300644, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.0000610351562, | |
| "epoch": 0.8788638262322472, | |
| "grad_norm": 0.3880462944507599, | |
| "kl": 1.912109375, | |
| "learning_rate": 8.708900821311405e-07, | |
| "loss": -0.1179, | |
| "reward": 1.7483260035514832, | |
| "reward_std": 0.5779529809951782, | |
| "rewards/embodied_math": 0.06696428917348385, | |
| "rewards/format_reward": 0.850446492433548, | |
| "rewards/tag_count_reward": 0.8309152126312256, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1225.6027526855469, | |
| "epoch": 0.8822055137844611, | |
| "grad_norm": 0.276309609413147, | |
| "kl": 2.359375, | |
| "learning_rate": 8.238457676464873e-07, | |
| "loss": -0.1309, | |
| "reward": 1.8102679252624512, | |
| "reward_std": 0.5943205058574677, | |
| "rewards/embodied_math": 0.1450892947614193, | |
| "rewards/format_reward": 0.8459821790456772, | |
| "rewards/tag_count_reward": 0.8191964626312256, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1237.4420166015625, | |
| "epoch": 0.885547201336675, | |
| "grad_norm": 0.30299896001815796, | |
| "kl": 1.912109375, | |
| "learning_rate": 7.780530110842566e-07, | |
| "loss": -0.101, | |
| "reward": 1.8069196939468384, | |
| "reward_std": 0.5637443214654922, | |
| "rewards/embodied_math": 0.15178572107106447, | |
| "rewards/format_reward": 0.830357164144516, | |
| "rewards/tag_count_reward": 0.824776828289032, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1217.5067443847656, | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.45059701800346375, | |
| "kl": 3.25, | |
| "learning_rate": 7.335180582295387e-07, | |
| "loss": -0.1389, | |
| "reward": 1.7516741752624512, | |
| "reward_std": 0.5634035468101501, | |
| "rewards/embodied_math": 0.06250000186264515, | |
| "rewards/format_reward": 0.8526786118745804, | |
| "rewards/tag_count_reward": 0.8364955633878708, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1246.4553833007812, | |
| "epoch": 0.8922305764411027, | |
| "grad_norm": 0.3325289189815521, | |
| "kl": 2.2841796875, | |
| "learning_rate": 6.902469833125236e-07, | |
| "loss": -0.0868, | |
| "reward": 1.83147332072258, | |
| "reward_std": 0.478071965277195, | |
| "rewards/embodied_math": 0.09821428847499192, | |
| "rewards/format_reward": 0.8839286267757416, | |
| "rewards/tag_count_reward": 0.8493303954601288, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.0826416015625, | |
| "epoch": 0.8955722639933166, | |
| "grad_norm": 0.17567971348762512, | |
| "kl": 1.86669921875, | |
| "learning_rate": 6.482456881800248e-07, | |
| "loss": -0.0873, | |
| "reward": 1.784040242433548, | |
| "reward_std": 0.5592886134982109, | |
| "rewards/embodied_math": 0.1004464328289032, | |
| "rewards/format_reward": 0.8437500596046448, | |
| "rewards/tag_count_reward": 0.8398437947034836, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.1361999511719, | |
| "epoch": 0.8989139515455304, | |
| "grad_norm": 0.33967000246047974, | |
| "kl": 2.61474609375, | |
| "learning_rate": 6.075199014905153e-07, | |
| "loss": -0.094, | |
| "reward": 1.8325894176959991, | |
| "reward_std": 0.5444860756397247, | |
| "rewards/embodied_math": 0.11830357811413705, | |
| "rewards/format_reward": 0.861607164144516, | |
| "rewards/tag_count_reward": 0.8526786267757416, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1256.3304138183594, | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 0.2733778655529022, | |
| "kl": 1.634765625, | |
| "learning_rate": 5.680751779327742e-07, | |
| "loss": -0.0806, | |
| "reward": 1.7968750894069672, | |
| "reward_std": 0.5211016461253166, | |
| "rewards/embodied_math": 0.0647321455180645, | |
| "rewards/format_reward": 0.8772321790456772, | |
| "rewards/tag_count_reward": 0.854910746216774, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1230.5335388183594, | |
| "epoch": 0.9055973266499582, | |
| "grad_norm": 0.21413490176200867, | |
| "kl": 2.7421875, | |
| "learning_rate": 5.299168974682789e-07, | |
| "loss": -0.1133, | |
| "reward": 1.8024554550647736, | |
| "reward_std": 0.5250123292207718, | |
| "rewards/embodied_math": 0.0625000037252903, | |
| "rewards/format_reward": 0.886160746216774, | |
| "rewards/tag_count_reward": 0.8537946939468384, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.419677734375, | |
| "epoch": 0.908939014202172, | |
| "grad_norm": 0.1741914302110672, | |
| "kl": 2.4306640625, | |
| "learning_rate": 4.930502645974122e-07, | |
| "loss": -0.0974, | |
| "reward": 1.8108260035514832, | |
| "reward_std": 0.5831947550177574, | |
| "rewards/embodied_math": 0.13616071757860482, | |
| "rewards/format_reward": 0.8504464626312256, | |
| "rewards/tag_count_reward": 0.8242187649011612, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1224.7545166015625, | |
| "epoch": 0.9122807017543859, | |
| "grad_norm": 0.20169594883918762, | |
| "kl": 2.310546875, | |
| "learning_rate": 4.574803076496148e-07, | |
| "loss": -0.1158, | |
| "reward": 1.7460938394069672, | |
| "reward_std": 0.6089013665914536, | |
| "rewards/embodied_math": 0.10267857206054032, | |
| "rewards/format_reward": 0.8303571939468384, | |
| "rewards/tag_count_reward": 0.813058078289032, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.2879943847656, | |
| "epoch": 0.9156223893065998, | |
| "grad_norm": 0.1624419391155243, | |
| "kl": 2.3359375, | |
| "learning_rate": 4.232118780975447e-07, | |
| "loss": -0.1202, | |
| "reward": 1.8231027722358704, | |
| "reward_std": 0.5556938722729683, | |
| "rewards/embodied_math": 0.12053571827709675, | |
| "rewards/format_reward": 0.8616071939468384, | |
| "rewards/tag_count_reward": 0.8409598469734192, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.0625305175781, | |
| "epoch": 0.9189640768588136, | |
| "grad_norm": 0.2469264715909958, | |
| "kl": 2.6015625, | |
| "learning_rate": 3.9024964989539227e-07, | |
| "loss": -0.1026, | |
| "reward": 1.7650670409202576, | |
| "reward_std": 0.5289107412099838, | |
| "rewards/embodied_math": 0.0602678619325161, | |
| "rewards/format_reward": 0.8660714775323868, | |
| "rewards/tag_count_reward": 0.8387277126312256, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1248.2746276855469, | |
| "epoch": 0.9223057644110275, | |
| "grad_norm": 0.154992938041687, | |
| "kl": 1.62109375, | |
| "learning_rate": 3.585981188413767e-07, | |
| "loss": -0.0986, | |
| "reward": 1.848772406578064, | |
| "reward_std": 0.43934302031993866, | |
| "rewards/embodied_math": 0.08928571990691125, | |
| "rewards/format_reward": 0.895089328289032, | |
| "rewards/tag_count_reward": 0.8643973618745804, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1241.7991638183594, | |
| "epoch": 0.9256474519632414, | |
| "grad_norm": 0.1895643174648285, | |
| "kl": 2.521484375, | |
| "learning_rate": 3.2826160196455124e-07, | |
| "loss": -0.0872, | |
| "reward": 1.8766741752624512, | |
| "reward_std": 0.5750466883182526, | |
| "rewards/embodied_math": 0.17633929336443543, | |
| "rewards/format_reward": 0.8549107760190964, | |
| "rewards/tag_count_reward": 0.8454241454601288, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1223.7031555175781, | |
| "epoch": 0.9289891395154553, | |
| "grad_norm": 0.3093890845775604, | |
| "kl": 2.98828125, | |
| "learning_rate": 2.9924423693600157e-07, | |
| "loss": -0.1305, | |
| "reward": 1.7991071939468384, | |
| "reward_std": 0.6020410656929016, | |
| "rewards/embodied_math": 0.1205357201397419, | |
| "rewards/format_reward": 0.8437500447034836, | |
| "rewards/tag_count_reward": 0.8348214775323868, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1252.821533203125, | |
| "epoch": 0.9323308270676691, | |
| "grad_norm": 0.14048472046852112, | |
| "kl": 2.017578125, | |
| "learning_rate": 2.7154998150449643e-07, | |
| "loss": -0.0813, | |
| "reward": 1.7885045409202576, | |
| "reward_std": 0.5473662465810776, | |
| "rewards/embodied_math": 0.07366071827709675, | |
| "rewards/format_reward": 0.8660714775323868, | |
| "rewards/tag_count_reward": 0.8487723767757416, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1257.71435546875, | |
| "epoch": 0.935672514619883, | |
| "grad_norm": 0.25984320044517517, | |
| "kl": 1.62060546875, | |
| "learning_rate": 2.4518261295667255e-07, | |
| "loss": -0.0721, | |
| "reward": 1.7974331080913544, | |
| "reward_std": 0.5135258659720421, | |
| "rewards/embodied_math": 0.06026785937137902, | |
| "rewards/format_reward": 0.8816964626312256, | |
| "rewards/tag_count_reward": 0.8554687798023224, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.5870971679688, | |
| "epoch": 0.9390142021720969, | |
| "grad_norm": 0.15457814931869507, | |
| "kl": 2.15234375, | |
| "learning_rate": 2.201457276018526e-07, | |
| "loss": -0.0906, | |
| "reward": 1.8141742050647736, | |
| "reward_std": 0.5247047990560532, | |
| "rewards/embodied_math": 0.09821428847499192, | |
| "rewards/format_reward": 0.863839328289032, | |
| "rewards/tag_count_reward": 0.852120578289032, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.9152221679688, | |
| "epoch": 0.9423558897243107, | |
| "grad_norm": 0.1510641723871231, | |
| "kl": 1.44287109375, | |
| "learning_rate": 1.9644274028152944e-07, | |
| "loss": -0.0561, | |
| "reward": 1.8599331080913544, | |
| "reward_std": 0.49419236928224564, | |
| "rewards/embodied_math": 0.09375000698491931, | |
| "rewards/format_reward": 0.8906250298023224, | |
| "rewards/tag_count_reward": 0.8755580931901932, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1247.6763916015625, | |
| "epoch": 0.9456975772765246, | |
| "grad_norm": 0.19849680364131927, | |
| "kl": 1.9423828125, | |
| "learning_rate": 1.740768839036111e-07, | |
| "loss": -0.1009, | |
| "reward": 1.8638394176959991, | |
| "reward_std": 0.5082411393523216, | |
| "rewards/embodied_math": 0.12946428847499192, | |
| "rewards/format_reward": 0.8816964626312256, | |
| "rewards/tag_count_reward": 0.8526786118745804, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.4442749023438, | |
| "epoch": 0.9490392648287385, | |
| "grad_norm": 0.16909794509410858, | |
| "kl": 2.779296875, | |
| "learning_rate": 1.5305120900146908e-07, | |
| "loss": -0.1205, | |
| "reward": 1.8030134737491608, | |
| "reward_std": 0.5760641321539879, | |
| "rewards/embodied_math": 0.10491071734577417, | |
| "rewards/format_reward": 0.8638393133878708, | |
| "rewards/tag_count_reward": 0.8342634439468384, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.4888916015625, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.15841911733150482, | |
| "kl": 2.1328125, | |
| "learning_rate": 1.3336858331787993e-07, | |
| "loss": -0.1172, | |
| "reward": 1.8030134737491608, | |
| "reward_std": 0.5178222879767418, | |
| "rewards/embodied_math": 0.066964291036129, | |
| "rewards/format_reward": 0.8772321790456772, | |
| "rewards/tag_count_reward": 0.8588170111179352, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1237.6495971679688, | |
| "epoch": 0.9557226399331662, | |
| "grad_norm": 0.26993128657341003, | |
| "kl": 2.83203125, | |
| "learning_rate": 1.1503169141388049e-07, | |
| "loss": -0.1095, | |
| "reward": 1.794084906578064, | |
| "reward_std": 0.5182835385203362, | |
| "rewards/embodied_math": 0.03348214412108064, | |
| "rewards/format_reward": 0.8861607611179352, | |
| "rewards/tag_count_reward": 0.8744420111179352, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1231.6451416015625, | |
| "epoch": 0.9590643274853801, | |
| "grad_norm": 0.15350092947483063, | |
| "kl": 1.8896484375, | |
| "learning_rate": 9.804303430261175e-08, | |
| "loss": -0.1118, | |
| "reward": 1.8800224363803864, | |
| "reward_std": 0.5567026510834694, | |
| "rewards/embodied_math": 0.15848215040750802, | |
| "rewards/format_reward": 0.8660714626312256, | |
| "rewards/tag_count_reward": 0.8554687947034836, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1261.5045166015625, | |
| "epoch": 0.9624060150375939, | |
| "grad_norm": 0.1839500516653061, | |
| "kl": 1.720703125, | |
| "learning_rate": 8.240492910820407e-08, | |
| "loss": -0.0612, | |
| "reward": 1.8571429550647736, | |
| "reward_std": 0.49432309716939926, | |
| "rewards/embodied_math": 0.09598214738070965, | |
| "rewards/format_reward": 0.8906250298023224, | |
| "rewards/tag_count_reward": 0.8705357611179352, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.1339721679688, | |
| "epoch": 0.9657477025898078, | |
| "grad_norm": 0.23754307627677917, | |
| "kl": 1.796875, | |
| "learning_rate": 6.811950874973994e-08, | |
| "loss": -0.0933, | |
| "reward": 1.8593750894069672, | |
| "reward_std": 0.4672791361808777, | |
| "rewards/embodied_math": 0.07589286123402417, | |
| "rewards/format_reward": 0.9017857611179352, | |
| "rewards/tag_count_reward": 0.8816964626312256, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1219.2768249511719, | |
| "epoch": 0.9690893901420217, | |
| "grad_norm": 0.20686741173267365, | |
| "kl": 3.244140625, | |
| "learning_rate": 5.518872165033329e-08, | |
| "loss": -0.1338, | |
| "reward": 1.8242188394069672, | |
| "reward_std": 0.5806285068392754, | |
| "rewards/embodied_math": 0.1071428619325161, | |
| "rewards/format_reward": 0.8571428805589676, | |
| "rewards/tag_count_reward": 0.859933078289032, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.810302734375, | |
| "epoch": 0.9724310776942355, | |
| "grad_norm": 0.35922443866729736, | |
| "kl": 2.0673828125, | |
| "learning_rate": 4.361433147138772e-08, | |
| "loss": -0.0769, | |
| "reward": 1.7885045409202576, | |
| "reward_std": 0.5405867323279381, | |
| "rewards/embodied_math": 0.06696428917348385, | |
| "rewards/format_reward": 0.8683036118745804, | |
| "rewards/tag_count_reward": 0.8532366454601288, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1233.82373046875, | |
| "epoch": 0.9757727652464494, | |
| "grad_norm": 0.3444741368293762, | |
| "kl": 2.0546875, | |
| "learning_rate": 3.339791687203997e-08, | |
| "loss": -0.1137, | |
| "reward": 1.854352742433548, | |
| "reward_std": 0.524741031229496, | |
| "rewards/embodied_math": 0.129464291036129, | |
| "rewards/format_reward": 0.8750000447034836, | |
| "rewards/tag_count_reward": 0.8498884439468384, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1226.1250610351562, | |
| "epoch": 0.9791144527986633, | |
| "grad_norm": 0.2046412229537964, | |
| "kl": 2.8984375, | |
| "learning_rate": 2.4540871293845526e-08, | |
| "loss": -0.1185, | |
| "reward": 1.9469866752624512, | |
| "reward_std": 0.57990662753582, | |
| "rewards/embodied_math": 0.207589291036129, | |
| "rewards/format_reward": 0.879464328289032, | |
| "rewards/tag_count_reward": 0.8599330633878708, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1216.9888916015625, | |
| "epoch": 0.9824561403508771, | |
| "grad_norm": 0.4618666172027588, | |
| "kl": 3.0234375, | |
| "learning_rate": 1.7044402770725055e-08, | |
| "loss": -0.1354, | |
| "reward": 1.782366156578064, | |
| "reward_std": 0.5473798364400864, | |
| "rewards/embodied_math": 0.09821428777649999, | |
| "rewards/format_reward": 0.8437500447034836, | |
| "rewards/tag_count_reward": 0.8404018133878708, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1245.3147888183594, | |
| "epoch": 0.985797827903091, | |
| "grad_norm": 0.21296104788780212, | |
| "kl": 1.7734375, | |
| "learning_rate": 1.0909533764194013e-08, | |
| "loss": -0.0758, | |
| "reward": 1.7204241752624512, | |
| "reward_std": 0.5121402740478516, | |
| "rewards/embodied_math": 0.029017859371379018, | |
| "rewards/format_reward": 0.8526786267757416, | |
| "rewards/tag_count_reward": 0.8387277275323868, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.90185546875, | |
| "epoch": 0.9891395154553049, | |
| "grad_norm": 0.2440209835767746, | |
| "kl": 2.74609375, | |
| "learning_rate": 6.137101023910852e-09, | |
| "loss": -0.1229, | |
| "reward": 1.8281250894069672, | |
| "reward_std": 0.5333529859781265, | |
| "rewards/embodied_math": 0.08482143236324191, | |
| "rewards/format_reward": 0.870535746216774, | |
| "rewards/tag_count_reward": 0.8727678954601288, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1258.2143249511719, | |
| "epoch": 0.9924812030075187, | |
| "grad_norm": 0.14400802552700043, | |
| "kl": 1.7626953125, | |
| "learning_rate": 2.7277554735449797e-09, | |
| "loss": -0.0644, | |
| "reward": 1.8348215222358704, | |
| "reward_std": 0.5516516491770744, | |
| "rewards/embodied_math": 0.12723214738070965, | |
| "rewards/format_reward": 0.8549107313156128, | |
| "rewards/tag_count_reward": 0.8526786118745804, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1253.5580444335938, | |
| "epoch": 0.9958228905597326, | |
| "grad_norm": 0.4155780076980591, | |
| "kl": 2.40234375, | |
| "learning_rate": 6.819621220033323e-10, | |
| "loss": -0.08, | |
| "reward": 1.825334906578064, | |
| "reward_std": 0.5505645722150803, | |
| "rewards/embodied_math": 0.11383929220028222, | |
| "rewards/format_reward": 0.8660714626312256, | |
| "rewards/tag_count_reward": 0.8454241305589676, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1240.5057678222656, | |
| "epoch": 0.9991645781119465, | |
| "grad_norm": 0.3007453680038452, | |
| "kl": 2.044921875, | |
| "learning_rate": 0.0, | |
| "loss": -0.1029, | |
| "reward": 1.8593750894069672, | |
| "reward_std": 0.5401175245642662, | |
| "rewards/embodied_math": 0.13839286053553224, | |
| "rewards/format_reward": 0.8816964626312256, | |
| "rewards/tag_count_reward": 0.8392857611179352, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9991645781119465, | |
| "step": 299, | |
| "total_flos": 0.0, | |
| "train_loss": -0.013552246318095758, | |
| "train_runtime": 19984.1821, | |
| "train_samples_per_second": 0.419, | |
| "train_steps_per_second": 0.015 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 299, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |