| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.05333333333333334, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 250.1770896911621, |
| "entropy": 0.4295753017067909, |
| "epoch": 0.0005333333333333334, |
| "grad_norm": 0.0589873343706131, |
| "learning_rate": 1.5e-06, |
| "log_pi_ratio": -8.231443386813586e-05, |
| "loss": -0.0, |
| "reward": 1.406250074505806, |
| "rewards/boxed_and_answer_tags_format_reward": 0.65625, |
| "rewards/correctness_reward_func_math": 0.7499999850988388, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4295753017067909, |
| "epoch": 0.0010666666666666667, |
| "grad_norm": 0.05906615033745766, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -8.231443386813586e-05, |
| "loss": -0.0, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.429742269217968, |
| "epoch": 0.0016, |
| "grad_norm": 0.07365015894174576, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0002492839630576782, |
| "loss": 0.0002, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4291049689054489, |
| "epoch": 0.0021333333333333334, |
| "grad_norm": 0.06198830157518387, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0003880164513248019, |
| "loss": -0.0, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 228.6354217529297, |
| "entropy": 0.5289351046085358, |
| "epoch": 0.0026666666666666666, |
| "grad_norm": 0.05378839746117592, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -5.4222204198595136e-05, |
| "loss": -0.0003, |
| "reward": 1.2187500298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.6250000055879354, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.529054805636406, |
| "epoch": 0.0032, |
| "grad_norm": 0.05765789747238159, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0001739257131703198, |
| "loss": -0.0001, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5290582627058029, |
| "epoch": 0.0037333333333333333, |
| "grad_norm": 0.06511564552783966, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00017735697838361375, |
| "loss": -0.0003, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5294944420456886, |
| "epoch": 0.004266666666666667, |
| "grad_norm": 0.05595042556524277, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0006135236180853099, |
| "loss": -0.0004, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 242.25000762939453, |
| "entropy": 0.4981203153729439, |
| "epoch": 0.0048, |
| "grad_norm": 0.05393671989440918, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 7.427447417285293e-05, |
| "loss": 0.0002, |
| "reward": 1.2395833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5729166641831398, |
| "rewards/correctness_reward_func_math": 0.6666666716337204, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.49807676672935486, |
| "epoch": 0.005333333333333333, |
| "grad_norm": 0.2750526964664459, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00011780268323491327, |
| "loss": 0.0001, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.49833260476589203, |
| "epoch": 0.005866666666666667, |
| "grad_norm": 0.058018915355205536, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0001380204048473388, |
| "loss": -0.0003, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4981199651956558, |
| "epoch": 0.0064, |
| "grad_norm": 0.05605858191847801, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 7.462104986188933e-05, |
| "loss": -0.0001, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 239.56250381469727, |
| "entropy": 0.5426438078284264, |
| "epoch": 0.006933333333333333, |
| "grad_norm": 0.04904413968324661, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00010345246846554801, |
| "loss": 0.0003, |
| "reward": 1.1406250298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.640625, |
| "rewards/correctness_reward_func_math": 0.5, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5426872745156288, |
| "epoch": 0.007466666666666667, |
| "grad_norm": 0.04319930449128151, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 5.999111454002559e-05, |
| "loss": 0.0004, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5425786674022675, |
| "epoch": 0.008, |
| "grad_norm": 0.03787970170378685, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00016858673552633263, |
| "loss": 0.0001, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5426425188779831, |
| "epoch": 0.008533333333333334, |
| "grad_norm": 0.036648496985435486, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00010471276982570998, |
| "loss": 0.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 246.3854217529297, |
| "entropy": 0.5258407667279243, |
| "epoch": 0.009066666666666667, |
| "grad_norm": 0.04535071551799774, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0003478629805613309, |
| "loss": -0.0001, |
| "reward": 1.1562500596046448, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6145833283662796, |
| "rewards/correctness_reward_func_math": 0.5416666753590107, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.525847963988781, |
| "epoch": 0.0096, |
| "grad_norm": 0.056307464838027954, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00034069575485773385, |
| "loss": 0.0001, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5257168933749199, |
| "epoch": 0.010133333333333333, |
| "grad_norm": 0.044158197939395905, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00047175012514344417, |
| "loss": 0.0001, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5258035808801651, |
| "epoch": 0.010666666666666666, |
| "grad_norm": 0.04544621706008911, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0003850707726087421, |
| "loss": 0.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 247.2291717529297, |
| "entropy": 0.5515406280755997, |
| "epoch": 0.0112, |
| "grad_norm": 0.058575354516506195, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0008289311444968916, |
| "loss": -0.0, |
| "reward": 1.2031250298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6197916716337204, |
| "rewards/correctness_reward_func_math": 0.583333333954215, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5512647777795792, |
| "epoch": 0.011733333333333333, |
| "grad_norm": 0.06019975244998932, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0005530662892851979, |
| "loss": -0.0002, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5515913963317871, |
| "epoch": 0.012266666666666667, |
| "grad_norm": 0.05747281387448311, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.000879692546732258, |
| "loss": -0.0002, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5510128661990166, |
| "epoch": 0.0128, |
| "grad_norm": 0.06023112311959267, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00030118109862087294, |
| "loss": -0.0002, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 242.03125381469727, |
| "entropy": 0.5678461492061615, |
| "epoch": 0.013333333333333334, |
| "grad_norm": 0.02757483720779419, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00023131727721192874, |
| "loss": -0.0, |
| "reward": 1.1770833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.59375, |
| "rewards/correctness_reward_func_math": 0.5833333358168602, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5679307132959366, |
| "epoch": 0.013866666666666666, |
| "grad_norm": 0.029515035450458527, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0003158858453389257, |
| "loss": -0.0, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5677110254764557, |
| "epoch": 0.0144, |
| "grad_norm": 0.02889692410826683, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -9.621075878385454e-05, |
| "loss": -0.0001, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5675213783979416, |
| "epoch": 0.014933333333333333, |
| "grad_norm": 0.030576596036553383, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 9.346040314994752e-05, |
| "loss": -0.0, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 247.46875381469727, |
| "entropy": 0.5834467113018036, |
| "epoch": 0.015466666666666667, |
| "grad_norm": 0.03030555695295334, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0001248269181814976, |
| "loss": -0.0, |
| "reward": 0.9479166865348816, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5729166716337204, |
| "rewards/correctness_reward_func_math": 0.375, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5832108706235886, |
| "epoch": 0.016, |
| "grad_norm": 0.030331073328852654, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00011100981646450236, |
| "loss": 0.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5831519067287445, |
| "epoch": 0.016533333333333334, |
| "grad_norm": 0.031816788017749786, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00016997993225231767, |
| "loss": 0.0, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.583659753203392, |
| "epoch": 0.017066666666666667, |
| "grad_norm": 0.03242775425314903, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0003378683468326926, |
| "loss": -0.0001, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 237.91667556762695, |
| "entropy": 0.5774409174919128, |
| "epoch": 0.0176, |
| "grad_norm": 0.040067657828330994, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0002348238049307838, |
| "loss": 0.0, |
| "reward": 0.911458358168602, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592, |
| "rewards/correctness_reward_func_math": 0.3541666567325592, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5770600140094757, |
| "epoch": 0.018133333333333335, |
| "grad_norm": 0.04447396472096443, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00014607750927098095, |
| "loss": -0.0001, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5768891721963882, |
| "epoch": 0.018666666666666668, |
| "grad_norm": 0.04008471965789795, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00031690803734818473, |
| "loss": -0.0002, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5769828110933304, |
| "epoch": 0.0192, |
| "grad_norm": 0.041593629866838455, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00022326732505462132, |
| "loss": -0.0001, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 245.14583587646484, |
| "entropy": 0.5890882089734077, |
| "epoch": 0.019733333333333332, |
| "grad_norm": 0.05088355764746666, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0004102104066987522, |
| "loss": 0.0, |
| "reward": 0.9531250298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6406249850988388, |
| "rewards/correctness_reward_func_math": 0.3125000037252903, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5890304446220398, |
| "epoch": 0.020266666666666665, |
| "grad_norm": 0.05404159799218178, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00035247779851488303, |
| "loss": -0.0001, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5886539816856384, |
| "epoch": 0.0208, |
| "grad_norm": 0.05120766907930374, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 2.400765151833184e-05, |
| "loss": -0.0002, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5889870673418045, |
| "epoch": 0.021333333333333333, |
| "grad_norm": 0.053153108805418015, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00030907555992598645, |
| "loss": -0.0001, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 237.1354217529297, |
| "entropy": 0.54951012134552, |
| "epoch": 0.021866666666666666, |
| "grad_norm": 0.06290385127067566, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00025390044174855575, |
| "loss": -0.0001, |
| "reward": 1.005208358168602, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5677083358168602, |
| "rewards/correctness_reward_func_math": 0.4375, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.54947529733181, |
| "epoch": 0.0224, |
| "grad_norm": 0.06733641773462296, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00021907184418523684, |
| "loss": -0.0002, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5490709245204926, |
| "epoch": 0.022933333333333333, |
| "grad_norm": 0.06687918305397034, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0001853161011240445, |
| "loss": 0.0001, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5493637472391129, |
| "epoch": 0.023466666666666667, |
| "grad_norm": 0.06797856092453003, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00010751574882306159, |
| "loss": -0.0004, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 244.3854217529297, |
| "entropy": 0.5019608214497566, |
| "epoch": 0.024, |
| "grad_norm": 0.04074651375412941, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00022211501527635846, |
| "loss": 0.0003, |
| "reward": 1.1562500298023224, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, |
| "rewards/correctness_reward_func_math": 0.5416666679084301, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5016429349780083, |
| "epoch": 0.024533333333333334, |
| "grad_norm": 0.05098120495676994, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 9.576557931723073e-05, |
| "loss": 0.0002, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5021634548902512, |
| "epoch": 0.025066666666666668, |
| "grad_norm": 0.04488995671272278, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00042477464376133867, |
| "loss": 0.0003, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.502085268497467, |
| "epoch": 0.0256, |
| "grad_norm": 0.04084954410791397, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00034658329968806356, |
| "loss": 0.0001, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 244.5208396911621, |
| "entropy": 0.5566539317369461, |
| "epoch": 0.026133333333333335, |
| "grad_norm": 0.048481233417987823, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00011470551180536859, |
| "loss": -0.0, |
| "reward": 1.2760417014360428, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6302083283662796, |
| "rewards/correctness_reward_func_math": 0.6458333283662796, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5565382838249207, |
| "epoch": 0.02666666666666667, |
| "grad_norm": 0.049503978341817856, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 9.508294169791043e-07, |
| "loss": -0.0002, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5565498322248459, |
| "epoch": 0.0272, |
| "grad_norm": 0.04954688996076584, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -1.0585041309241205e-05, |
| "loss": -0.0001, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5563456490635872, |
| "epoch": 0.027733333333333332, |
| "grad_norm": 0.04860557243227959, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00019361122394911945, |
| "loss": -0.0003, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 249.14583587646484, |
| "entropy": 0.5415888875722885, |
| "epoch": 0.028266666666666666, |
| "grad_norm": 0.028491849079728127, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00018408894175081514, |
| "loss": -0.0001, |
| "reward": 1.3177083730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5885416716337204, |
| "rewards/correctness_reward_func_math": 0.7291666772216558, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5417958348989487, |
| "epoch": 0.0288, |
| "grad_norm": 0.0282636396586895, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00039100474168662913, |
| "loss": -0.0001, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5419644862413406, |
| "epoch": 0.029333333333333333, |
| "grad_norm": 0.028839441016316414, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0005596724040515255, |
| "loss": -0.0002, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5417948812246323, |
| "epoch": 0.029866666666666666, |
| "grad_norm": 0.053943611681461334, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.000390107452403754, |
| "loss": -0.0001, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 244.5729217529297, |
| "entropy": 0.47408151626586914, |
| "epoch": 0.0304, |
| "grad_norm": 0.04138944670557976, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00039840556564740837, |
| "loss": 0.0, |
| "reward": 1.5520833730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5729166567325592, |
| "rewards/correctness_reward_func_math": 0.9791666716337204, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.47388753294944763, |
| "epoch": 0.030933333333333334, |
| "grad_norm": 0.040251851081848145, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00020442262757569551, |
| "loss": 0.0001, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.47425322979688644, |
| "epoch": 0.031466666666666664, |
| "grad_norm": 0.042413048446178436, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0005701218979083933, |
| "loss": -0.0001, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4742051735520363, |
| "epoch": 0.032, |
| "grad_norm": 0.04123881086707115, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0005220575912971981, |
| "loss": -0.0001, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 249.36458587646484, |
| "entropy": 0.5222784653306007, |
| "epoch": 0.03253333333333333, |
| "grad_norm": 0.039247363805770874, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00015966666069289204, |
| "loss": -0.0001, |
| "reward": 0.9322917014360428, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5989583283662796, |
| "rewards/correctness_reward_func_math": 0.33333334140479565, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5227270275354385, |
| "epoch": 0.03306666666666667, |
| "grad_norm": 0.039518315345048904, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0002888756343963905, |
| "loss": 0.0, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5227140858769417, |
| "epoch": 0.0336, |
| "grad_norm": 0.039462391287088394, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00027595218853093684, |
| "loss": 0.0001, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5224398598074913, |
| "epoch": 0.034133333333333335, |
| "grad_norm": 0.03947189077734947, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -1.7108341126004234e-06, |
| "loss": 0.0, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 243.2083396911621, |
| "entropy": 0.6066446006298065, |
| "epoch": 0.034666666666666665, |
| "grad_norm": 0.16914069652557373, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0003220176149625331, |
| "loss": 0.0006, |
| "reward": 1.197916716337204, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, |
| "rewards/correctness_reward_func_math": 0.5625000074505806, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.6070155650377274, |
| "epoch": 0.0352, |
| "grad_norm": 0.051245734095573425, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -4.8981230065692216e-05, |
| "loss": -0.0003, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.6069309562444687, |
| "epoch": 0.03573333333333333, |
| "grad_norm": 0.6271181702613831, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 3.563751306501217e-05, |
| "loss": 0.0002, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.6063584238290787, |
| "epoch": 0.03626666666666667, |
| "grad_norm": 0.2069249153137207, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0006081771425670013, |
| "loss": 0.0004, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 247.91667556762695, |
| "entropy": 0.5086031928658485, |
| "epoch": 0.0368, |
| "grad_norm": 0.02908812277019024, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00016642993068671785, |
| "loss": 0.0002, |
| "reward": 0.8541667014360428, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6250000149011612, |
| "rewards/correctness_reward_func_math": 0.2291666679084301, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5083719342947006, |
| "epoch": 0.037333333333333336, |
| "grad_norm": 0.028778737410902977, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00039769090653862804, |
| "loss": 0.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5086484104394913, |
| "epoch": 0.037866666666666667, |
| "grad_norm": 0.028771471232175827, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00012122248881496489, |
| "loss": 0.0001, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5086371675133705, |
| "epoch": 0.0384, |
| "grad_norm": 0.02933097817003727, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00013244504225440323, |
| "loss": 0.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 246.05209350585938, |
| "entropy": 0.4944235682487488, |
| "epoch": 0.038933333333333334, |
| "grad_norm": 0.05496666207909584, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0007831976836314425, |
| "loss": -0.0003, |
| "reward": 1.1666667014360428, |
| "rewards/boxed_and_answer_tags_format_reward": 0.625, |
| "rewards/correctness_reward_func_math": 0.5416666828095913, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4939584732055664, |
| "epoch": 0.039466666666666664, |
| "grad_norm": 0.047105155885219574, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0003181274078087881, |
| "loss": 0.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4941820651292801, |
| "epoch": 0.04, |
| "grad_norm": 0.056431617587804794, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0005417007705545984, |
| "loss": -0.0004, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4941381961107254, |
| "epoch": 0.04053333333333333, |
| "grad_norm": 0.05862469598650932, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0004978412907803431, |
| "loss": -0.0002, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 240.15625381469727, |
| "entropy": 0.5174174681305885, |
| "epoch": 0.04106666666666667, |
| "grad_norm": 0.0542527511715889, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0001666912721702829, |
| "loss": 0.0003, |
| "reward": 1.145833358168602, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6041666716337204, |
| "rewards/correctness_reward_func_math": 0.5416666641831398, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5173228904604912, |
| "epoch": 0.0416, |
| "grad_norm": 0.054989397525787354, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -7.21228716429323e-05, |
| "loss": -0.0001, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5171471834182739, |
| "epoch": 0.042133333333333335, |
| "grad_norm": 0.060466669499874115, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0001036007670336403, |
| "loss": 0.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5172587782144547, |
| "epoch": 0.042666666666666665, |
| "grad_norm": 0.05961833521723747, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -8.001996320672333e-06, |
| "loss": 0.0, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 237.87500381469727, |
| "entropy": 0.5285271257162094, |
| "epoch": 0.0432, |
| "grad_norm": 0.03982525318861008, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -5.077758032712154e-05, |
| "loss": -0.0001, |
| "reward": 0.994791716337204, |
| "rewards/boxed_and_answer_tags_format_reward": 0.536458320915699, |
| "rewards/correctness_reward_func_math": 0.45833333395421505, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5284614786505699, |
| "epoch": 0.04373333333333333, |
| "grad_norm": 0.03896940127015114, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 1.4868252037558705e-05, |
| "loss": -0.0002, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5284661799669266, |
| "epoch": 0.04426666666666667, |
| "grad_norm": 0.036616504192352295, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 1.0161373211303726e-05, |
| "loss": -0.0002, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5283261984586716, |
| "epoch": 0.0448, |
| "grad_norm": 0.037983059883117676, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0001501273291069083, |
| "loss": -0.0002, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 233.6666717529297, |
| "entropy": 0.6016157865524292, |
| "epoch": 0.04533333333333334, |
| "grad_norm": 0.03654065728187561, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0014846454323560465, |
| "loss": 0.0002, |
| "reward": 1.0677083730697632, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5677083358168602, |
| "rewards/correctness_reward_func_math": 0.5, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.600335918366909, |
| "epoch": 0.04586666666666667, |
| "grad_norm": 0.047179799526929855, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00020477080761338584, |
| "loss": 0.0001, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5999719500541687, |
| "epoch": 0.0464, |
| "grad_norm": 0.038354724645614624, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00015919587895041332, |
| "loss": 0.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.600743018090725, |
| "epoch": 0.046933333333333334, |
| "grad_norm": 0.036794889718294144, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0006118775927461684, |
| "loss": -0.0001, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 248.46875, |
| "entropy": 0.48756077140569687, |
| "epoch": 0.047466666666666664, |
| "grad_norm": 0.06522957235574722, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00020513215349637903, |
| "loss": 0.0001, |
| "reward": 1.5520834028720856, |
| "rewards/boxed_and_answer_tags_format_reward": 0.6145833283662796, |
| "rewards/correctness_reward_func_math": 0.9375, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4876759424805641, |
| "epoch": 0.048, |
| "grad_norm": 0.06448456645011902, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 8.997155237011611e-05, |
| "loss": -0.0001, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4875106140971184, |
| "epoch": 0.04853333333333333, |
| "grad_norm": 0.0680398941040039, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0002552772348280996, |
| "loss": -0.0002, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.4876427575945854, |
| "epoch": 0.04906666666666667, |
| "grad_norm": 0.06725798547267914, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.000123157435155008, |
| "loss": -0.0001, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 223.21875762939453, |
| "entropy": 0.541013702750206, |
| "epoch": 0.0496, |
| "grad_norm": 0.044657789170742035, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -5.162663001101464e-05, |
| "loss": -0.0003, |
| "reward": 1.4635417312383652, |
| "rewards/boxed_and_answer_tags_format_reward": 0.5885416716337204, |
| "rewards/correctness_reward_func_math": 0.8749999776482582, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5411660596728325, |
| "epoch": 0.050133333333333335, |
| "grad_norm": 0.04498978331685066, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.00020399675122462213, |
| "loss": -0.0002, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5406382009387016, |
| "epoch": 0.050666666666666665, |
| "grad_norm": 0.045420143753290176, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00032386608654633164, |
| "loss": -0.0004, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5410767793655396, |
| "epoch": 0.0512, |
| "grad_norm": 0.08541081845760345, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": -0.0001147016737377271, |
| "loss": -0.0003, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 235.8541717529297, |
| "entropy": 0.5329247713088989, |
| "epoch": 0.05173333333333333, |
| "grad_norm": 0.06892135739326477, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0005140792491147295, |
| "loss": 0.0, |
| "reward": 1.182291716337204, |
| "rewards/boxed_and_answer_tags_format_reward": 0.640625, |
| "rewards/correctness_reward_func_math": 0.5416666641831398, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5331239253282547, |
| "epoch": 0.05226666666666667, |
| "grad_norm": 0.07429134100675583, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.0003149460808344884, |
| "loss": -0.0002, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5332067087292671, |
| "epoch": 0.0528, |
| "grad_norm": 0.06773337721824646, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00023215196961245965, |
| "loss": -0.0001, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 0.5332778841257095, |
| "epoch": 0.05333333333333334, |
| "grad_norm": 0.07203734666109085, |
| "learning_rate": 3e-06, |
| "log_pi_ratio": 0.00016098002379294485, |
| "loss": -0.0006, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 18750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|