math_base_espo / checkpoint-100 /trainer_state.json
sheepy928's picture
Training in progress, step 100, checkpoint
d1891a7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05333333333333334,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 250.1770896911621,
"entropy": 0.4295753017067909,
"epoch": 0.0005333333333333334,
"grad_norm": 0.0589873343706131,
"learning_rate": 1.5e-06,
"log_pi_ratio": -8.231443386813586e-05,
"loss": -0.0,
"reward": 1.406250074505806,
"rewards/boxed_and_answer_tags_format_reward": 0.65625,
"rewards/correctness_reward_func_math": 0.7499999850988388,
"step": 1
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4295753017067909,
"epoch": 0.0010666666666666667,
"grad_norm": 0.05906615033745766,
"learning_rate": 3e-06,
"log_pi_ratio": -8.231443386813586e-05,
"loss": -0.0,
"step": 2
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.429742269217968,
"epoch": 0.0016,
"grad_norm": 0.07365015894174576,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0002492839630576782,
"loss": 0.0002,
"step": 3
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4291049689054489,
"epoch": 0.0021333333333333334,
"grad_norm": 0.06198830157518387,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0003880164513248019,
"loss": -0.0,
"step": 4
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 228.6354217529297,
"entropy": 0.5289351046085358,
"epoch": 0.0026666666666666666,
"grad_norm": 0.05378839746117592,
"learning_rate": 3e-06,
"log_pi_ratio": -5.4222204198595136e-05,
"loss": -0.0003,
"reward": 1.2187500298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.6250000055879354,
"step": 5
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.529054805636406,
"epoch": 0.0032,
"grad_norm": 0.05765789747238159,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0001739257131703198,
"loss": -0.0001,
"step": 6
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5290582627058029,
"epoch": 0.0037333333333333333,
"grad_norm": 0.06511564552783966,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00017735697838361375,
"loss": -0.0003,
"step": 7
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5294944420456886,
"epoch": 0.004266666666666667,
"grad_norm": 0.05595042556524277,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0006135236180853099,
"loss": -0.0004,
"step": 8
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 242.25000762939453,
"entropy": 0.4981203153729439,
"epoch": 0.0048,
"grad_norm": 0.05393671989440918,
"learning_rate": 3e-06,
"log_pi_ratio": 7.427447417285293e-05,
"loss": 0.0002,
"reward": 1.2395833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5729166641831398,
"rewards/correctness_reward_func_math": 0.6666666716337204,
"step": 9
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.49807676672935486,
"epoch": 0.005333333333333333,
"grad_norm": 0.2750526964664459,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00011780268323491327,
"loss": 0.0001,
"step": 10
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.49833260476589203,
"epoch": 0.005866666666666667,
"grad_norm": 0.058018915355205536,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0001380204048473388,
"loss": -0.0003,
"step": 11
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4981199651956558,
"epoch": 0.0064,
"grad_norm": 0.05605858191847801,
"learning_rate": 3e-06,
"log_pi_ratio": 7.462104986188933e-05,
"loss": -0.0001,
"step": 12
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 239.56250381469727,
"entropy": 0.5426438078284264,
"epoch": 0.006933333333333333,
"grad_norm": 0.04904413968324661,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00010345246846554801,
"loss": 0.0003,
"reward": 1.1406250298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.640625,
"rewards/correctness_reward_func_math": 0.5,
"step": 13
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5426872745156288,
"epoch": 0.007466666666666667,
"grad_norm": 0.04319930449128151,
"learning_rate": 3e-06,
"log_pi_ratio": 5.999111454002559e-05,
"loss": 0.0004,
"step": 14
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5425786674022675,
"epoch": 0.008,
"grad_norm": 0.03787970170378685,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00016858673552633263,
"loss": 0.0001,
"step": 15
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5426425188779831,
"epoch": 0.008533333333333334,
"grad_norm": 0.036648496985435486,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00010471276982570998,
"loss": 0.0,
"step": 16
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 246.3854217529297,
"entropy": 0.5258407667279243,
"epoch": 0.009066666666666667,
"grad_norm": 0.04535071551799774,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0003478629805613309,
"loss": -0.0001,
"reward": 1.1562500596046448,
"rewards/boxed_and_answer_tags_format_reward": 0.6145833283662796,
"rewards/correctness_reward_func_math": 0.5416666753590107,
"step": 17
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.525847963988781,
"epoch": 0.0096,
"grad_norm": 0.056307464838027954,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00034069575485773385,
"loss": 0.0001,
"step": 18
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5257168933749199,
"epoch": 0.010133333333333333,
"grad_norm": 0.044158197939395905,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00047175012514344417,
"loss": 0.0001,
"step": 19
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5258035808801651,
"epoch": 0.010666666666666666,
"grad_norm": 0.04544621706008911,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0003850707726087421,
"loss": 0.0,
"step": 20
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 247.2291717529297,
"entropy": 0.5515406280755997,
"epoch": 0.0112,
"grad_norm": 0.058575354516506195,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0008289311444968916,
"loss": -0.0,
"reward": 1.2031250298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6197916716337204,
"rewards/correctness_reward_func_math": 0.583333333954215,
"step": 21
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5512647777795792,
"epoch": 0.011733333333333333,
"grad_norm": 0.06019975244998932,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0005530662892851979,
"loss": -0.0002,
"step": 22
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5515913963317871,
"epoch": 0.012266666666666667,
"grad_norm": 0.05747281387448311,
"learning_rate": 3e-06,
"log_pi_ratio": -0.000879692546732258,
"loss": -0.0002,
"step": 23
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5510128661990166,
"epoch": 0.0128,
"grad_norm": 0.06023112311959267,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00030118109862087294,
"loss": -0.0002,
"step": 24
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 242.03125381469727,
"entropy": 0.5678461492061615,
"epoch": 0.013333333333333334,
"grad_norm": 0.02757483720779419,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00023131727721192874,
"loss": -0.0,
"reward": 1.1770833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.59375,
"rewards/correctness_reward_func_math": 0.5833333358168602,
"step": 25
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5679307132959366,
"epoch": 0.013866666666666666,
"grad_norm": 0.029515035450458527,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0003158858453389257,
"loss": -0.0,
"step": 26
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5677110254764557,
"epoch": 0.0144,
"grad_norm": 0.02889692410826683,
"learning_rate": 3e-06,
"log_pi_ratio": -9.621075878385454e-05,
"loss": -0.0001,
"step": 27
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5675213783979416,
"epoch": 0.014933333333333333,
"grad_norm": 0.030576596036553383,
"learning_rate": 3e-06,
"log_pi_ratio": 9.346040314994752e-05,
"loss": -0.0,
"step": 28
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 247.46875381469727,
"entropy": 0.5834467113018036,
"epoch": 0.015466666666666667,
"grad_norm": 0.03030555695295334,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0001248269181814976,
"loss": -0.0,
"reward": 0.9479166865348816,
"rewards/boxed_and_answer_tags_format_reward": 0.5729166716337204,
"rewards/correctness_reward_func_math": 0.375,
"step": 29
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5832108706235886,
"epoch": 0.016,
"grad_norm": 0.030331073328852654,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00011100981646450236,
"loss": 0.0,
"step": 30
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5831519067287445,
"epoch": 0.016533333333333334,
"grad_norm": 0.031816788017749786,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00016997993225231767,
"loss": 0.0,
"step": 31
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.583659753203392,
"epoch": 0.017066666666666667,
"grad_norm": 0.03242775425314903,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0003378683468326926,
"loss": -0.0001,
"step": 32
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 237.91667556762695,
"entropy": 0.5774409174919128,
"epoch": 0.0176,
"grad_norm": 0.040067657828330994,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0002348238049307838,
"loss": 0.0,
"reward": 0.911458358168602,
"rewards/boxed_and_answer_tags_format_reward": 0.5572916567325592,
"rewards/correctness_reward_func_math": 0.3541666567325592,
"step": 33
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5770600140094757,
"epoch": 0.018133333333333335,
"grad_norm": 0.04447396472096443,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00014607750927098095,
"loss": -0.0001,
"step": 34
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5768891721963882,
"epoch": 0.018666666666666668,
"grad_norm": 0.04008471965789795,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00031690803734818473,
"loss": -0.0002,
"step": 35
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5769828110933304,
"epoch": 0.0192,
"grad_norm": 0.041593629866838455,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00022326732505462132,
"loss": -0.0001,
"step": 36
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 245.14583587646484,
"entropy": 0.5890882089734077,
"epoch": 0.019733333333333332,
"grad_norm": 0.05088355764746666,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0004102104066987522,
"loss": 0.0,
"reward": 0.9531250298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6406249850988388,
"rewards/correctness_reward_func_math": 0.3125000037252903,
"step": 37
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5890304446220398,
"epoch": 0.020266666666666665,
"grad_norm": 0.05404159799218178,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00035247779851488303,
"loss": -0.0001,
"step": 38
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5886539816856384,
"epoch": 0.0208,
"grad_norm": 0.05120766907930374,
"learning_rate": 3e-06,
"log_pi_ratio": 2.400765151833184e-05,
"loss": -0.0002,
"step": 39
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5889870673418045,
"epoch": 0.021333333333333333,
"grad_norm": 0.053153108805418015,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00030907555992598645,
"loss": -0.0001,
"step": 40
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 237.1354217529297,
"entropy": 0.54951012134552,
"epoch": 0.021866666666666666,
"grad_norm": 0.06290385127067566,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00025390044174855575,
"loss": -0.0001,
"reward": 1.005208358168602,
"rewards/boxed_and_answer_tags_format_reward": 0.5677083358168602,
"rewards/correctness_reward_func_math": 0.4375,
"step": 41
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.54947529733181,
"epoch": 0.0224,
"grad_norm": 0.06733641773462296,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00021907184418523684,
"loss": -0.0002,
"step": 42
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5490709245204926,
"epoch": 0.022933333333333333,
"grad_norm": 0.06687918305397034,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0001853161011240445,
"loss": 0.0001,
"step": 43
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5493637472391129,
"epoch": 0.023466666666666667,
"grad_norm": 0.06797856092453003,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00010751574882306159,
"loss": -0.0004,
"step": 44
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 244.3854217529297,
"entropy": 0.5019608214497566,
"epoch": 0.024,
"grad_norm": 0.04074651375412941,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00022211501527635846,
"loss": 0.0003,
"reward": 1.1562500298023224,
"rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408,
"rewards/correctness_reward_func_math": 0.5416666679084301,
"step": 45
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5016429349780083,
"epoch": 0.024533333333333334,
"grad_norm": 0.05098120495676994,
"learning_rate": 3e-06,
"log_pi_ratio": 9.576557931723073e-05,
"loss": 0.0002,
"step": 46
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5021634548902512,
"epoch": 0.025066666666666668,
"grad_norm": 0.04488995671272278,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00042477464376133867,
"loss": 0.0003,
"step": 47
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.502085268497467,
"epoch": 0.0256,
"grad_norm": 0.04084954410791397,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00034658329968806356,
"loss": 0.0001,
"step": 48
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 244.5208396911621,
"entropy": 0.5566539317369461,
"epoch": 0.026133333333333335,
"grad_norm": 0.048481233417987823,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00011470551180536859,
"loss": -0.0,
"reward": 1.2760417014360428,
"rewards/boxed_and_answer_tags_format_reward": 0.6302083283662796,
"rewards/correctness_reward_func_math": 0.6458333283662796,
"step": 49
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5565382838249207,
"epoch": 0.02666666666666667,
"grad_norm": 0.049503978341817856,
"learning_rate": 3e-06,
"log_pi_ratio": 9.508294169791043e-07,
"loss": -0.0002,
"step": 50
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5565498322248459,
"epoch": 0.0272,
"grad_norm": 0.04954688996076584,
"learning_rate": 3e-06,
"log_pi_ratio": -1.0585041309241205e-05,
"loss": -0.0001,
"step": 51
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5563456490635872,
"epoch": 0.027733333333333332,
"grad_norm": 0.04860557243227959,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00019361122394911945,
"loss": -0.0003,
"step": 52
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 249.14583587646484,
"entropy": 0.5415888875722885,
"epoch": 0.028266666666666666,
"grad_norm": 0.028491849079728127,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00018408894175081514,
"loss": -0.0001,
"reward": 1.3177083730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5885416716337204,
"rewards/correctness_reward_func_math": 0.7291666772216558,
"step": 53
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5417958348989487,
"epoch": 0.0288,
"grad_norm": 0.0282636396586895,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00039100474168662913,
"loss": -0.0001,
"step": 54
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5419644862413406,
"epoch": 0.029333333333333333,
"grad_norm": 0.028839441016316414,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0005596724040515255,
"loss": -0.0002,
"step": 55
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5417948812246323,
"epoch": 0.029866666666666666,
"grad_norm": 0.053943611681461334,
"learning_rate": 3e-06,
"log_pi_ratio": -0.000390107452403754,
"loss": -0.0001,
"step": 56
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 244.5729217529297,
"entropy": 0.47408151626586914,
"epoch": 0.0304,
"grad_norm": 0.04138944670557976,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00039840556564740837,
"loss": 0.0,
"reward": 1.5520833730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5729166567325592,
"rewards/correctness_reward_func_math": 0.9791666716337204,
"step": 57
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.47388753294944763,
"epoch": 0.030933333333333334,
"grad_norm": 0.040251851081848145,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00020442262757569551,
"loss": 0.0001,
"step": 58
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.47425322979688644,
"epoch": 0.031466666666666664,
"grad_norm": 0.042413048446178436,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0005701218979083933,
"loss": -0.0001,
"step": 59
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4742051735520363,
"epoch": 0.032,
"grad_norm": 0.04123881086707115,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0005220575912971981,
"loss": -0.0001,
"step": 60
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 249.36458587646484,
"entropy": 0.5222784653306007,
"epoch": 0.03253333333333333,
"grad_norm": 0.039247363805770874,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00015966666069289204,
"loss": -0.0001,
"reward": 0.9322917014360428,
"rewards/boxed_and_answer_tags_format_reward": 0.5989583283662796,
"rewards/correctness_reward_func_math": 0.33333334140479565,
"step": 61
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5227270275354385,
"epoch": 0.03306666666666667,
"grad_norm": 0.039518315345048904,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0002888756343963905,
"loss": 0.0,
"step": 62
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5227140858769417,
"epoch": 0.0336,
"grad_norm": 0.039462391287088394,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00027595218853093684,
"loss": 0.0001,
"step": 63
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5224398598074913,
"epoch": 0.034133333333333335,
"grad_norm": 0.03947189077734947,
"learning_rate": 3e-06,
"log_pi_ratio": -1.7108341126004234e-06,
"loss": 0.0,
"step": 64
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 243.2083396911621,
"entropy": 0.6066446006298065,
"epoch": 0.034666666666666665,
"grad_norm": 0.16914069652557373,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0003220176149625331,
"loss": 0.0006,
"reward": 1.197916716337204,
"rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592,
"rewards/correctness_reward_func_math": 0.5625000074505806,
"step": 65
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.6070155650377274,
"epoch": 0.0352,
"grad_norm": 0.051245734095573425,
"learning_rate": 3e-06,
"log_pi_ratio": -4.8981230065692216e-05,
"loss": -0.0003,
"step": 66
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.6069309562444687,
"epoch": 0.03573333333333333,
"grad_norm": 0.6271181702613831,
"learning_rate": 3e-06,
"log_pi_ratio": 3.563751306501217e-05,
"loss": 0.0002,
"step": 67
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.6063584238290787,
"epoch": 0.03626666666666667,
"grad_norm": 0.2069249153137207,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0006081771425670013,
"loss": 0.0004,
"step": 68
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 247.91667556762695,
"entropy": 0.5086031928658485,
"epoch": 0.0368,
"grad_norm": 0.02908812277019024,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00016642993068671785,
"loss": 0.0002,
"reward": 0.8541667014360428,
"rewards/boxed_and_answer_tags_format_reward": 0.6250000149011612,
"rewards/correctness_reward_func_math": 0.2291666679084301,
"step": 69
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5083719342947006,
"epoch": 0.037333333333333336,
"grad_norm": 0.028778737410902977,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00039769090653862804,
"loss": 0.0,
"step": 70
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5086484104394913,
"epoch": 0.037866666666666667,
"grad_norm": 0.028771471232175827,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00012122248881496489,
"loss": 0.0001,
"step": 71
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5086371675133705,
"epoch": 0.0384,
"grad_norm": 0.02933097817003727,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00013244504225440323,
"loss": 0.0,
"step": 72
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 246.05209350585938,
"entropy": 0.4944235682487488,
"epoch": 0.038933333333333334,
"grad_norm": 0.05496666207909584,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0007831976836314425,
"loss": -0.0003,
"reward": 1.1666667014360428,
"rewards/boxed_and_answer_tags_format_reward": 0.625,
"rewards/correctness_reward_func_math": 0.5416666828095913,
"step": 73
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4939584732055664,
"epoch": 0.039466666666666664,
"grad_norm": 0.047105155885219574,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0003181274078087881,
"loss": 0.0,
"step": 74
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4941820651292801,
"epoch": 0.04,
"grad_norm": 0.056431617587804794,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0005417007705545984,
"loss": -0.0004,
"step": 75
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4941381961107254,
"epoch": 0.04053333333333333,
"grad_norm": 0.05862469598650932,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0004978412907803431,
"loss": -0.0002,
"step": 76
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 240.15625381469727,
"entropy": 0.5174174681305885,
"epoch": 0.04106666666666667,
"grad_norm": 0.0542527511715889,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0001666912721702829,
"loss": 0.0003,
"reward": 1.145833358168602,
"rewards/boxed_and_answer_tags_format_reward": 0.6041666716337204,
"rewards/correctness_reward_func_math": 0.5416666641831398,
"step": 77
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5173228904604912,
"epoch": 0.0416,
"grad_norm": 0.054989397525787354,
"learning_rate": 3e-06,
"log_pi_ratio": -7.21228716429323e-05,
"loss": -0.0001,
"step": 78
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5171471834182739,
"epoch": 0.042133333333333335,
"grad_norm": 0.060466669499874115,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0001036007670336403,
"loss": 0.0,
"step": 79
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5172587782144547,
"epoch": 0.042666666666666665,
"grad_norm": 0.05961833521723747,
"learning_rate": 3e-06,
"log_pi_ratio": -8.001996320672333e-06,
"loss": 0.0,
"step": 80
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 237.87500381469727,
"entropy": 0.5285271257162094,
"epoch": 0.0432,
"grad_norm": 0.03982525318861008,
"learning_rate": 3e-06,
"log_pi_ratio": -5.077758032712154e-05,
"loss": -0.0001,
"reward": 0.994791716337204,
"rewards/boxed_and_answer_tags_format_reward": 0.536458320915699,
"rewards/correctness_reward_func_math": 0.45833333395421505,
"step": 81
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5284614786505699,
"epoch": 0.04373333333333333,
"grad_norm": 0.03896940127015114,
"learning_rate": 3e-06,
"log_pi_ratio": 1.4868252037558705e-05,
"loss": -0.0002,
"step": 82
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5284661799669266,
"epoch": 0.04426666666666667,
"grad_norm": 0.036616504192352295,
"learning_rate": 3e-06,
"log_pi_ratio": 1.0161373211303726e-05,
"loss": -0.0002,
"step": 83
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5283261984586716,
"epoch": 0.0448,
"grad_norm": 0.037983059883117676,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0001501273291069083,
"loss": -0.0002,
"step": 84
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 233.6666717529297,
"entropy": 0.6016157865524292,
"epoch": 0.04533333333333334,
"grad_norm": 0.03654065728187561,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0014846454323560465,
"loss": 0.0002,
"reward": 1.0677083730697632,
"rewards/boxed_and_answer_tags_format_reward": 0.5677083358168602,
"rewards/correctness_reward_func_math": 0.5,
"step": 85
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.600335918366909,
"epoch": 0.04586666666666667,
"grad_norm": 0.047179799526929855,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00020477080761338584,
"loss": 0.0001,
"step": 86
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5999719500541687,
"epoch": 0.0464,
"grad_norm": 0.038354724645614624,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00015919587895041332,
"loss": 0.0,
"step": 87
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.600743018090725,
"epoch": 0.046933333333333334,
"grad_norm": 0.036794889718294144,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0006118775927461684,
"loss": -0.0001,
"step": 88
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 248.46875,
"entropy": 0.48756077140569687,
"epoch": 0.047466666666666664,
"grad_norm": 0.06522957235574722,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00020513215349637903,
"loss": 0.0001,
"reward": 1.5520834028720856,
"rewards/boxed_and_answer_tags_format_reward": 0.6145833283662796,
"rewards/correctness_reward_func_math": 0.9375,
"step": 89
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4876759424805641,
"epoch": 0.048,
"grad_norm": 0.06448456645011902,
"learning_rate": 3e-06,
"log_pi_ratio": 8.997155237011611e-05,
"loss": -0.0001,
"step": 90
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4875106140971184,
"epoch": 0.04853333333333333,
"grad_norm": 0.0680398941040039,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0002552772348280996,
"loss": -0.0002,
"step": 91
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4876427575945854,
"epoch": 0.04906666666666667,
"grad_norm": 0.06725798547267914,
"learning_rate": 3e-06,
"log_pi_ratio": 0.000123157435155008,
"loss": -0.0001,
"step": 92
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 223.21875762939453,
"entropy": 0.541013702750206,
"epoch": 0.0496,
"grad_norm": 0.044657789170742035,
"learning_rate": 3e-06,
"log_pi_ratio": -5.162663001101464e-05,
"loss": -0.0003,
"reward": 1.4635417312383652,
"rewards/boxed_and_answer_tags_format_reward": 0.5885416716337204,
"rewards/correctness_reward_func_math": 0.8749999776482582,
"step": 93
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5411660596728325,
"epoch": 0.050133333333333335,
"grad_norm": 0.04498978331685066,
"learning_rate": 3e-06,
"log_pi_ratio": -0.00020399675122462213,
"loss": -0.0002,
"step": 94
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5406382009387016,
"epoch": 0.050666666666666665,
"grad_norm": 0.045420143753290176,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00032386608654633164,
"loss": -0.0004,
"step": 95
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5410767793655396,
"epoch": 0.0512,
"grad_norm": 0.08541081845760345,
"learning_rate": 3e-06,
"log_pi_ratio": -0.0001147016737377271,
"loss": -0.0003,
"step": 96
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 235.8541717529297,
"entropy": 0.5329247713088989,
"epoch": 0.05173333333333333,
"grad_norm": 0.06892135739326477,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0005140792491147295,
"loss": 0.0,
"reward": 1.182291716337204,
"rewards/boxed_and_answer_tags_format_reward": 0.640625,
"rewards/correctness_reward_func_math": 0.5416666641831398,
"step": 97
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5331239253282547,
"epoch": 0.05226666666666667,
"grad_norm": 0.07429134100675583,
"learning_rate": 3e-06,
"log_pi_ratio": 0.0003149460808344884,
"loss": -0.0002,
"step": 98
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5332067087292671,
"epoch": 0.0528,
"grad_norm": 0.06773337721824646,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00023215196961245965,
"loss": -0.0001,
"step": 99
},
{
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5332778841257095,
"epoch": 0.05333333333333334,
"grad_norm": 0.07203734666109085,
"learning_rate": 3e-06,
"log_pi_ratio": 0.00016098002379294485,
"loss": -0.0006,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 18750,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}