Qwen3-1.7B-Open-R1-GRPO-Baseline / trainer_state.json
wzx111's picture
Model save
6ad1f0a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 48,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 823.9140625,
"completions/mean_terminated_length": 726.1976928710938,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.10510256746783853,
"epoch": 0.041666666666666664,
"frac_reward_zero_std": 0.203125,
"grad_norm": 0.116986483335495,
"learning_rate": 0.0,
"loss": 0.0234,
"num_tokens": 477500.0,
"reward": 0.5213682651519775,
"reward_std": 0.2439471185207367,
"rewards/<lambda>/mean": 0.5213682651519775,
"rewards/<lambda>/std": 0.5060697793960571,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999539852142334,
"sampling/importance_sampling_ratio/min": 0.051202189177274704,
"sampling/sampling_logp_difference/max": 2.971972942352295,
"sampling/sampling_logp_difference/mean": 0.009669496677815914,
"step": 1,
"step_time": 74.34781758487225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 845.5703125,
"completions/mean_terminated_length": 750.4790649414062,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"entropy": 0.10980924824252725,
"epoch": 0.08333333333333333,
"frac_reward_zero_std": 0.171875,
"grad_norm": 0.10744482278823853,
"learning_rate": 3e-06,
"loss": 0.0278,
"num_tokens": 965576.0,
"reward": 0.519410252571106,
"reward_std": 0.3138212561607361,
"rewards/<lambda>/mean": 0.519410252571106,
"rewards/<lambda>/std": 0.506157398223877,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.000095248222351,
"sampling/importance_sampling_ratio/min": 0.20458395779132843,
"sampling/sampling_logp_difference/max": 1.586776852607727,
"sampling/sampling_logp_difference/mean": 0.010054240934550762,
"step": 2,
"step_time": 53.79163959249854
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.48046875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 882.068359375,
"completions/mean_terminated_length": 750.8082885742188,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"entropy": 0.11614924483001232,
"epoch": 0.125,
"frac_reward_zero_std": 0.109375,
"grad_norm": 0.10017473250627518,
"learning_rate": 6e-06,
"loss": 0.0165,
"num_tokens": 1480499.0,
"reward": 0.33390700817108154,
"reward_std": 0.2303066849708557,
"rewards/<lambda>/mean": 0.33390700817108154,
"rewards/<lambda>/std": 0.4804892838001251,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000317096710205,
"sampling/importance_sampling_ratio/min": 0.06684955954551697,
"sampling/sampling_logp_difference/max": 2.705310583114624,
"sampling/sampling_logp_difference/mean": 0.010464111343026161,
"step": 3,
"step_time": 57.42266962304711
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.32421875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 814.55859375,
"completions/mean_terminated_length": 714.0751342773438,
"completions/min_length": 345.0,
"completions/min_terminated_length": 345.0,
"entropy": 0.11400253418833017,
"epoch": 0.16666666666666666,
"frac_reward_zero_std": 0.109375,
"grad_norm": 0.10569294542074203,
"learning_rate": 9e-06,
"loss": 0.0248,
"num_tokens": 1952593.0,
"reward": 0.4981191158294678,
"reward_std": 0.31772834062576294,
"rewards/<lambda>/mean": 0.498119056224823,
"rewards/<lambda>/std": 0.5063196420669556,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000189542770386,
"sampling/importance_sampling_ratio/min": 0.14644451439380646,
"sampling/sampling_logp_difference/max": 1.9211087226867676,
"sampling/sampling_logp_difference/mean": 0.010197827592492104,
"step": 4,
"step_time": 51.03766080364585
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.388671875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 835.1875,
"completions/mean_terminated_length": 715.1437377929688,
"completions/min_length": 356.0,
"completions/min_terminated_length": 356.0,
"entropy": 0.12000379897654057,
"epoch": 0.20833333333333334,
"frac_reward_zero_std": 0.140625,
"grad_norm": 0.09225241839885712,
"learning_rate": 1.2e-05,
"loss": 0.0245,
"num_tokens": 2437185.0,
"reward": 0.4845890998840332,
"reward_std": 0.2662581205368042,
"rewards/<lambda>/mean": 0.4845891296863556,
"rewards/<lambda>/std": 0.5059504508972168,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999595880508423,
"sampling/importance_sampling_ratio/min": 0.18561066687107086,
"sampling/sampling_logp_difference/max": 1.6841039657592773,
"sampling/sampling_logp_difference/mean": 0.010071037337183952,
"step": 5,
"step_time": 53.15481134876609
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.333984375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1020.0,
"completions/mean_length": 833.9453125,
"completions/mean_terminated_length": 744.50439453125,
"completions/min_length": 357.0,
"completions/min_terminated_length": 357.0,
"entropy": 0.12417639419436455,
"epoch": 0.25,
"frac_reward_zero_std": 0.234375,
"grad_norm": 0.08464518189430237,
"learning_rate": 1.5e-05,
"loss": 0.0223,
"num_tokens": 2924157.0,
"reward": 0.5296170711517334,
"reward_std": 0.2583223283290863,
"rewards/<lambda>/mean": 0.5296170711517334,
"rewards/<lambda>/std": 0.505209743976593,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999397993087769,
"sampling/importance_sampling_ratio/min": 0.00012402510037645698,
"sampling/sampling_logp_difference/max": 8.995026588439941,
"sampling/sampling_logp_difference/mean": 0.010162664577364922,
"step": 6,
"step_time": 62.87784644961357
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.548828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 895.59375,
"completions/mean_terminated_length": 739.3939208984375,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"entropy": 0.17639623675495386,
"epoch": 0.2916666666666667,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.08520859479904175,
"learning_rate": 1.4979992127476638e-05,
"loss": 0.0157,
"num_tokens": 3444285.0,
"reward": 0.2455175518989563,
"reward_std": 0.23690563440322876,
"rewards/<lambda>/mean": 0.2455175369977951,
"rewards/<lambda>/std": 0.4428788125514984,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999836683273315,
"sampling/importance_sampling_ratio/min": 0.039996612817049026,
"sampling/sampling_logp_difference/max": 3.2189605236053467,
"sampling/sampling_logp_difference/mean": 0.013452851213514805,
"step": 7,
"step_time": 57.17111527174711
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.41796875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 863.2109375,
"completions/mean_terminated_length": 747.7449951171875,
"completions/min_length": 358.0,
"completions/min_terminated_length": 358.0,
"entropy": 0.18227362260222435,
"epoch": 0.3333333333333333,
"frac_reward_zero_std": 0.140625,
"grad_norm": 0.10602783411741257,
"learning_rate": 1.4920075260563328e-05,
"loss": 0.0272,
"num_tokens": 3950121.0,
"reward": 0.47941911220550537,
"reward_std": 0.2964603304862976,
"rewards/<lambda>/mean": 0.47941911220550537,
"rewards/<lambda>/std": 0.507050096988678,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999715685844421,
"sampling/importance_sampling_ratio/min": 0.2541903555393219,
"sampling/sampling_logp_difference/max": 1.475466251373291,
"sampling/sampling_logp_difference/mean": 0.013659531250596046,
"step": 8,
"step_time": 55.14558635652065
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.544921875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 901.974609375,
"completions/mean_terminated_length": 757.6094360351562,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"entropy": 0.18943150993436575,
"epoch": 0.375,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.11737405508756638,
"learning_rate": 1.4820569081669455e-05,
"loss": 0.0341,
"num_tokens": 4495428.0,
"reward": 0.42956632375717163,
"reward_std": 0.26905590295791626,
"rewards/<lambda>/mean": 0.42956632375717163,
"rewards/<lambda>/std": 0.5035831928253174,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.999960720539093,
"sampling/importance_sampling_ratio/min": 0.015176300890743732,
"sampling/sampling_logp_difference/max": 4.1880202293396,
"sampling/sampling_logp_difference/mean": 0.01382643636316061,
"step": 9,
"step_time": 59.8278575129807
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 890.86328125,
"completions/mean_terminated_length": 749.1370849609375,
"completions/min_length": 323.0,
"completions/min_terminated_length": 323.0,
"entropy": 0.1795631218701601,
"epoch": 0.4166666666666667,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10624588280916214,
"learning_rate": 1.4682004499313044e-05,
"loss": 0.0192,
"num_tokens": 5042470.0,
"reward": 0.4718334674835205,
"reward_std": 0.19827762246131897,
"rewards/<lambda>/mean": 0.4718334972858429,
"rewards/<lambda>/std": 0.5064524412155151,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999367594718933,
"sampling/importance_sampling_ratio/min": 0.0721684917807579,
"sampling/sampling_logp_difference/max": 2.628751754760742,
"sampling/sampling_logp_difference/mean": 0.01308610662817955,
"step": 10,
"step_time": 60.118371706455946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.654296875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1003.0,
"completions/mean_length": 928.134765625,
"completions/mean_terminated_length": 746.6949462890625,
"completions/min_length": 349.0,
"completions/min_terminated_length": 349.0,
"entropy": 0.17855694890022278,
"epoch": 0.4583333333333333,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.09456736594438553,
"learning_rate": 1.450512081549411e-05,
"loss": 0.0193,
"num_tokens": 5602051.0,
"reward": 0.3620399832725525,
"reward_std": 0.20572692155838013,
"rewards/<lambda>/mean": 0.3620400130748749,
"rewards/<lambda>/std": 0.4884944558143616,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999369382858276,
"sampling/importance_sampling_ratio/min": 0.07273312658071518,
"sampling/sampling_logp_difference/max": 2.6209583282470703,
"sampling/sampling_logp_difference/mean": 0.012828832492232323,
"step": 11,
"step_time": 58.71875632926822
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.853515625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1023.0,
"completions/mean_length": 998.62890625,
"completions/mean_terminated_length": 850.800048828125,
"completions/min_length": 490.0,
"completions/min_terminated_length": 490.0,
"entropy": 0.1622396009042859,
"epoch": 0.5,
"frac_reward_zero_std": 0.015625,
"grad_norm": 0.08454262465238571,
"learning_rate": 1.4290861781198601e-05,
"loss": 0.0114,
"num_tokens": 6176933.0,
"reward": 0.17149432003498077,
"reward_std": 0.19794043898582458,
"rewards/<lambda>/mean": 0.17149433493614197,
"rewards/<lambda>/std": 0.3933511972427368,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9998893737792969,
"sampling/importance_sampling_ratio/min": 0.1942903995513916,
"sampling/sampling_logp_difference/max": 1.6384012699127197,
"sampling/sampling_logp_difference/mean": 0.0115684624761343,
"step": 12,
"step_time": 58.48493871092796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.79296875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 982.77734375,
"completions/mean_terminated_length": 859.4528198242188,
"completions/min_length": 566.0,
"completions/min_terminated_length": 620.0,
"entropy": 0.1731550320982933,
"epoch": 0.5416666666666666,
"frac_reward_zero_std": 0.078125,
"grad_norm": 0.07863683998584747,
"learning_rate": 1.4040370561078558e-05,
"loss": 0.0125,
"num_tokens": 6756187.0,
"reward": 0.25542140007019043,
"reward_std": 0.1804811656475067,
"rewards/<lambda>/mean": 0.25542140007019043,
"rewards/<lambda>/std": 0.44602400064468384,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000680685043335,
"sampling/importance_sampling_ratio/min": 0.02221822179853916,
"sampling/sampling_logp_difference/max": 3.806842565536499,
"sampling/sampling_logp_difference/mean": 0.012337762862443924,
"step": 13,
"step_time": 68.92022440582514
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.884765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 1007.6796875,
"completions/mean_terminated_length": 882.3728637695312,
"completions/min_length": 559.0,
"completions/min_terminated_length": 559.0,
"entropy": 0.17464189883321524,
"epoch": 0.5833333333333334,
"frac_reward_zero_std": 0.046875,
"grad_norm": 0.07638600468635559,
"learning_rate": 1.3754983634174084e-05,
"loss": 0.0054,
"num_tokens": 7335399.0,
"reward": 0.23066657781600952,
"reward_std": 0.20333421230316162,
"rewards/<lambda>/mean": 0.23066657781600952,
"rewards/<lambda>/std": 0.43073543906211853,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999949932098389,
"sampling/importance_sampling_ratio/min": 0.041199441999197006,
"sampling/sampling_logp_difference/max": 3.189330577850342,
"sampling/sampling_logp_difference/mean": 0.012287369929254055,
"step": 14,
"step_time": 59.96094610914588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.87109375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 996.556640625,
"completions/mean_terminated_length": 827.5909423828125,
"completions/min_length": 550.0,
"completions/min_terminated_length": 550.0,
"entropy": 0.17954212613403797,
"epoch": 0.625,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.08646295964717865,
"learning_rate": 1.3436223663219406e-05,
"loss": 0.0049,
"num_tokens": 7913092.0,
"reward": 0.2230292558670044,
"reward_std": 0.19263553619384766,
"rewards/<lambda>/mean": 0.2230292558670044,
"rewards/<lambda>/std": 0.42564159631729126,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999350309371948,
"sampling/importance_sampling_ratio/min": 0.011985468678176403,
"sampling/sampling_logp_difference/max": 4.424060344696045,
"sampling/sampling_logp_difference/mean": 0.01234557293355465,
"step": 15,
"step_time": 64.32212274521589
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.701171875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1018.0,
"completions/mean_length": 949.470703125,
"completions/mean_terminated_length": 795.2483520507812,
"completions/min_length": 301.0,
"completions/min_terminated_length": 301.0,
"entropy": 0.17093131132423878,
"epoch": 0.6666666666666666,
"frac_reward_zero_std": 0.140625,
"grad_norm": 0.06887029111385345,
"learning_rate": 1.3085791370578364e-05,
"loss": 0.0115,
"num_tokens": 8462493.0,
"reward": 0.4387291669845581,
"reward_std": 0.2984377145767212,
"rewards/<lambda>/mean": 0.4387291967868805,
"rewards/<lambda>/std": 0.5014151334762573,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999709129333496,
"sampling/importance_sampling_ratio/min": 0.11177696287631989,
"sampling/sampling_logp_difference/max": 2.1912498474121094,
"sampling/sampling_logp_difference/mean": 0.011749806813895702,
"step": 16,
"step_time": 75.07067326828837
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.544921875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 919.498046875,
"completions/mean_terminated_length": 794.3648071289062,
"completions/min_length": 303.0,
"completions/min_terminated_length": 303.0,
"entropy": 0.16775457188487053,
"epoch": 0.7083333333333334,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.04024207964539528,
"learning_rate": 1.2705556464154755e-05,
"loss": 0.0184,
"num_tokens": 8985980.0,
"reward": 0.6928114295005798,
"reward_std": 0.2674423158168793,
"rewards/<lambda>/mean": 0.6928114295005798,
"rewards/<lambda>/std": 0.46451425552368164,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.000028371810913,
"sampling/importance_sampling_ratio/min": 0.008241385221481323,
"sampling/sampling_logp_difference/max": 4.798586845397949,
"sampling/sampling_logp_difference/mean": 0.011188083328306675,
"step": 17,
"step_time": 53.734655763953924
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 880.052734375,
"completions/mean_terminated_length": 753.0404663085938,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.173905773088336,
"epoch": 0.75,
"frac_reward_zero_std": 0.234375,
"grad_norm": 0.08793645352125168,
"learning_rate": 1.2297547661691685e-05,
"loss": 0.0361,
"num_tokens": 9487111.0,
"reward": 0.6395045518875122,
"reward_std": 0.28389185667037964,
"rewards/<lambda>/mean": 0.6395045518875122,
"rewards/<lambda>/std": 0.48384764790534973,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.000001072883606,
"sampling/importance_sampling_ratio/min": 0.13127455115318298,
"sampling/sampling_logp_difference/max": 2.0304644107818604,
"sampling/sampling_logp_difference/mean": 0.01144577655941248,
"step": 18,
"step_time": 52.49561759829521
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.30859375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1018.0,
"completions/mean_length": 767.435546875,
"completions/mean_terminated_length": 653.4661254882812,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"entropy": 0.1821697074919939,
"epoch": 0.7916666666666666,
"frac_reward_zero_std": 0.484375,
"grad_norm": 0.10041403770446777,
"learning_rate": 1.1863941866684647e-05,
"loss": 0.0314,
"num_tokens": 9947318.0,
"reward": 0.7617002725601196,
"reward_std": 0.20842789113521576,
"rewards/<lambda>/mean": 0.7617002725601196,
"rewards/<lambda>/std": 0.4287981688976288,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9998761415481567,
"sampling/importance_sampling_ratio/min": 0.0028160586953163147,
"sampling/sampling_logp_difference/max": 5.872416973114014,
"sampling/sampling_logp_difference/mean": 0.012468406930565834,
"step": 19,
"step_time": 58.54368192702532
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1023.0,
"completions/mean_length": 649.126953125,
"completions/mean_terminated_length": 565.0240478515625,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"entropy": 0.18335528578609228,
"epoch": 0.8333333333333334,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.041842591017484665,
"learning_rate": 1.1407052553659478e-05,
"loss": 0.0393,
"num_tokens": 10346455.0,
"reward": 0.7931854724884033,
"reward_std": 0.1657945215702057,
"rewards/<lambda>/mean": 0.7931854724884033,
"rewards/<lambda>/std": 0.4075835645198822,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999485611915588,
"sampling/importance_sampling_ratio/min": 0.001173140830360353,
"sampling/sampling_logp_difference/max": 6.74807071685791,
"sampling/sampling_logp_difference/mean": 0.012672360055148602,
"step": 20,
"step_time": 53.72001050412655
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.255859375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 696.78515625,
"completions/mean_terminated_length": 584.2781982421875,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.18723426572978497,
"epoch": 0.875,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.08443084359169006,
"learning_rate": 1.0929317424784789e-05,
"loss": 0.0329,
"num_tokens": 10777497.0,
"reward": 0.6964685320854187,
"reward_std": 0.2045532912015915,
"rewards/<lambda>/mean": 0.6964685320854187,
"rewards/<lambda>/std": 0.46325141191482544,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999812841415405,
"sampling/importance_sampling_ratio/min": 0.24296453595161438,
"sampling/sampling_logp_difference/max": 1.885258436203003,
"sampling/sampling_logp_difference/mean": 0.013141268864274025,
"step": 21,
"step_time": 50.97019802033901
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2109375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 689.134765625,
"completions/mean_terminated_length": 600.2401123046875,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"entropy": 0.18722887616604567,
"epoch": 0.9166666666666666,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.07437894493341446,
"learning_rate": 1.043328540367617e-05,
"loss": 0.0363,
"num_tokens": 11194494.0,
"reward": 0.7501273155212402,
"reward_std": 0.20716464519500732,
"rewards/<lambda>/mean": 0.7501273155212402,
"rewards/<lambda>/std": 0.43548887968063354,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000048875808716,
"sampling/importance_sampling_ratio/min": 0.044193509966135025,
"sampling/sampling_logp_difference/max": 3.1191773414611816,
"sampling/sampling_logp_difference/mean": 0.012832986190915108,
"step": 22,
"step_time": 52.78209077939391
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.595703125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1017.0,
"completions/mean_length": 874.330078125,
"completions/mean_terminated_length": 653.8019409179688,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.21719548478722572,
"epoch": 0.9583333333333334,
"frac_reward_zero_std": 0.203125,
"grad_norm": 0.07946697622537613,
"learning_rate": 9.921603035785846e-06,
"loss": 0.029,
"num_tokens": 11711007.0,
"reward": 0.41294676065444946,
"reward_std": 0.21486328542232513,
"rewards/<lambda>/mean": 0.41294676065444946,
"rewards/<lambda>/std": 0.4979855418205261,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000637769699097,
"sampling/importance_sampling_ratio/min": 0.003101334674283862,
"sampling/sampling_logp_difference/max": 5.775922775268555,
"sampling/sampling_logp_difference/mean": 0.01419367827475071,
"step": 23,
"step_time": 48.530132196843624
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.7109375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1016.0,
"completions/mean_length": 942.05859375,
"completions/mean_terminated_length": 746.4729614257812,
"completions/min_length": 380.0,
"completions/min_terminated_length": 380.0,
"entropy": 0.2433080393821001,
"epoch": 1.0,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.08330480009317398,
"learning_rate": 9.397000367937605e-06,
"loss": 0.0218,
"num_tokens": 12278821.0,
"reward": 0.2651611268520355,
"reward_std": 0.17255176603794098,
"rewards/<lambda>/mean": 0.2651611268520355,
"rewards/<lambda>/std": 0.44904500246047974,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000908374786377,
"sampling/importance_sampling_ratio/min": 0.04702044650912285,
"sampling/sampling_logp_difference/max": 3.0571727752685547,
"sampling/sampling_logp_difference/mean": 0.015090242959558964,
"step": 24,
"step_time": 61.07411051169038
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.244140625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1019.0,
"completions/mean_length": 738.345703125,
"completions/mean_terminated_length": 646.0801391601562,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"entropy": 0.20082950219511986,
"epoch": 1.0416666666666667,
"frac_reward_zero_std": 0.421875,
"grad_norm": 0.09732793271541595,
"learning_rate": 8.862276382345772e-06,
"loss": 0.0474,
"num_tokens": 12712510.0,
"reward": 0.7737147808074951,
"reward_std": 0.24896946549415588,
"rewards/<lambda>/mean": 0.7737147212028503,
"rewards/<lambda>/std": 0.42085471749305725,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999430179595947,
"sampling/importance_sampling_ratio/min": 0.002486642450094223,
"sampling/sampling_logp_difference/max": 5.996821880340576,
"sampling/sampling_logp_difference/mean": 0.013192282989621162,
"step": 25,
"step_time": 51.378925789147615
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.533203125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 865.376953125,
"completions/mean_terminated_length": 684.1882934570312,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"entropy": 0.1902909567579627,
"epoch": 1.0833333333333333,
"frac_reward_zero_std": 0.359375,
"grad_norm": 0.07519173622131348,
"learning_rate": 8.32028406283406e-06,
"loss": 0.0145,
"num_tokens": 13210727.0,
"reward": 0.7598530054092407,
"reward_std": 0.2680090367794037,
"rewards/<lambda>/mean": 0.7598530054092407,
"rewards/<lambda>/std": 0.42980262637138367,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999756813049316,
"sampling/importance_sampling_ratio/min": 0.06907939910888672,
"sampling/sampling_logp_difference/max": 2.6724987030029297,
"sampling/sampling_logp_difference/mean": 0.01252746395766735,
"step": 26,
"step_time": 54.474919099360704
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8515625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 978.05078125,
"completions/mean_terminated_length": 714.4473876953125,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"entropy": 0.18582966551184654,
"epoch": 1.125,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0853697806596756,
"learning_rate": 7.773915172932443e-06,
"loss": 0.0036,
"num_tokens": 13774793.0,
"reward": 0.7163721919059753,
"reward_std": 0.2866782546043396,
"rewards/<lambda>/mean": 0.7163721919059753,
"rewards/<lambda>/std": 0.45386844873428345,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999269247055054,
"sampling/importance_sampling_ratio/min": 0.03673957660794258,
"sampling/sampling_logp_difference/max": 3.303900718688965,
"sampling/sampling_logp_difference/mean": 0.011891147121787071,
"step": 27,
"step_time": 59.99074776098132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.873046875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 985.720703125,
"completions/mean_terminated_length": 722.4769287109375,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"entropy": 0.17072805669158697,
"epoch": 1.1666666666666667,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.06768080592155457,
"learning_rate": 7.226084827067558e-06,
"loss": 0.0008,
"num_tokens": 14334522.0,
"reward": 0.8005068302154541,
"reward_std": 0.21028977632522583,
"rewards/<lambda>/mean": 0.8005068302154541,
"rewards/<lambda>/std": 0.40284714102745056,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999420642852783,
"sampling/importance_sampling_ratio/min": 0.0021877989638596773,
"sampling/sampling_logp_difference/max": 6.12485933303833,
"sampling/sampling_logp_difference/mean": 0.01087690144777298,
"step": 28,
"step_time": 54.91908521205187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.974609375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 955.0,
"completions/mean_length": 1017.001953125,
"completions/mean_terminated_length": 748.3846435546875,
"completions/min_length": 463.0,
"completions/min_terminated_length": 463.0,
"entropy": 0.18543414678424597,
"epoch": 1.2083333333333333,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.08286742866039276,
"learning_rate": 6.679715937165944e-06,
"loss": -0.0,
"num_tokens": 14912203.0,
"reward": 0.6823173761367798,
"reward_std": 0.28883033990859985,
"rewards/<lambda>/mean": 0.6823173761367798,
"rewards/<lambda>/std": 0.4695548117160797,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000499486923218,
"sampling/importance_sampling_ratio/min": 0.010657834820449352,
"sampling/sampling_logp_difference/max": 4.541460037231445,
"sampling/sampling_logp_difference/mean": 0.01162964478135109,
"step": 29,
"step_time": 58.209084182977676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1020.09375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 774.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1811871463432908,
"epoch": 1.25,
"frac_reward_zero_std": 0.359375,
"grad_norm": 0.06104443594813347,
"learning_rate": 6.137723617654227e-06,
"loss": 0.0,
"num_tokens": 15494483.0,
"reward": 0.7511924505233765,
"reward_std": 0.2601562440395355,
"rewards/<lambda>/mean": 0.7511924505233765,
"rewards/<lambda>/std": 0.4359346628189087,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000349283218384,
"sampling/importance_sampling_ratio/min": 6.476168437075103e-07,
"sampling/sampling_logp_difference/max": 14.249966621398926,
"sampling/sampling_logp_difference/mean": 0.011373220011591911,
"step": 30,
"step_time": 67.36348918452859
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.238006591796875,
"epoch": 1.2916666666666667,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.08633699268102646,
"learning_rate": 5.602999632062394e-06,
"loss": 0.0,
"num_tokens": 16080355.0,
"reward": 0.4735250473022461,
"reward_std": 0.24217532575130463,
"rewards/<lambda>/mean": 0.4735250473022461,
"rewards/<lambda>/std": 0.5048364400863647,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999916553497314,
"sampling/importance_sampling_ratio/min": 0.0006342114065773785,
"sampling/sampling_logp_difference/max": 7.363128185272217,
"sampling/sampling_logp_difference/mean": 0.013966077007353306,
"step": 31,
"step_time": 59.991970762610435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2205047607421875,
"epoch": 1.3333333333333333,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.06900659203529358,
"learning_rate": 5.078396964214155e-06,
"loss": 0.0001,
"num_tokens": 16668515.0,
"reward": 0.5699542760848999,
"reward_std": 0.29703187942504883,
"rewards/<lambda>/mean": 0.5699542760848999,
"rewards/<lambda>/std": 0.5019528865814209,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000271797180176,
"sampling/importance_sampling_ratio/min": 0.0721689909696579,
"sampling/sampling_logp_difference/max": 2.6287448406219482,
"sampling/sampling_logp_difference/mean": 0.01299051009118557,
"step": 32,
"step_time": 58.6862215436995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1023.203125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 973.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.23412521183490753,
"epoch": 1.375,
"frac_reward_zero_std": 0.140625,
"grad_norm": 0.08849670737981796,
"learning_rate": 4.566714596323831e-06,
"loss": 0.0,
"num_tokens": 17275891.0,
"reward": 0.45683372020721436,
"reward_std": 0.2977556884288788,
"rewards/<lambda>/mean": 0.45683372020721436,
"rewards/<lambda>/std": 0.5047717094421387,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000598430633545,
"sampling/importance_sampling_ratio/min": 0.05894443392753601,
"sampling/sampling_logp_difference/max": 2.831160068511963,
"sampling/sampling_logp_difference/mean": 0.01418858952820301,
"step": 33,
"step_time": 62.67323864623904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.998046875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 1022.86328125,
"completions/mean_terminated_length": 442.0,
"completions/min_length": 442.0,
"completions/min_terminated_length": 442.0,
"entropy": 0.22855819575488567,
"epoch": 1.4166666666666667,
"frac_reward_zero_std": 0.140625,
"grad_norm": 0.059393007308244705,
"learning_rate": 4.0706825752152114e-06,
"loss": -0.0,
"num_tokens": 17890517.0,
"reward": 0.4321480095386505,
"reward_std": 0.22312432527542114,
"rewards/<lambda>/mean": 0.4321480393409729,
"rewards/<lambda>/std": 0.5013213157653809,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000094175338745,
"sampling/importance_sampling_ratio/min": 0.05853661522269249,
"sampling/sampling_logp_difference/max": 2.8381028175354004,
"sampling/sampling_logp_difference/mean": 0.013826340436935425,
"step": 34,
"step_time": 62.6375826895237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2439422607421875,
"epoch": 1.4583333333333333,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.08650114387273788,
"learning_rate": 3.592947446340524e-06,
"loss": -0.0001,
"num_tokens": 18499181.0,
"reward": 0.36806878447532654,
"reward_std": 0.20019319653511047,
"rewards/<lambda>/mean": 0.36806878447532654,
"rewards/<lambda>/std": 0.4879459738731384,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999986886978149,
"sampling/importance_sampling_ratio/min": 0.15711112320423126,
"sampling/sampling_logp_difference/max": 1.850801944732666,
"sampling/sampling_logp_difference/mean": 0.014828124083578587,
"step": 35,
"step_time": 61.840191546827555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2600250244140625,
"epoch": 1.5,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.12246455252170563,
"learning_rate": 3.136058133315355e-06,
"loss": -0.0,
"num_tokens": 19087053.0,
"reward": 0.3060392141342163,
"reward_std": 0.1883051097393036,
"rewards/<lambda>/mean": 0.3060392141342163,
"rewards/<lambda>/std": 0.4683455228805542,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9998786449432373,
"sampling/importance_sampling_ratio/min": 0.0640513002872467,
"sampling/sampling_logp_difference/max": 2.7480709552764893,
"sampling/sampling_logp_difference/mean": 0.015524221584200859,
"step": 36,
"step_time": 59.04825992509723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1016.84375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 566.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2629384808242321,
"epoch": 1.5416666666666665,
"frac_reward_zero_std": 0.046875,
"grad_norm": 0.09859054535627365,
"learning_rate": 2.702452338308317e-06,
"loss": -0.0,
"num_tokens": 19683749.0,
"reward": 0.21034839749336243,
"reward_std": 0.16528862714767456,
"rewards/<lambda>/mean": 0.21034839749336243,
"rewards/<lambda>/std": 0.4158807694911957,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999657869338989,
"sampling/importance_sampling_ratio/min": 0.007038592826575041,
"sampling/sampling_logp_difference/max": 4.9563469886779785,
"sampling/sampling_logp_difference/mean": 0.015743985772132874,
"step": 37,
"step_time": 69.43658219277859
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.257904052734375,
"epoch": 1.5833333333333335,
"frac_reward_zero_std": 0.046875,
"grad_norm": 0.0776461511850357,
"learning_rate": 2.2944435358452453e-06,
"loss": -0.0001,
"num_tokens": 20271317.0,
"reward": 0.2907390892505646,
"reward_std": 0.2345331609249115,
"rewards/<lambda>/mean": 0.2907390892505646,
"rewards/<lambda>/std": 0.4613417983055115,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999305009841919,
"sampling/importance_sampling_ratio/min": 0.1541903167963028,
"sampling/sampling_logp_difference/max": 1.869567632675171,
"sampling/sampling_logp_difference/mean": 0.01533452421426773,
"step": 38,
"step_time": 60.22879173234105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1021.875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 888.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.259298924356699,
"epoch": 1.625,
"frac_reward_zero_std": 0.015625,
"grad_norm": 0.08768095821142197,
"learning_rate": 1.914208629421636e-06,
"loss": 0.0,
"num_tokens": 20861973.0,
"reward": 0.1866101324558258,
"reward_std": 0.23392625153064728,
"rewards/<lambda>/mean": 0.1866101324558258,
"rewards/<lambda>/std": 0.3986544907093048,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999127388000488,
"sampling/importance_sampling_ratio/min": 9.618316809678618e-09,
"sampling/sampling_logp_difference/max": 18.459596633911133,
"sampling/sampling_logp_difference/mean": 0.015485625714063644,
"step": 39,
"step_time": 64.56263257935643
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1017.828125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 629.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.22701962105929852,
"epoch": 1.6666666666666665,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.14714667201042175,
"learning_rate": 1.563776336780595e-06,
"loss": -0.0,
"num_tokens": 21446373.0,
"reward": 0.3874555230140686,
"reward_std": 0.3079039454460144,
"rewards/<lambda>/mean": 0.3874555230140686,
"rewards/<lambda>/std": 0.49295976758003235,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999634027481079,
"sampling/importance_sampling_ratio/min": 0.007032718509435654,
"sampling/sampling_logp_difference/max": 4.957181930541992,
"sampling/sampling_logp_difference/mean": 0.013643273152410984,
"step": 40,
"step_time": 70.35990770533681
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.20330810546875,
"epoch": 1.7083333333333335,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.050596319139003754,
"learning_rate": 1.2450163658259165e-06,
"loss": -0.0001,
"num_tokens": 22023365.0,
"reward": 0.5747976303100586,
"reward_std": 0.23263724148273468,
"rewards/<lambda>/mean": 0.5747976303100586,
"rewards/<lambda>/std": 0.49826157093048096,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000696182250977,
"sampling/importance_sampling_ratio/min": 0.1340048760175705,
"sampling/sampling_logp_difference/max": 2.0098791122436523,
"sampling/sampling_logp_difference/mean": 0.012190128676593304,
"step": 41,
"step_time": 56.359509252011776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.20894622802734375,
"epoch": 1.75,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.06408947706222534,
"learning_rate": 9.596294389214422e-07,
"loss": 0.0,
"num_tokens": 22598197.0,
"reward": 0.4889833331108093,
"reward_std": 0.3267000913619995,
"rewards/<lambda>/mean": 0.48898330330848694,
"rewards/<lambda>/std": 0.5056021213531494,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999138116836548,
"sampling/importance_sampling_ratio/min": 0.01976090297102928,
"sampling/sampling_logp_difference/max": 3.9240498542785645,
"sampling/sampling_logp_difference/mean": 0.012254208326339722,
"step": 42,
"step_time": 55.88660566881299
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1023.625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1000.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.18526224978268147,
"epoch": 1.7916666666666665,
"frac_reward_zero_std": 0.421875,
"grad_norm": 0.05824067071080208,
"learning_rate": 7.091382188014004e-07,
"loss": 0.0,
"num_tokens": 23189573.0,
"reward": 0.6663753986358643,
"reward_std": 0.1958390325307846,
"rewards/<lambda>/mean": 0.6663753986358643,
"rewards/<lambda>/std": 0.4757937490940094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000879764556885,
"sampling/importance_sampling_ratio/min": 0.02108073979616165,
"sampling/sampling_logp_difference/max": 3.8593955039978027,
"sampling/sampling_logp_difference/mean": 0.011367494240403175,
"step": 43,
"step_time": 61.6868560872972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1021.765625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 881.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1563689550384879,
"epoch": 1.8333333333333335,
"frac_reward_zero_std": 0.53125,
"grad_norm": 0.05948049575090408,
"learning_rate": 4.948791845058906e-07,
"loss": 0.0,
"num_tokens": 23779501.0,
"reward": 0.7701754570007324,
"reward_std": 0.15122410655021667,
"rewards/<lambda>/mean": 0.7701754570007324,
"rewards/<lambda>/std": 0.4250968396663666,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999828338623047,
"sampling/importance_sampling_ratio/min": 0.0004995743511244655,
"sampling/sampling_logp_difference/max": 7.601754188537598,
"sampling/sampling_logp_difference/mean": 0.009836211800575256,
"step": 44,
"step_time": 64.21834829077125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.998046875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 692.0,
"completions/mean_length": 1023.3515625,
"completions/mean_terminated_length": 692.0,
"completions/min_length": 692.0,
"completions/min_terminated_length": 692.0,
"entropy": 0.1624470753595233,
"epoch": 1.875,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.06368126720190048,
"learning_rate": 3.1799550068695616e-07,
"loss": 0.0004,
"num_tokens": 24377745.0,
"reward": 0.645195484161377,
"reward_std": 0.16846325993537903,
"rewards/<lambda>/mean": 0.645195484161377,
"rewards/<lambda>/std": 0.48446959257125854,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0000487565994263,
"sampling/importance_sampling_ratio/min": 0.11990867555141449,
"sampling/sampling_logp_difference/max": 2.1210248470306396,
"sampling/sampling_logp_difference/mean": 0.010421509854495525,
"step": 45,
"step_time": 61.393697403371334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1022.03125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 898.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.16149251069873571,
"epoch": 1.9166666666666665,
"frac_reward_zero_std": 0.390625,
"grad_norm": 0.06683196127414703,
"learning_rate": 1.7943091833054704e-07,
"loss": 0.0,
"num_tokens": 24965185.0,
"reward": 0.7338901162147522,
"reward_std": 0.23456785082817078,
"rewards/<lambda>/mean": 0.7338901162147522,
"rewards/<lambda>/std": 0.44744160771369934,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999337196350098,
"sampling/importance_sampling_ratio/min": 0.07218174636363983,
"sampling/sampling_logp_difference/max": 2.628568172454834,
"sampling/sampling_logp_difference/mean": 0.010157747194170952,
"step": 46,
"step_time": 64.5227914750576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1024.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.22264862060546875,
"epoch": 1.9583333333333335,
"frac_reward_zero_std": 0.203125,
"grad_norm": 0.0897730141878128,
"learning_rate": 7.992473943667311e-08,
"loss": 0.0,
"num_tokens": 25558329.0,
"reward": 0.4105945825576782,
"reward_std": 0.22466689348220825,
"rewards/<lambda>/mean": 0.4105945825576782,
"rewards/<lambda>/std": 0.49798643589019775,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9999638199806213,
"sampling/importance_sampling_ratio/min": 0.060054682195186615,
"sampling/sampling_logp_difference/max": 2.812499761581421,
"sampling/sampling_logp_difference/mean": 0.013261111453175545,
"step": 47,
"step_time": 59.11356810852885
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 1022.28125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 914.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2494364334270358,
"epoch": 2.0,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.06464344263076782,
"learning_rate": 2.0007872523362668e-08,
"loss": -0.0,
"num_tokens": 26167217.0,
"reward": 0.31925156712532043,
"reward_std": 0.1734967827796936,
"rewards/<lambda>/mean": 0.31925156712532043,
"rewards/<lambda>/std": 0.4741312563419342,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.000075340270996,
"sampling/importance_sampling_ratio/min": 0.011130619794130325,
"sampling/sampling_logp_difference/max": 4.498055458068848,
"sampling/sampling_logp_difference/mean": 0.014584287069737911,
"step": 48,
"step_time": 63.68918735533953
},
{
"epoch": 2.0,
"step": 48,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 2.496,
"train_samples_per_second": 1275.628,
"train_steps_per_second": 19.231
}
],
"logging_steps": 1,
"max_steps": 48,
"num_input_tokens_seen": 26167217,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}