ToolPRM-GRPO-v3 / trainer_state.json
wjldw's picture
Upload folder using huggingface_hub
c333ac3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 411,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 3070.0,
"completions/max_terminated_length": 2896.7,
"completions/mean_length": 1017.178125,
"completions/mean_terminated_length": 988.4677978515625,
"completions/min_length": 239.7,
"completions/min_terminated_length": 239.7,
"entropy": 0.3605123937129974,
"epoch": 0.072992700729927,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.6796875,
"learning_rate": 9.78102189781022e-06,
"loss": -0.0091,
"num_tokens": 1212849.0,
"reward": 0.434375,
"reward_std": 0.15717875137925147,
"rewards/qwen_accuracy_reward/mean": 0.434375,
"rewards/qwen_accuracy_reward/std": 0.46030205190181733,
"step": 10,
"step_time": 106.30040930798278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.071875,
"completions/max_length": 3382.3,
"completions/max_terminated_length": 3000.2,
"completions/mean_length": 1388.7375,
"completions/mean_terminated_length": 1186.2124328613281,
"completions/min_length": 297.7,
"completions/min_terminated_length": 297.7,
"entropy": 0.4167850613594055,
"epoch": 0.145985401459854,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.046875,
"learning_rate": 9.537712895377129e-06,
"loss": 0.0608,
"num_tokens": 2697493.0,
"reward": 0.478125,
"reward_std": 0.27870663031935694,
"rewards/qwen_accuracy_reward/mean": 0.478125,
"rewards/qwen_accuracy_reward/std": 0.4272594749927521,
"step": 20,
"step_time": 116.88615104537458
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.053125,
"completions/max_length": 3308.6,
"completions/max_terminated_length": 3243.7,
"completions/mean_length": 1304.284375,
"completions/mean_terminated_length": 1165.7695190429688,
"completions/min_length": 270.6,
"completions/min_terminated_length": 270.6,
"entropy": 0.4009901225566864,
"epoch": 0.21897810218978103,
"frac_reward_zero_std": 0.55,
"grad_norm": 1.875,
"learning_rate": 9.294403892944039e-06,
"loss": 0.0254,
"num_tokens": 4025352.0,
"reward": 0.471875,
"reward_std": 0.19138479307293893,
"rewards/qwen_accuracy_reward/mean": 0.471875,
"rewards/qwen_accuracy_reward/std": 0.48829147815704343,
"step": 30,
"step_time": 106.55014775730669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.040625,
"completions/max_length": 3444.4,
"completions/max_terminated_length": 2867.6,
"completions/mean_length": 1196.390625,
"completions/mean_terminated_length": 1088.1075134277344,
"completions/min_length": 288.2,
"completions/min_terminated_length": 288.2,
"entropy": 0.4015911310911179,
"epoch": 0.291970802919708,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.53515625,
"learning_rate": 9.05109489051095e-06,
"loss": 0.0434,
"num_tokens": 5191093.0,
"reward": 0.521875,
"reward_std": 0.18105824217200278,
"rewards/qwen_accuracy_reward/mean": 0.521875,
"rewards/qwen_accuracy_reward/std": 0.48740494847297666,
"step": 40,
"step_time": 116.13580646244809
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 3239.4,
"completions/max_terminated_length": 3025.9,
"completions/mean_length": 1166.659375,
"completions/mean_terminated_length": 1098.1408386230469,
"completions/min_length": 246.9,
"completions/min_terminated_length": 246.9,
"entropy": 0.4154060840606689,
"epoch": 0.36496350364963503,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.71484375,
"learning_rate": 8.80778588807786e-06,
"loss": 0.0252,
"num_tokens": 6289576.0,
"reward": 0.56875,
"reward_std": 0.11836256608366966,
"rewards/qwen_accuracy_reward/mean": 0.56875,
"rewards/qwen_accuracy_reward/std": 0.47113593220710753,
"step": 50,
"step_time": 102.3454062897712
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2896.0,
"completions/max_terminated_length": 2697.2,
"completions/mean_length": 1067.15,
"completions/mean_terminated_length": 1040.172314453125,
"completions/min_length": 278.2,
"completions/min_terminated_length": 278.2,
"entropy": 0.4182083398103714,
"epoch": 0.43795620437956206,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.0,
"learning_rate": 8.56447688564477e-06,
"loss": -0.0045,
"num_tokens": 7485720.0,
"reward": 0.46875,
"reward_std": 0.14403236508369446,
"rewards/qwen_accuracy_reward/mean": 0.46875,
"rewards/qwen_accuracy_reward/std": 0.4563746154308319,
"step": 60,
"step_time": 103.93981777941808
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2650.0,
"completions/max_terminated_length": 2549.8,
"completions/mean_length": 948.559375,
"completions/mean_terminated_length": 924.6193481445313,
"completions/min_length": 209.7,
"completions/min_terminated_length": 209.7,
"entropy": 0.34950864464044573,
"epoch": 0.5109489051094891,
"frac_reward_zero_std": 0.65,
"grad_norm": 0.0,
"learning_rate": 8.32116788321168e-06,
"loss": 0.016,
"num_tokens": 8730603.0,
"reward": 0.528125,
"reward_std": 0.1587614081799984,
"rewards/qwen_accuracy_reward/mean": 0.528125,
"rewards/qwen_accuracy_reward/std": 0.4472527623176575,
"step": 70,
"step_time": 90.1208279568702
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2975.0,
"completions/max_terminated_length": 2827.8,
"completions/mean_length": 988.715625,
"completions/mean_terminated_length": 980.0193481445312,
"completions/min_length": 225.9,
"completions/min_terminated_length": 225.9,
"entropy": 0.37495362758636475,
"epoch": 0.583941605839416,
"frac_reward_zero_std": 0.6,
"grad_norm": 1.234375,
"learning_rate": 8.07785888077859e-06,
"loss": -0.021,
"num_tokens": 9949824.0,
"reward": 0.521875,
"reward_std": 0.18053897097706795,
"rewards/qwen_accuracy_reward/mean": 0.521875,
"rewards/qwen_accuracy_reward/std": 0.4544884204864502,
"step": 80,
"step_time": 97.22130602933467
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2843.5,
"completions/max_terminated_length": 2426.7,
"completions/mean_length": 921.684375,
"completions/mean_terminated_length": 873.637158203125,
"completions/min_length": 278.9,
"completions/min_terminated_length": 278.9,
"entropy": 0.37491864860057833,
"epoch": 0.656934306569343,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.546875,
"learning_rate": 7.8345498783455e-06,
"loss": 0.0557,
"num_tokens": 11177307.0,
"reward": 0.5625,
"reward_std": 0.1712738409638405,
"rewards/qwen_accuracy_reward/mean": 0.5625,
"rewards/qwen_accuracy_reward/std": 0.46112917065620423,
"step": 90,
"step_time": 95.75766938729211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2333.1,
"completions/max_terminated_length": 2311.9,
"completions/mean_length": 859.08125,
"completions/mean_terminated_length": 811.47939453125,
"completions/min_length": 218.5,
"completions/min_terminated_length": 218.5,
"entropy": 0.36555847227573396,
"epoch": 0.7299270072992701,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.96875,
"learning_rate": 7.591240875912409e-06,
"loss": -0.002,
"num_tokens": 12404869.0,
"reward": 0.440625,
"reward_std": 0.14087215512990953,
"rewards/qwen_accuracy_reward/mean": 0.440625,
"rewards/qwen_accuracy_reward/std": 0.4636432766914368,
"step": 100,
"step_time": 82.00954903159291
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 3052.7,
"completions/max_terminated_length": 2857.8,
"completions/mean_length": 1005.96875,
"completions/mean_terminated_length": 977.4821411132813,
"completions/min_length": 276.6,
"completions/min_terminated_length": 276.6,
"entropy": 0.36328954696655275,
"epoch": 0.8029197080291971,
"frac_reward_zero_std": 0.675,
"grad_norm": 1.0546875,
"learning_rate": 7.347931873479319e-06,
"loss": 0.0546,
"num_tokens": 13642787.0,
"reward": 0.66875,
"reward_std": 0.1395061768591404,
"rewards/qwen_accuracy_reward/mean": 0.66875,
"rewards/qwen_accuracy_reward/std": 0.40047125071287154,
"step": 110,
"step_time": 102.26997727015987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 3162.9,
"completions/max_terminated_length": 2987.6,
"completions/mean_length": 1102.9875,
"completions/mean_terminated_length": 1069.5874267578124,
"completions/min_length": 304.1,
"completions/min_terminated_length": 304.1,
"entropy": 0.35163818299770355,
"epoch": 0.8759124087591241,
"frac_reward_zero_std": 0.575,
"grad_norm": 0.84375,
"learning_rate": 7.1046228710462296e-06,
"loss": 0.0128,
"num_tokens": 14774175.0,
"reward": 0.534375,
"reward_std": 0.18348564356565475,
"rewards/qwen_accuracy_reward/mean": 0.534375,
"rewards/qwen_accuracy_reward/std": 0.4590408980846405,
"step": 120,
"step_time": 97.75387887172401
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2885.8,
"completions/max_terminated_length": 2841.5,
"completions/mean_length": 1009.309375,
"completions/mean_terminated_length": 981.8988159179687,
"completions/min_length": 288.1,
"completions/min_terminated_length": 288.1,
"entropy": 0.34102891981601713,
"epoch": 0.948905109489051,
"frac_reward_zero_std": 0.7,
"grad_norm": 0.82421875,
"learning_rate": 6.86131386861314e-06,
"loss": -0.0044,
"num_tokens": 15966530.0,
"reward": 0.70625,
"reward_std": 0.14433300495147705,
"rewards/qwen_accuracy_reward/mean": 0.70625,
"rewards/qwen_accuracy_reward/std": 0.411708801984787,
"step": 130,
"step_time": 103.35281996680423
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04375,
"completions/max_length": 2664.1,
"completions/max_terminated_length": 2568.1,
"completions/mean_length": 1134.346875,
"completions/mean_terminated_length": 1014.4972045898437,
"completions/min_length": 318.0,
"completions/min_terminated_length": 318.0,
"entropy": 0.33926538228988645,
"epoch": 1.0218978102189782,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.0,
"learning_rate": 6.618004866180049e-06,
"loss": 0.0148,
"num_tokens": 17214129.0,
"reward": 0.540625,
"reward_std": 0.1648663252592087,
"rewards/qwen_accuracy_reward/mean": 0.540625,
"rewards/qwen_accuracy_reward/std": 0.45352436900138854,
"step": 140,
"step_time": 91.66583738289773
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 2898.0,
"completions/max_terminated_length": 2755.6,
"completions/mean_length": 1108.35625,
"completions/mean_terminated_length": 1091.26259765625,
"completions/min_length": 349.7,
"completions/min_terminated_length": 349.7,
"entropy": 0.33356466293334963,
"epoch": 1.094890510948905,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.734375,
"learning_rate": 6.3746958637469595e-06,
"loss": 0.0161,
"num_tokens": 18739043.0,
"reward": 0.65,
"reward_std": 0.14624504819512368,
"rewards/qwen_accuracy_reward/mean": 0.65,
"rewards/qwen_accuracy_reward/std": 0.4041076198220253,
"step": 150,
"step_time": 111.4213294208981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05,
"completions/max_length": 2954.3,
"completions/max_terminated_length": 2814.9,
"completions/mean_length": 1294.83125,
"completions/mean_terminated_length": 1166.3035034179688,
"completions/min_length": 401.2,
"completions/min_terminated_length": 401.2,
"entropy": 0.36098510324954985,
"epoch": 1.167883211678832,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.0,
"learning_rate": 6.13138686131387e-06,
"loss": 0.0291,
"num_tokens": 20131237.0,
"reward": 0.71875,
"reward_std": 0.14718669205904006,
"rewards/qwen_accuracy_reward/mean": 0.71875,
"rewards/qwen_accuracy_reward/std": 0.3977723315358162,
"step": 160,
"step_time": 100.20992619153112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 2856.4,
"completions/max_terminated_length": 2700.9,
"completions/mean_length": 1204.96875,
"completions/mean_terminated_length": 1187.974169921875,
"completions/min_length": 393.8,
"completions/min_terminated_length": 393.8,
"entropy": 0.3756037563085556,
"epoch": 1.2408759124087592,
"frac_reward_zero_std": 0.55,
"grad_norm": 0.671875,
"learning_rate": 5.888077858880778e-06,
"loss": 0.0116,
"num_tokens": 21383299.0,
"reward": 0.596875,
"reward_std": 0.20872601345181466,
"rewards/qwen_accuracy_reward/mean": 0.596875,
"rewards/qwen_accuracy_reward/std": 0.46048612892627716,
"step": 170,
"step_time": 99.4391059097834
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 3201.4,
"completions/max_terminated_length": 3001.8,
"completions/mean_length": 1198.003125,
"completions/mean_terminated_length": 1179.4652099609375,
"completions/min_length": 440.1,
"completions/min_terminated_length": 440.1,
"entropy": 0.38104947507381437,
"epoch": 1.313868613138686,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5625,
"learning_rate": 5.6447688564476885e-06,
"loss": 0.0187,
"num_tokens": 22501172.0,
"reward": 0.7625,
"reward_std": 0.10478792265057564,
"rewards/qwen_accuracy_reward/mean": 0.7625,
"rewards/qwen_accuracy_reward/std": 0.3862288236618042,
"step": 180,
"step_time": 101.38317952565849
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 2751.6,
"completions/max_terminated_length": 2576.8,
"completions/mean_length": 1141.85625,
"completions/mean_terminated_length": 1095.5864868164062,
"completions/min_length": 378.7,
"completions/min_terminated_length": 378.7,
"entropy": 0.37404528707265855,
"epoch": 1.3868613138686132,
"frac_reward_zero_std": 0.575,
"grad_norm": 0.0,
"learning_rate": 5.401459854014599e-06,
"loss": 0.0105,
"num_tokens": 23793222.0,
"reward": 0.521875,
"reward_std": 0.18928286358714103,
"rewards/qwen_accuracy_reward/mean": 0.521875,
"rewards/qwen_accuracy_reward/std": 0.43783398270606994,
"step": 190,
"step_time": 91.93207672638819
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0375,
"completions/max_length": 3231.8,
"completions/max_terminated_length": 3000.5,
"completions/mean_length": 1305.665625,
"completions/mean_terminated_length": 1198.9109619140625,
"completions/min_length": 365.7,
"completions/min_terminated_length": 365.7,
"entropy": 0.3786713719367981,
"epoch": 1.4598540145985401,
"frac_reward_zero_std": 0.725,
"grad_norm": 0.80078125,
"learning_rate": 5.158150851581509e-06,
"loss": 0.0082,
"num_tokens": 25037259.0,
"reward": 0.603125,
"reward_std": 0.12046253234148026,
"rewards/qwen_accuracy_reward/mean": 0.603125,
"rewards/qwen_accuracy_reward/std": 0.42881221920251844,
"step": 200,
"step_time": 103.0258747473359
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3153.1,
"completions/max_terminated_length": 3017.2,
"completions/mean_length": 1360.325,
"completions/mean_terminated_length": 1316.6628784179688,
"completions/min_length": 432.6,
"completions/min_terminated_length": 432.6,
"entropy": 0.3885278135538101,
"epoch": 1.5328467153284673,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.50390625,
"learning_rate": 4.914841849148419e-06,
"loss": 0.034,
"num_tokens": 26258443.0,
"reward": 0.66875,
"reward_std": 0.18044402971863746,
"rewards/qwen_accuracy_reward/mean": 0.66875,
"rewards/qwen_accuracy_reward/std": 0.4504780650138855,
"step": 210,
"step_time": 96.96674608923495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 1161.459375,
"completions/mean_terminated_length": 1161.459375,
"completions/min_length": 397.7,
"completions/min_terminated_length": 397.7,
"entropy": 0.38007940649986266,
"epoch": 1.6058394160583942,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7421875,
"learning_rate": 4.671532846715329e-06,
"loss": 0.0182,
"num_tokens": 27530662.0,
"reward": 0.7,
"reward_std": 0.11110442206263542,
"rewards/qwen_accuracy_reward/mean": 0.7,
"rewards/qwen_accuracy_reward/std": 0.36664991080760956,
"step": 220,
"step_time": 93.51679303245619
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.034375,
"completions/max_length": 2982.4,
"completions/max_terminated_length": 2932.6,
"completions/mean_length": 1282.75,
"completions/mean_terminated_length": 1197.8360961914063,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"entropy": 0.36942420303821566,
"epoch": 1.6788321167883211,
"frac_reward_zero_std": 0.55,
"grad_norm": 0.68359375,
"learning_rate": 4.428223844282239e-06,
"loss": 0.0049,
"num_tokens": 28913198.0,
"reward": 0.559375,
"reward_std": 0.1864362359046936,
"rewards/qwen_accuracy_reward/mean": 0.559375,
"rewards/qwen_accuracy_reward/std": 0.4745823562145233,
"step": 230,
"step_time": 106.11833359738812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028125,
"completions/max_length": 3358.5,
"completions/max_terminated_length": 3078.0,
"completions/mean_length": 1389.68125,
"completions/mean_terminated_length": 1318.075048828125,
"completions/min_length": 451.0,
"completions/min_terminated_length": 451.0,
"entropy": 0.4076398193836212,
"epoch": 1.7518248175182483,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.640625,
"learning_rate": 4.184914841849148e-06,
"loss": 0.0356,
"num_tokens": 30106480.0,
"reward": 0.640625,
"reward_std": 0.16717590987682343,
"rewards/qwen_accuracy_reward/mean": 0.640625,
"rewards/qwen_accuracy_reward/std": 0.4686185359954834,
"step": 240,
"step_time": 115.86419467311353
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3010.5,
"completions/max_terminated_length": 2811.1,
"completions/mean_length": 1186.053125,
"completions/mean_terminated_length": 1145.055078125,
"completions/min_length": 383.4,
"completions/min_terminated_length": 383.4,
"entropy": 0.3604145884513855,
"epoch": 1.8248175182481752,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.8984375,
"learning_rate": 3.9416058394160585e-06,
"loss": 0.0403,
"num_tokens": 31363833.0,
"reward": 0.59375,
"reward_std": 0.16990982741117477,
"rewards/qwen_accuracy_reward/mean": 0.59375,
"rewards/qwen_accuracy_reward/std": 0.43118971437215803,
"step": 250,
"step_time": 99.8452183963731
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2370.0,
"completions/max_terminated_length": 2370.0,
"completions/mean_length": 1080.86875,
"completions/mean_terminated_length": 1080.86875,
"completions/min_length": 409.6,
"completions/min_terminated_length": 409.6,
"entropy": 0.37349976003170016,
"epoch": 1.897810218978102,
"frac_reward_zero_std": 0.825,
"grad_norm": 0.53515625,
"learning_rate": 3.698296836982969e-06,
"loss": -0.0252,
"num_tokens": 32702599.0,
"reward": 0.65,
"reward_std": 0.07775410786271095,
"rewards/qwen_accuracy_reward/mean": 0.65,
"rewards/qwen_accuracy_reward/std": 0.3688706248998642,
"step": 260,
"step_time": 86.65903639029712
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 2481.9,
"completions/max_terminated_length": 2415.4,
"completions/mean_length": 1075.425,
"completions/mean_terminated_length": 1002.4191040039062,
"completions/min_length": 349.1,
"completions/min_terminated_length": 349.1,
"entropy": 0.34225144386291506,
"epoch": 1.9708029197080292,
"frac_reward_zero_std": 0.7,
"grad_norm": 0.96484375,
"learning_rate": 3.454987834549879e-06,
"loss": 0.0167,
"num_tokens": 33787199.0,
"reward": 0.578125,
"reward_std": 0.12972569838166237,
"rewards/qwen_accuracy_reward/mean": 0.578125,
"rewards/qwen_accuracy_reward/std": 0.4668997347354889,
"step": 270,
"step_time": 80.8301064182073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2574.6,
"completions/max_terminated_length": 2453.7,
"completions/mean_length": 1054.425,
"completions/mean_terminated_length": 1026.784130859375,
"completions/min_length": 392.3,
"completions/min_terminated_length": 392.3,
"entropy": 0.3572549015283585,
"epoch": 2.0437956204379564,
"frac_reward_zero_std": 0.85,
"grad_norm": 0.8046875,
"learning_rate": 3.2116788321167884e-06,
"loss": 0.0306,
"num_tokens": 34995479.0,
"reward": 0.821875,
"reward_std": 0.06165712922811508,
"rewards/qwen_accuracy_reward/mean": 0.821875,
"rewards/qwen_accuracy_reward/std": 0.2794704169034958,
"step": 280,
"step_time": 85.92972797648981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2578.2,
"completions/max_terminated_length": 2578.2,
"completions/mean_length": 1058.575,
"completions/mean_terminated_length": 1058.575,
"completions/min_length": 371.0,
"completions/min_terminated_length": 371.0,
"entropy": 0.3524599611759186,
"epoch": 2.116788321167883,
"frac_reward_zero_std": 0.775,
"grad_norm": 0.0,
"learning_rate": 2.9683698296836987e-06,
"loss": 0.0015,
"num_tokens": 36338735.0,
"reward": 0.621875,
"reward_std": 0.10636548325419426,
"rewards/qwen_accuracy_reward/mean": 0.621875,
"rewards/qwen_accuracy_reward/std": 0.4201431304216385,
"step": 290,
"step_time": 93.4068580438383
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 3337.3,
"completions/max_terminated_length": 3052.6,
"completions/mean_length": 1304.146875,
"completions/mean_terminated_length": 1277.986474609375,
"completions/min_length": 366.6,
"completions/min_terminated_length": 366.6,
"entropy": 0.37322444319725034,
"epoch": 2.18978102189781,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.90625,
"learning_rate": 2.7250608272506085e-06,
"loss": 0.0091,
"num_tokens": 37781158.0,
"reward": 0.684375,
"reward_std": 0.13267236873507499,
"rewards/qwen_accuracy_reward/mean": 0.684375,
"rewards/qwen_accuracy_reward/std": 0.36027481555938723,
"step": 300,
"step_time": 124.43963868878782
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 3103.4,
"completions/max_terminated_length": 3048.6,
"completions/mean_length": 1280.71875,
"completions/mean_terminated_length": 1224.5757568359375,
"completions/min_length": 414.7,
"completions/min_terminated_length": 414.7,
"entropy": 0.36646572649478915,
"epoch": 2.2627737226277373,
"frac_reward_zero_std": 0.575,
"grad_norm": 0.75390625,
"learning_rate": 2.4817518248175183e-06,
"loss": -0.0218,
"num_tokens": 39053748.0,
"reward": 0.60625,
"reward_std": 0.19305532947182655,
"rewards/qwen_accuracy_reward/mean": 0.60625,
"rewards/qwen_accuracy_reward/std": 0.45746631026268003,
"step": 310,
"step_time": 96.06518278419972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 3248.1,
"completions/max_terminated_length": 3058.9,
"completions/mean_length": 1360.24375,
"completions/mean_terminated_length": 1295.442919921875,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"entropy": 0.3854114145040512,
"epoch": 2.335766423357664,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.078125,
"learning_rate": 2.2384428223844286e-06,
"loss": 0.0638,
"num_tokens": 40304938.0,
"reward": 0.70625,
"reward_std": 0.16034209728240967,
"rewards/qwen_accuracy_reward/mean": 0.70625,
"rewards/qwen_accuracy_reward/std": 0.3804707407951355,
"step": 320,
"step_time": 109.24540220741183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 2834.8,
"completions/max_terminated_length": 2617.5,
"completions/mean_length": 1194.184375,
"completions/mean_terminated_length": 1109.6020874023438,
"completions/min_length": 388.7,
"completions/min_terminated_length": 388.7,
"entropy": 0.36874857246875764,
"epoch": 2.408759124087591,
"frac_reward_zero_std": 0.7,
"grad_norm": 0.0,
"learning_rate": 1.9951338199513384e-06,
"loss": -0.005,
"num_tokens": 41547669.0,
"reward": 0.7,
"reward_std": 0.13057240098714828,
"rewards/qwen_accuracy_reward/mean": 0.7,
"rewards/qwen_accuracy_reward/std": 0.3882273375988007,
"step": 330,
"step_time": 94.99811747204512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2663.3,
"completions/max_terminated_length": 2663.3,
"completions/mean_length": 1023.784375,
"completions/mean_terminated_length": 1023.784375,
"completions/min_length": 363.6,
"completions/min_terminated_length": 363.6,
"entropy": 0.3397214740514755,
"epoch": 2.4817518248175183,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.88671875,
"learning_rate": 1.7518248175182485e-06,
"loss": 0.0084,
"num_tokens": 42817864.0,
"reward": 0.740625,
"reward_std": 0.14487907364964486,
"rewards/qwen_accuracy_reward/mean": 0.740625,
"rewards/qwen_accuracy_reward/std": 0.35987740010023117,
"step": 340,
"step_time": 88.57682326808572
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 1162.9625,
"completions/mean_terminated_length": 1162.9625,
"completions/min_length": 442.9,
"completions/min_terminated_length": 442.9,
"entropy": 0.37642553746700286,
"epoch": 2.554744525547445,
"frac_reward_zero_std": 0.825,
"grad_norm": 0.0,
"learning_rate": 1.5085158150851583e-06,
"loss": -0.011,
"num_tokens": 44131612.0,
"reward": 0.80625,
"reward_std": 0.07280554845929146,
"rewards/qwen_accuracy_reward/mean": 0.80625,
"rewards/qwen_accuracy_reward/std": 0.2779258817434311,
"step": 350,
"step_time": 94.38652676101773
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 3228.0,
"completions/max_terminated_length": 2900.1,
"completions/mean_length": 1259.84375,
"completions/mean_terminated_length": 1240.6628967285155,
"completions/min_length": 373.4,
"completions/min_terminated_length": 373.4,
"entropy": 0.36303475201129914,
"epoch": 2.627737226277372,
"frac_reward_zero_std": 0.675,
"grad_norm": 0.69140625,
"learning_rate": 1.2652068126520683e-06,
"loss": 0.0103,
"num_tokens": 45384666.0,
"reward": 0.58125,
"reward_std": 0.14351309314370156,
"rewards/qwen_accuracy_reward/mean": 0.58125,
"rewards/qwen_accuracy_reward/std": 0.4772630840539932,
"step": 360,
"step_time": 100.1825181835331
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05,
"completions/max_length": 3610.3,
"completions/max_terminated_length": 3382.4,
"completions/mean_length": 1473.825,
"completions/mean_terminated_length": 1346.1535278320312,
"completions/min_length": 388.9,
"completions/min_terminated_length": 388.9,
"entropy": 0.3781063288450241,
"epoch": 2.7007299270072993,
"frac_reward_zero_std": 0.575,
"grad_norm": 1.4375,
"learning_rate": 1.0218978102189781e-06,
"loss": 0.0401,
"num_tokens": 46681234.0,
"reward": 0.55625,
"reward_std": 0.19663594886660576,
"rewards/qwen_accuracy_reward/mean": 0.55625,
"rewards/qwen_accuracy_reward/std": 0.4555644616484642,
"step": 370,
"step_time": 121.74294393500313
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 2807.6,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 1129.05,
"completions/mean_terminated_length": 1113.3502197265625,
"completions/min_length": 362.2,
"completions/min_terminated_length": 362.2,
"entropy": 0.3525690257549286,
"epoch": 2.7737226277372264,
"frac_reward_zero_std": 0.725,
"grad_norm": 0.66015625,
"learning_rate": 7.785888077858882e-07,
"loss": 0.024,
"num_tokens": 47890746.0,
"reward": 0.609375,
"reward_std": 0.12319448739290237,
"rewards/qwen_accuracy_reward/mean": 0.609375,
"rewards/qwen_accuracy_reward/std": 0.42726452350616456,
"step": 380,
"step_time": 91.42815532507375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3030.0,
"completions/max_terminated_length": 2951.3,
"completions/mean_length": 1326.25,
"completions/mean_terminated_length": 1238.2517333984374,
"completions/min_length": 360.5,
"completions/min_terminated_length": 360.5,
"entropy": 0.3662068575620651,
"epoch": 2.846715328467153,
"frac_reward_zero_std": 0.575,
"grad_norm": 0.640625,
"learning_rate": 5.352798053527981e-07,
"loss": 0.0044,
"num_tokens": 49127866.0,
"reward": 0.60625,
"reward_std": 0.18800986632704736,
"rewards/qwen_accuracy_reward/mean": 0.60625,
"rewards/qwen_accuracy_reward/std": 0.4078336015343666,
"step": 390,
"step_time": 111.37025026166812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2633.6,
"completions/max_terminated_length": 2633.6,
"completions/mean_length": 1038.35625,
"completions/mean_terminated_length": 1038.35625,
"completions/min_length": 389.5,
"completions/min_terminated_length": 389.5,
"entropy": 0.3475939750671387,
"epoch": 2.9197080291970803,
"frac_reward_zero_std": 0.775,
"grad_norm": 0.859375,
"learning_rate": 2.9197080291970804e-07,
"loss": 0.0046,
"num_tokens": 50219684.0,
"reward": 0.703125,
"reward_std": 0.09816569313406945,
"rewards/qwen_accuracy_reward/mean": 0.703125,
"rewards/qwen_accuracy_reward/std": 0.3836729422211647,
"step": 400,
"step_time": 87.25570530630648
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.040625,
"completions/max_length": 3304.6,
"completions/max_terminated_length": 3197.0,
"completions/mean_length": 1342.228125,
"completions/mean_terminated_length": 1226.4643310546876,
"completions/min_length": 403.7,
"completions/min_terminated_length": 403.7,
"entropy": 0.36126827299594877,
"epoch": 2.9927007299270074,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.0,
"learning_rate": 4.866180048661801e-08,
"loss": 0.0057,
"num_tokens": 51585277.0,
"reward": 0.634375,
"reward_std": 0.18369721844792367,
"rewards/qwen_accuracy_reward/mean": 0.634375,
"rewards/qwen_accuracy_reward/std": 0.4095410585403442,
"step": 410,
"step_time": 121.25470138275996
}
],
"logging_steps": 10,
"max_steps": 411,
"num_input_tokens_seen": 51708911,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}