Upload folder using huggingface_hub

c333ac3 verified 4 months ago

41.3 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 3.0,
	"eval_steps": 500,
	"global_step": 411,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 3070.0,
	"completions/max_terminated_length": 2896.7,
	"completions/mean_length": 1017.178125,
	"completions/mean_terminated_length": 988.4677978515625,
	"completions/min_length": 239.7,
	"completions/min_terminated_length": 239.7,
	"entropy": 0.3605123937129974,
	"epoch": 0.072992700729927,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.6796875,
	"learning_rate": 9.78102189781022e-06,
	"loss": -0.0091,
	"num_tokens": 1212849.0,
	"reward": 0.434375,
	"reward_std": 0.15717875137925147,
	"rewards/qwen_accuracy_reward/mean": 0.434375,
	"rewards/qwen_accuracy_reward/std": 0.46030205190181733,
	"step": 10,
	"step_time": 106.30040930798278
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.071875,
	"completions/max_length": 3382.3,
	"completions/max_terminated_length": 3000.2,
	"completions/mean_length": 1388.7375,
	"completions/mean_terminated_length": 1186.2124328613281,
	"completions/min_length": 297.7,
	"completions/min_terminated_length": 297.7,
	"entropy": 0.4167850613594055,
	"epoch": 0.145985401459854,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 1.046875,
	"learning_rate": 9.537712895377129e-06,
	"loss": 0.0608,
	"num_tokens": 2697493.0,
	"reward": 0.478125,
	"reward_std": 0.27870663031935694,
	"rewards/qwen_accuracy_reward/mean": 0.478125,
	"rewards/qwen_accuracy_reward/std": 0.4272594749927521,
	"step": 20,
	"step_time": 116.88615104537458
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.053125,
	"completions/max_length": 3308.6,
	"completions/max_terminated_length": 3243.7,
	"completions/mean_length": 1304.284375,
	"completions/mean_terminated_length": 1165.7695190429688,
	"completions/min_length": 270.6,
	"completions/min_terminated_length": 270.6,
	"entropy": 0.4009901225566864,
	"epoch": 0.21897810218978103,
	"frac_reward_zero_std": 0.55,
	"grad_norm": 1.875,
	"learning_rate": 9.294403892944039e-06,
	"loss": 0.0254,
	"num_tokens": 4025352.0,
	"reward": 0.471875,
	"reward_std": 0.19138479307293893,
	"rewards/qwen_accuracy_reward/mean": 0.471875,
	"rewards/qwen_accuracy_reward/std": 0.48829147815704343,
	"step": 30,
	"step_time": 106.55014775730669
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.040625,
	"completions/max_length": 3444.4,
	"completions/max_terminated_length": 2867.6,
	"completions/mean_length": 1196.390625,
	"completions/mean_terminated_length": 1088.1075134277344,
	"completions/min_length": 288.2,
	"completions/min_terminated_length": 288.2,
	"entropy": 0.4015911310911179,
	"epoch": 0.291970802919708,
	"frac_reward_zero_std": 0.6,
	"grad_norm": 0.53515625,
	"learning_rate": 9.05109489051095e-06,
	"loss": 0.0434,
	"num_tokens": 5191093.0,
	"reward": 0.521875,
	"reward_std": 0.18105824217200278,
	"rewards/qwen_accuracy_reward/mean": 0.521875,
	"rewards/qwen_accuracy_reward/std": 0.48740494847297666,
	"step": 40,
	"step_time": 116.13580646244809
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 3239.4,
	"completions/max_terminated_length": 3025.9,
	"completions/mean_length": 1166.659375,
	"completions/mean_terminated_length": 1098.1408386230469,
	"completions/min_length": 246.9,
	"completions/min_terminated_length": 246.9,
	"entropy": 0.4154060840606689,
	"epoch": 0.36496350364963503,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.71484375,
	"learning_rate": 8.80778588807786e-06,
	"loss": 0.0252,
	"num_tokens": 6289576.0,
	"reward": 0.56875,
	"reward_std": 0.11836256608366966,
	"rewards/qwen_accuracy_reward/mean": 0.56875,
	"rewards/qwen_accuracy_reward/std": 0.47113593220710753,
	"step": 50,
	"step_time": 102.3454062897712
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2896.0,
	"completions/max_terminated_length": 2697.2,
	"completions/mean_length": 1067.15,
	"completions/mean_terminated_length": 1040.172314453125,
	"completions/min_length": 278.2,
	"completions/min_terminated_length": 278.2,
	"entropy": 0.4182083398103714,
	"epoch": 0.43795620437956206,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.0,
	"learning_rate": 8.56447688564477e-06,
	"loss": -0.0045,
	"num_tokens": 7485720.0,
	"reward": 0.46875,
	"reward_std": 0.14403236508369446,
	"rewards/qwen_accuracy_reward/mean": 0.46875,
	"rewards/qwen_accuracy_reward/std": 0.4563746154308319,
	"step": 60,
	"step_time": 103.93981777941808
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2650.0,
	"completions/max_terminated_length": 2549.8,
	"completions/mean_length": 948.559375,
	"completions/mean_terminated_length": 924.6193481445313,
	"completions/min_length": 209.7,
	"completions/min_terminated_length": 209.7,
	"entropy": 0.34950864464044573,
	"epoch": 0.5109489051094891,
	"frac_reward_zero_std": 0.65,
	"grad_norm": 0.0,
	"learning_rate": 8.32116788321168e-06,
	"loss": 0.016,
	"num_tokens": 8730603.0,
	"reward": 0.528125,
	"reward_std": 0.1587614081799984,
	"rewards/qwen_accuracy_reward/mean": 0.528125,
	"rewards/qwen_accuracy_reward/std": 0.4472527623176575,
	"step": 70,
	"step_time": 90.1208279568702
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 2975.0,
	"completions/max_terminated_length": 2827.8,
	"completions/mean_length": 988.715625,
	"completions/mean_terminated_length": 980.0193481445312,
	"completions/min_length": 225.9,
	"completions/min_terminated_length": 225.9,
	"entropy": 0.37495362758636475,
	"epoch": 0.583941605839416,
	"frac_reward_zero_std": 0.6,
	"grad_norm": 1.234375,
	"learning_rate": 8.07785888077859e-06,
	"loss": -0.021,
	"num_tokens": 9949824.0,
	"reward": 0.521875,
	"reward_std": 0.18053897097706795,
	"rewards/qwen_accuracy_reward/mean": 0.521875,
	"rewards/qwen_accuracy_reward/std": 0.4544884204864502,
	"step": 80,
	"step_time": 97.22130602933467
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2843.5,
	"completions/max_terminated_length": 2426.7,
	"completions/mean_length": 921.684375,
	"completions/mean_terminated_length": 873.637158203125,
	"completions/min_length": 278.9,
	"completions/min_terminated_length": 278.9,
	"entropy": 0.37491864860057833,
	"epoch": 0.656934306569343,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 1.546875,
	"learning_rate": 7.8345498783455e-06,
	"loss": 0.0557,
	"num_tokens": 11177307.0,
	"reward": 0.5625,
	"reward_std": 0.1712738409638405,
	"rewards/qwen_accuracy_reward/mean": 0.5625,
	"rewards/qwen_accuracy_reward/std": 0.46112917065620423,
	"step": 90,
	"step_time": 95.75766938729211
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2333.1,
	"completions/max_terminated_length": 2311.9,
	"completions/mean_length": 859.08125,
	"completions/mean_terminated_length": 811.47939453125,
	"completions/min_length": 218.5,
	"completions/min_terminated_length": 218.5,
	"entropy": 0.36555847227573396,
	"epoch": 0.7299270072992701,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.96875,
	"learning_rate": 7.591240875912409e-06,
	"loss": -0.002,
	"num_tokens": 12404869.0,
	"reward": 0.440625,
	"reward_std": 0.14087215512990953,
	"rewards/qwen_accuracy_reward/mean": 0.440625,
	"rewards/qwen_accuracy_reward/std": 0.4636432766914368,
	"step": 100,
	"step_time": 82.00954903159291
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 3052.7,
	"completions/max_terminated_length": 2857.8,
	"completions/mean_length": 1005.96875,
	"completions/mean_terminated_length": 977.4821411132813,
	"completions/min_length": 276.6,
	"completions/min_terminated_length": 276.6,
	"entropy": 0.36328954696655275,
	"epoch": 0.8029197080291971,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 1.0546875,
	"learning_rate": 7.347931873479319e-06,
	"loss": 0.0546,
	"num_tokens": 13642787.0,
	"reward": 0.66875,
	"reward_std": 0.1395061768591404,
	"rewards/qwen_accuracy_reward/mean": 0.66875,
	"rewards/qwen_accuracy_reward/std": 0.40047125071287154,
	"step": 110,
	"step_time": 102.26997727015987
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0125,
	"completions/max_length": 3162.9,
	"completions/max_terminated_length": 2987.6,
	"completions/mean_length": 1102.9875,
	"completions/mean_terminated_length": 1069.5874267578124,
	"completions/min_length": 304.1,
	"completions/min_terminated_length": 304.1,
	"entropy": 0.35163818299770355,
	"epoch": 0.8759124087591241,
	"frac_reward_zero_std": 0.575,
	"grad_norm": 0.84375,
	"learning_rate": 7.1046228710462296e-06,
	"loss": 0.0128,
	"num_tokens": 14774175.0,
	"reward": 0.534375,
	"reward_std": 0.18348564356565475,
	"rewards/qwen_accuracy_reward/mean": 0.534375,
	"rewards/qwen_accuracy_reward/std": 0.4590408980846405,
	"step": 120,
	"step_time": 97.75387887172401
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2885.8,
	"completions/max_terminated_length": 2841.5,
	"completions/mean_length": 1009.309375,
	"completions/mean_terminated_length": 981.8988159179687,
	"completions/min_length": 288.1,
	"completions/min_terminated_length": 288.1,
	"entropy": 0.34102891981601713,
	"epoch": 0.948905109489051,
	"frac_reward_zero_std": 0.7,
	"grad_norm": 0.82421875,
	"learning_rate": 6.86131386861314e-06,
	"loss": -0.0044,
	"num_tokens": 15966530.0,
	"reward": 0.70625,
	"reward_std": 0.14433300495147705,
	"rewards/qwen_accuracy_reward/mean": 0.70625,
	"rewards/qwen_accuracy_reward/std": 0.411708801984787,
	"step": 130,
	"step_time": 103.35281996680423
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.04375,
	"completions/max_length": 2664.1,
	"completions/max_terminated_length": 2568.1,
	"completions/mean_length": 1134.346875,
	"completions/mean_terminated_length": 1014.4972045898437,
	"completions/min_length": 318.0,
	"completions/min_terminated_length": 318.0,
	"entropy": 0.33926538228988645,
	"epoch": 1.0218978102189782,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.0,
	"learning_rate": 6.618004866180049e-06,
	"loss": 0.0148,
	"num_tokens": 17214129.0,
	"reward": 0.540625,
	"reward_std": 0.1648663252592087,
	"rewards/qwen_accuracy_reward/mean": 0.540625,
	"rewards/qwen_accuracy_reward/std": 0.45352436900138854,
	"step": 140,
	"step_time": 91.66583738289773
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 2898.0,
	"completions/max_terminated_length": 2755.6,
	"completions/mean_length": 1108.35625,
	"completions/mean_terminated_length": 1091.26259765625,
	"completions/min_length": 349.7,
	"completions/min_terminated_length": 349.7,
	"entropy": 0.33356466293334963,
	"epoch": 1.094890510948905,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.734375,
	"learning_rate": 6.3746958637469595e-06,
	"loss": 0.0161,
	"num_tokens": 18739043.0,
	"reward": 0.65,
	"reward_std": 0.14624504819512368,
	"rewards/qwen_accuracy_reward/mean": 0.65,
	"rewards/qwen_accuracy_reward/std": 0.4041076198220253,
	"step": 150,
	"step_time": 111.4213294208981
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.05,
	"completions/max_length": 2954.3,
	"completions/max_terminated_length": 2814.9,
	"completions/mean_length": 1294.83125,
	"completions/mean_terminated_length": 1166.3035034179688,
	"completions/min_length": 401.2,
	"completions/min_terminated_length": 401.2,
	"entropy": 0.36098510324954985,
	"epoch": 1.167883211678832,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.0,
	"learning_rate": 6.13138686131387e-06,
	"loss": 0.0291,
	"num_tokens": 20131237.0,
	"reward": 0.71875,
	"reward_std": 0.14718669205904006,
	"rewards/qwen_accuracy_reward/mean": 0.71875,
	"rewards/qwen_accuracy_reward/std": 0.3977723315358162,
	"step": 160,
	"step_time": 100.20992619153112
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 2856.4,
	"completions/max_terminated_length": 2700.9,
	"completions/mean_length": 1204.96875,
	"completions/mean_terminated_length": 1187.974169921875,
	"completions/min_length": 393.8,
	"completions/min_terminated_length": 393.8,
	"entropy": 0.3756037563085556,
	"epoch": 1.2408759124087592,
	"frac_reward_zero_std": 0.55,
	"grad_norm": 0.671875,
	"learning_rate": 5.888077858880778e-06,
	"loss": 0.0116,
	"num_tokens": 21383299.0,
	"reward": 0.596875,
	"reward_std": 0.20872601345181466,
	"rewards/qwen_accuracy_reward/mean": 0.596875,
	"rewards/qwen_accuracy_reward/std": 0.46048612892627716,
	"step": 170,
	"step_time": 99.4391059097834
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 3201.4,
	"completions/max_terminated_length": 3001.8,
	"completions/mean_length": 1198.003125,
	"completions/mean_terminated_length": 1179.4652099609375,
	"completions/min_length": 440.1,
	"completions/min_terminated_length": 440.1,
	"entropy": 0.38104947507381437,
	"epoch": 1.313868613138686,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.5625,
	"learning_rate": 5.6447688564476885e-06,
	"loss": 0.0187,
	"num_tokens": 22501172.0,
	"reward": 0.7625,
	"reward_std": 0.10478792265057564,
	"rewards/qwen_accuracy_reward/mean": 0.7625,
	"rewards/qwen_accuracy_reward/std": 0.3862288236618042,
	"step": 180,
	"step_time": 101.38317952565849
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 2751.6,
	"completions/max_terminated_length": 2576.8,
	"completions/mean_length": 1141.85625,
	"completions/mean_terminated_length": 1095.5864868164062,
	"completions/min_length": 378.7,
	"completions/min_terminated_length": 378.7,
	"entropy": 0.37404528707265855,
	"epoch": 1.3868613138686132,
	"frac_reward_zero_std": 0.575,
	"grad_norm": 0.0,
	"learning_rate": 5.401459854014599e-06,
	"loss": 0.0105,
	"num_tokens": 23793222.0,
	"reward": 0.521875,
	"reward_std": 0.18928286358714103,
	"rewards/qwen_accuracy_reward/mean": 0.521875,
	"rewards/qwen_accuracy_reward/std": 0.43783398270606994,
	"step": 190,
	"step_time": 91.93207672638819
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0375,
	"completions/max_length": 3231.8,
	"completions/max_terminated_length": 3000.5,
	"completions/mean_length": 1305.665625,
	"completions/mean_terminated_length": 1198.9109619140625,
	"completions/min_length": 365.7,
	"completions/min_terminated_length": 365.7,
	"entropy": 0.3786713719367981,
	"epoch": 1.4598540145985401,
	"frac_reward_zero_std": 0.725,
	"grad_norm": 0.80078125,
	"learning_rate": 5.158150851581509e-06,
	"loss": 0.0082,
	"num_tokens": 25037259.0,
	"reward": 0.603125,
	"reward_std": 0.12046253234148026,
	"rewards/qwen_accuracy_reward/mean": 0.603125,
	"rewards/qwen_accuracy_reward/std": 0.42881221920251844,
	"step": 200,
	"step_time": 103.0258747473359
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 3153.1,
	"completions/max_terminated_length": 3017.2,
	"completions/mean_length": 1360.325,
	"completions/mean_terminated_length": 1316.6628784179688,
	"completions/min_length": 432.6,
	"completions/min_terminated_length": 432.6,
	"entropy": 0.3885278135538101,
	"epoch": 1.5328467153284673,
	"frac_reward_zero_std": 0.6,
	"grad_norm": 0.50390625,
	"learning_rate": 4.914841849148419e-06,
	"loss": 0.034,
	"num_tokens": 26258443.0,
	"reward": 0.66875,
	"reward_std": 0.18044402971863746,
	"rewards/qwen_accuracy_reward/mean": 0.66875,
	"rewards/qwen_accuracy_reward/std": 0.4504780650138855,
	"step": 210,
	"step_time": 96.96674608923495
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2816.0,
	"completions/max_terminated_length": 2816.0,
	"completions/mean_length": 1161.459375,
	"completions/mean_terminated_length": 1161.459375,
	"completions/min_length": 397.7,
	"completions/min_terminated_length": 397.7,
	"entropy": 0.38007940649986266,
	"epoch": 1.6058394160583942,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.7421875,
	"learning_rate": 4.671532846715329e-06,
	"loss": 0.0182,
	"num_tokens": 27530662.0,
	"reward": 0.7,
	"reward_std": 0.11110442206263542,
	"rewards/qwen_accuracy_reward/mean": 0.7,
	"rewards/qwen_accuracy_reward/std": 0.36664991080760956,
	"step": 220,
	"step_time": 93.51679303245619
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.034375,
	"completions/max_length": 2982.4,
	"completions/max_terminated_length": 2932.6,
	"completions/mean_length": 1282.75,
	"completions/mean_terminated_length": 1197.8360961914063,
	"completions/min_length": 432.0,
	"completions/min_terminated_length": 432.0,
	"entropy": 0.36942420303821566,
	"epoch": 1.6788321167883211,
	"frac_reward_zero_std": 0.55,
	"grad_norm": 0.68359375,
	"learning_rate": 4.428223844282239e-06,
	"loss": 0.0049,
	"num_tokens": 28913198.0,
	"reward": 0.559375,
	"reward_std": 0.1864362359046936,
	"rewards/qwen_accuracy_reward/mean": 0.559375,
	"rewards/qwen_accuracy_reward/std": 0.4745823562145233,
	"step": 230,
	"step_time": 106.11833359738812
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.028125,
	"completions/max_length": 3358.5,
	"completions/max_terminated_length": 3078.0,
	"completions/mean_length": 1389.68125,
	"completions/mean_terminated_length": 1318.075048828125,
	"completions/min_length": 451.0,
	"completions/min_terminated_length": 451.0,
	"entropy": 0.4076398193836212,
	"epoch": 1.7518248175182483,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.640625,
	"learning_rate": 4.184914841849148e-06,
	"loss": 0.0356,
	"num_tokens": 30106480.0,
	"reward": 0.640625,
	"reward_std": 0.16717590987682343,
	"rewards/qwen_accuracy_reward/mean": 0.640625,
	"rewards/qwen_accuracy_reward/std": 0.4686185359954834,
	"step": 240,
	"step_time": 115.86419467311353
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 3010.5,
	"completions/max_terminated_length": 2811.1,
	"completions/mean_length": 1186.053125,
	"completions/mean_terminated_length": 1145.055078125,
	"completions/min_length": 383.4,
	"completions/min_terminated_length": 383.4,
	"entropy": 0.3604145884513855,
	"epoch": 1.8248175182481752,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.8984375,
	"learning_rate": 3.9416058394160585e-06,
	"loss": 0.0403,
	"num_tokens": 31363833.0,
	"reward": 0.59375,
	"reward_std": 0.16990982741117477,
	"rewards/qwen_accuracy_reward/mean": 0.59375,
	"rewards/qwen_accuracy_reward/std": 0.43118971437215803,
	"step": 250,
	"step_time": 99.8452183963731
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2370.0,
	"completions/max_terminated_length": 2370.0,
	"completions/mean_length": 1080.86875,
	"completions/mean_terminated_length": 1080.86875,
	"completions/min_length": 409.6,
	"completions/min_terminated_length": 409.6,
	"entropy": 0.37349976003170016,
	"epoch": 1.897810218978102,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 0.53515625,
	"learning_rate": 3.698296836982969e-06,
	"loss": -0.0252,
	"num_tokens": 32702599.0,
	"reward": 0.65,
	"reward_std": 0.07775410786271095,
	"rewards/qwen_accuracy_reward/mean": 0.65,
	"rewards/qwen_accuracy_reward/std": 0.3688706248998642,
	"step": 260,
	"step_time": 86.65903639029712
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 2481.9,
	"completions/max_terminated_length": 2415.4,
	"completions/mean_length": 1075.425,
	"completions/mean_terminated_length": 1002.4191040039062,
	"completions/min_length": 349.1,
	"completions/min_terminated_length": 349.1,
	"entropy": 0.34225144386291506,
	"epoch": 1.9708029197080292,
	"frac_reward_zero_std": 0.7,
	"grad_norm": 0.96484375,
	"learning_rate": 3.454987834549879e-06,
	"loss": 0.0167,
	"num_tokens": 33787199.0,
	"reward": 0.578125,
	"reward_std": 0.12972569838166237,
	"rewards/qwen_accuracy_reward/mean": 0.578125,
	"rewards/qwen_accuracy_reward/std": 0.4668997347354889,
	"step": 270,
	"step_time": 80.8301064182073
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2574.6,
	"completions/max_terminated_length": 2453.7,
	"completions/mean_length": 1054.425,
	"completions/mean_terminated_length": 1026.784130859375,
	"completions/min_length": 392.3,
	"completions/min_terminated_length": 392.3,
	"entropy": 0.3572549015283585,
	"epoch": 2.0437956204379564,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 0.8046875,
	"learning_rate": 3.2116788321167884e-06,
	"loss": 0.0306,
	"num_tokens": 34995479.0,
	"reward": 0.821875,
	"reward_std": 0.06165712922811508,
	"rewards/qwen_accuracy_reward/mean": 0.821875,
	"rewards/qwen_accuracy_reward/std": 0.2794704169034958,
	"step": 280,
	"step_time": 85.92972797648981
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2578.2,
	"completions/max_terminated_length": 2578.2,
	"completions/mean_length": 1058.575,
	"completions/mean_terminated_length": 1058.575,
	"completions/min_length": 371.0,
	"completions/min_terminated_length": 371.0,
	"entropy": 0.3524599611759186,
	"epoch": 2.116788321167883,
	"frac_reward_zero_std": 0.775,
	"grad_norm": 0.0,
	"learning_rate": 2.9683698296836987e-06,
	"loss": 0.0015,
	"num_tokens": 36338735.0,
	"reward": 0.621875,
	"reward_std": 0.10636548325419426,
	"rewards/qwen_accuracy_reward/mean": 0.621875,
	"rewards/qwen_accuracy_reward/std": 0.4201431304216385,
	"step": 290,
	"step_time": 93.4068580438383
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 3337.3,
	"completions/max_terminated_length": 3052.6,
	"completions/mean_length": 1304.146875,
	"completions/mean_terminated_length": 1277.986474609375,
	"completions/min_length": 366.6,
	"completions/min_terminated_length": 366.6,
	"entropy": 0.37322444319725034,
	"epoch": 2.18978102189781,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.90625,
	"learning_rate": 2.7250608272506085e-06,
	"loss": 0.0091,
	"num_tokens": 37781158.0,
	"reward": 0.684375,
	"reward_std": 0.13267236873507499,
	"rewards/qwen_accuracy_reward/mean": 0.684375,
	"rewards/qwen_accuracy_reward/std": 0.36027481555938723,
	"step": 300,
	"step_time": 124.43963868878782
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 3103.4,
	"completions/max_terminated_length": 3048.6,
	"completions/mean_length": 1280.71875,
	"completions/mean_terminated_length": 1224.5757568359375,
	"completions/min_length": 414.7,
	"completions/min_terminated_length": 414.7,
	"entropy": 0.36646572649478915,
	"epoch": 2.2627737226277373,
	"frac_reward_zero_std": 0.575,
	"grad_norm": 0.75390625,
	"learning_rate": 2.4817518248175183e-06,
	"loss": -0.0218,
	"num_tokens": 39053748.0,
	"reward": 0.60625,
	"reward_std": 0.19305532947182655,
	"rewards/qwen_accuracy_reward/mean": 0.60625,
	"rewards/qwen_accuracy_reward/std": 0.45746631026268003,
	"step": 310,
	"step_time": 96.06518278419972
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 3248.1,
	"completions/max_terminated_length": 3058.9,
	"completions/mean_length": 1360.24375,
	"completions/mean_terminated_length": 1295.442919921875,
	"completions/min_length": 362.0,
	"completions/min_terminated_length": 362.0,
	"entropy": 0.3854114145040512,
	"epoch": 2.335766423357664,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 1.078125,
	"learning_rate": 2.2384428223844286e-06,
	"loss": 0.0638,
	"num_tokens": 40304938.0,
	"reward": 0.70625,
	"reward_std": 0.16034209728240967,
	"rewards/qwen_accuracy_reward/mean": 0.70625,
	"rewards/qwen_accuracy_reward/std": 0.3804707407951355,
	"step": 320,
	"step_time": 109.24540220741183
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 2834.8,
	"completions/max_terminated_length": 2617.5,
	"completions/mean_length": 1194.184375,
	"completions/mean_terminated_length": 1109.6020874023438,
	"completions/min_length": 388.7,
	"completions/min_terminated_length": 388.7,
	"entropy": 0.36874857246875764,
	"epoch": 2.408759124087591,
	"frac_reward_zero_std": 0.7,
	"grad_norm": 0.0,
	"learning_rate": 1.9951338199513384e-06,
	"loss": -0.005,
	"num_tokens": 41547669.0,
	"reward": 0.7,
	"reward_std": 0.13057240098714828,
	"rewards/qwen_accuracy_reward/mean": 0.7,
	"rewards/qwen_accuracy_reward/std": 0.3882273375988007,
	"step": 330,
	"step_time": 94.99811747204512
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2663.3,
	"completions/max_terminated_length": 2663.3,
	"completions/mean_length": 1023.784375,
	"completions/mean_terminated_length": 1023.784375,
	"completions/min_length": 363.6,
	"completions/min_terminated_length": 363.6,
	"entropy": 0.3397214740514755,
	"epoch": 2.4817518248175183,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.88671875,
	"learning_rate": 1.7518248175182485e-06,
	"loss": 0.0084,
	"num_tokens": 42817864.0,
	"reward": 0.740625,
	"reward_std": 0.14487907364964486,
	"rewards/qwen_accuracy_reward/mean": 0.740625,
	"rewards/qwen_accuracy_reward/std": 0.35987740010023117,
	"step": 340,
	"step_time": 88.57682326808572
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2727.0,
	"completions/max_terminated_length": 2727.0,
	"completions/mean_length": 1162.9625,
	"completions/mean_terminated_length": 1162.9625,
	"completions/min_length": 442.9,
	"completions/min_terminated_length": 442.9,
	"entropy": 0.37642553746700286,
	"epoch": 2.554744525547445,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 0.0,
	"learning_rate": 1.5085158150851583e-06,
	"loss": -0.011,
	"num_tokens": 44131612.0,
	"reward": 0.80625,
	"reward_std": 0.07280554845929146,
	"rewards/qwen_accuracy_reward/mean": 0.80625,
	"rewards/qwen_accuracy_reward/std": 0.2779258817434311,
	"step": 350,
	"step_time": 94.38652676101773
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 3228.0,
	"completions/max_terminated_length": 2900.1,
	"completions/mean_length": 1259.84375,
	"completions/mean_terminated_length": 1240.6628967285155,
	"completions/min_length": 373.4,
	"completions/min_terminated_length": 373.4,
	"entropy": 0.36303475201129914,
	"epoch": 2.627737226277372,
	"frac_reward_zero_std": 0.675,
	"grad_norm": 0.69140625,
	"learning_rate": 1.2652068126520683e-06,
	"loss": 0.0103,
	"num_tokens": 45384666.0,
	"reward": 0.58125,
	"reward_std": 0.14351309314370156,
	"rewards/qwen_accuracy_reward/mean": 0.58125,
	"rewards/qwen_accuracy_reward/std": 0.4772630840539932,
	"step": 360,
	"step_time": 100.1825181835331
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.05,
	"completions/max_length": 3610.3,
	"completions/max_terminated_length": 3382.4,
	"completions/mean_length": 1473.825,
	"completions/mean_terminated_length": 1346.1535278320312,
	"completions/min_length": 388.9,
	"completions/min_terminated_length": 388.9,
	"entropy": 0.3781063288450241,
	"epoch": 2.7007299270072993,
	"frac_reward_zero_std": 0.575,
	"grad_norm": 1.4375,
	"learning_rate": 1.0218978102189781e-06,
	"loss": 0.0401,
	"num_tokens": 46681234.0,
	"reward": 0.55625,
	"reward_std": 0.19663594886660576,
	"rewards/qwen_accuracy_reward/mean": 0.55625,
	"rewards/qwen_accuracy_reward/std": 0.4555644616484642,
	"step": 370,
	"step_time": 121.74294393500313
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 2807.6,
	"completions/max_terminated_length": 2798.0,
	"completions/mean_length": 1129.05,
	"completions/mean_terminated_length": 1113.3502197265625,
	"completions/min_length": 362.2,
	"completions/min_terminated_length": 362.2,
	"entropy": 0.3525690257549286,
	"epoch": 2.7737226277372264,
	"frac_reward_zero_std": 0.725,
	"grad_norm": 0.66015625,
	"learning_rate": 7.785888077858882e-07,
	"loss": 0.024,
	"num_tokens": 47890746.0,
	"reward": 0.609375,
	"reward_std": 0.12319448739290237,
	"rewards/qwen_accuracy_reward/mean": 0.609375,
	"rewards/qwen_accuracy_reward/std": 0.42726452350616456,
	"step": 380,
	"step_time": 91.42815532507375
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 3030.0,
	"completions/max_terminated_length": 2951.3,
	"completions/mean_length": 1326.25,
	"completions/mean_terminated_length": 1238.2517333984374,
	"completions/min_length": 360.5,
	"completions/min_terminated_length": 360.5,
	"entropy": 0.3662068575620651,
	"epoch": 2.846715328467153,
	"frac_reward_zero_std": 0.575,
	"grad_norm": 0.640625,
	"learning_rate": 5.352798053527981e-07,
	"loss": 0.0044,
	"num_tokens": 49127866.0,
	"reward": 0.60625,
	"reward_std": 0.18800986632704736,
	"rewards/qwen_accuracy_reward/mean": 0.60625,
	"rewards/qwen_accuracy_reward/std": 0.4078336015343666,
	"step": 390,
	"step_time": 111.37025026166812
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 2633.6,
	"completions/max_terminated_length": 2633.6,
	"completions/mean_length": 1038.35625,
	"completions/mean_terminated_length": 1038.35625,
	"completions/min_length": 389.5,
	"completions/min_terminated_length": 389.5,
	"entropy": 0.3475939750671387,
	"epoch": 2.9197080291970803,
	"frac_reward_zero_std": 0.775,
	"grad_norm": 0.859375,
	"learning_rate": 2.9197080291970804e-07,
	"loss": 0.0046,
	"num_tokens": 50219684.0,
	"reward": 0.703125,
	"reward_std": 0.09816569313406945,
	"rewards/qwen_accuracy_reward/mean": 0.703125,
	"rewards/qwen_accuracy_reward/std": 0.3836729422211647,
	"step": 400,
	"step_time": 87.25570530630648
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.040625,
	"completions/max_length": 3304.6,
	"completions/max_terminated_length": 3197.0,
	"completions/mean_length": 1342.228125,
	"completions/mean_terminated_length": 1226.4643310546876,
	"completions/min_length": 403.7,
	"completions/min_terminated_length": 403.7,
	"entropy": 0.36126827299594877,
	"epoch": 2.9927007299270074,
	"frac_reward_zero_std": 0.6,
	"grad_norm": 0.0,
	"learning_rate": 4.866180048661801e-08,
	"loss": 0.0057,
	"num_tokens": 51585277.0,
	"reward": 0.634375,
	"reward_std": 0.18369721844792367,
	"rewards/qwen_accuracy_reward/mean": 0.634375,
	"rewards/qwen_accuracy_reward/std": 0.4095410585403442,
	"step": 410,
	"step_time": 121.25470138275996
	}
	],
	"logging_steps": 10,
	"max_steps": 411,
	"num_input_tokens_seen": 51708911,
	"num_train_epochs": 3,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}