leonMW's picture
Upload folder using huggingface_hub
1628d0a verified
{
"best_global_step": 128,
"best_metric": 0.00011446899588918313,
"best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-14B-Staged-1/checkpoint-128",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 128,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2976.0,
"completions/mean_length": 593.9775390625,
"completions/mean_terminated_length": 579.372314453125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"entropy": 0.21448766812682152,
"epoch": 0.015625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15673470497131348,
"learning_rate": 1e-05,
"loss": 0.095,
"num_tokens": 2563666.0,
"reward": 4.010468482971191,
"reward_std": 0.056801095604896545,
"rewards/ngram_repetition2/mean": 0.672095000743866,
"rewards/ngram_repetition2/std": 0.10666719824075699,
"rewards/ngram_repetition3/mean": 0.8145524859428406,
"rewards/ngram_repetition3/std": 0.08704482018947601,
"rewards/symbolic_reward_accuracy/mean": 0.99609375,
"rewards/symbolic_reward_accuracy/std": 0.06239304319024086,
"rewards/symbolic_reward_partial_score/mean": 0.996826171875,
"rewards/symbolic_reward_partial_score/std": 0.05290473252534866,
"rewards/tag_count_reward/mean": 0.9970703125,
"rewards/tag_count_reward/std": 0.03817030414938927,
"rewards/thinking_answer_ratio_reward/mean": 0.9518401622772217,
"rewards/thinking_answer_ratio_reward/std": 0.07770728319883347,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0463709831237793,
"sampling/importance_sampling_ratio/min": 0.0010976478224620223,
"sampling/sampling_logp_difference/max": 6.8145856857299805,
"sampling/sampling_logp_difference/mean": 0.09316523373126984,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.3932291666666667,
"clip_ratio/low_min": 0.14583333333333334,
"clip_ratio/region_mean": 0.3932291666666667,
"entropy": 0.26243093982338905,
"epoch": 0.0625,
"grad_norm": 0.12268827110528946,
"learning_rate": 1e-05,
"loss": 0.0979,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.3955078125,
"clip_ratio/low_min": 0.1875,
"clip_ratio/region_mean": 0.3955078125,
"entropy": 0.32875449024140835,
"epoch": 0.125,
"grad_norm": 0.07022340595722198,
"learning_rate": 1e-05,
"loss": 0.0903,
"step": 8
},
{
"clip_ratio/high_max": 0.078125,
"clip_ratio/high_mean": 0.0224609375,
"clip_ratio/low_mean": 0.3046875,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.3271484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2691.0,
"completions/max_terminated_length": 2691.0,
"completions/mean_length": 550.14892578125,
"completions/mean_terminated_length": 550.14892578125,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 0.3310157172381878,
"epoch": 0.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06514108180999756,
"learning_rate": 1e-05,
"loss": 0.0373,
"num_tokens": 5042563.0,
"reward": 4.015265464782715,
"reward_std": 0.03912237286567688,
"rewards/ngram_repetition2/mean": 0.683986246585846,
"rewards/ngram_repetition2/std": 0.06738097220659256,
"rewards/ngram_repetition3/mean": 0.8293638825416565,
"rewards/ngram_repetition3/std": 0.05390477925539017,
"rewards/symbolic_reward_accuracy/mean": 0.99658203125,
"rewards/symbolic_reward_accuracy/std": 0.05837765336036682,
"rewards/symbolic_reward_partial_score/mean": 0.997314453125,
"rewards/symbolic_reward_partial_score/std": 0.04809629172086716,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9653676748275757,
"rewards/thinking_answer_ratio_reward/std": 0.017185064032673836,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.079781413078308,
"sampling/importance_sampling_ratio/min": 0.0017149768536910415,
"sampling/sampling_logp_difference/max": 6.368355751037598,
"sampling/sampling_logp_difference/mean": 0.13660897314548492,
"step": 12
},
{
"clip_ratio/high_max": 0.1171875,
"clip_ratio/high_mean": 0.025390625,
"clip_ratio/low_mean": 0.4423828125,
"clip_ratio/low_min": 0.15625,
"clip_ratio/region_mean": 0.4677734375,
"entropy": 0.3334905654191971,
"epoch": 0.25,
"grad_norm": 0.05110664293169975,
"learning_rate": 1e-05,
"loss": 0.0353,
"step": 16
},
{
"clip_ratio/high_max": 0.171875,
"clip_ratio/high_mean": 0.068359375,
"clip_ratio/low_mean": 0.2587890625,
"clip_ratio/low_min": 0.0859375,
"clip_ratio/region_mean": 0.3271484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 462.2470703125,
"completions/mean_terminated_length": 462.2470703125,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"entropy": 0.36026287637650967,
"epoch": 0.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04351092875003815,
"learning_rate": 1e-05,
"loss": 0.0274,
"num_tokens": 7343101.0,
"reward": 3.989396095275879,
"reward_std": 0.1385808140039444,
"rewards/ngram_repetition2/mean": 0.7290781736373901,
"rewards/ngram_repetition2/std": 0.061636194586753845,
"rewards/ngram_repetition3/mean": 0.8645428419113159,
"rewards/ngram_repetition3/std": 0.047739289700984955,
"rewards/symbolic_reward_accuracy/mean": 0.98681640625,
"rewards/symbolic_reward_accuracy/std": 0.11408830434083939,
"rewards/symbolic_reward_partial_score/mean": 0.990478515625,
"rewards/symbolic_reward_partial_score/std": 0.08719795942306519,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9592640399932861,
"rewards/thinking_answer_ratio_reward/std": 0.021742122247815132,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0902045965194702,
"sampling/importance_sampling_ratio/min": 0.0015116139547899365,
"sampling/sampling_logp_difference/max": 6.494577407836914,
"sampling/sampling_logp_difference/mean": 0.15023121237754822,
"step": 20
},
{
"clip_ratio/high_max": 0.1640625,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.3720703125,
"clip_ratio/low_min": 0.1875,
"clip_ratio/region_mean": 0.4189453125,
"entropy": 0.366955591365695,
"epoch": 0.375,
"grad_norm": 0.039240479469299316,
"learning_rate": 1e-05,
"loss": 0.0148,
"step": 24
},
{
"clip_ratio/high_max": 0.1953125,
"clip_ratio/high_mean": 0.0732421875,
"clip_ratio/low_mean": 0.2822265625,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.35546875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1998.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 429.3818359375,
"completions/mean_terminated_length": 429.3818359375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.37247131764888763,
"epoch": 0.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04849822074174881,
"learning_rate": 1e-05,
"loss": 0.0149,
"num_tokens": 9582987.0,
"reward": 4.010948181152344,
"reward_std": 0.06129944697022438,
"rewards/ngram_repetition2/mean": 0.7602318525314331,
"rewards/ngram_repetition2/std": 0.05239401385188103,
"rewards/ngram_repetition3/mean": 0.8901417255401611,
"rewards/ngram_repetition3/std": 0.039680566638708115,
"rewards/symbolic_reward_accuracy/mean": 0.99462890625,
"rewards/symbolic_reward_accuracy/std": 0.07310851663351059,
"rewards/symbolic_reward_partial_score/mean": 0.99560546875,
"rewards/symbolic_reward_partial_score/std": 0.06236054003238678,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9581317901611328,
"rewards/thinking_answer_ratio_reward/std": 0.03222493454813957,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0975332260131836,
"sampling/importance_sampling_ratio/min": 0.002007565228268504,
"sampling/sampling_logp_difference/max": 6.210832595825195,
"sampling/sampling_logp_difference/mean": 0.15882712602615356,
"step": 28
},
{
"clip_ratio/high_max": 0.203125,
"clip_ratio/high_mean": 0.056640625,
"clip_ratio/low_mean": 0.435546875,
"clip_ratio/low_min": 0.1953125,
"clip_ratio/region_mean": 0.4921875,
"entropy": 0.3766433894634247,
"epoch": 0.5,
"grad_norm": 0.036272790282964706,
"learning_rate": 1e-05,
"loss": 0.0171,
"step": 32
},
{
"clip_ratio/high_max": 0.2109375,
"clip_ratio/high_mean": 0.08984375,
"clip_ratio/low_mean": 0.2900390625,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.3798828125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1418.0,
"completions/max_terminated_length": 1418.0,
"completions/mean_length": 380.4365234375,
"completions/mean_terminated_length": 380.4365234375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"entropy": 0.37771076895296574,
"epoch": 0.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04128963500261307,
"learning_rate": 1e-05,
"loss": 0.0201,
"num_tokens": 11687689.0,
"reward": 4.007978439331055,
"reward_std": 0.0718117207288742,
"rewards/ngram_repetition2/mean": 0.7869799733161926,
"rewards/ngram_repetition2/std": 0.049044348299503326,
"rewards/ngram_repetition3/mean": 0.9102581143379211,
"rewards/ngram_repetition3/std": 0.03698350489139557,
"rewards/symbolic_reward_accuracy/mean": 0.9931640625,
"rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
"rewards/symbolic_reward_partial_score/mean": 0.9951171875,
"rewards/symbolic_reward_partial_score/std": 0.06232419237494469,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9560877084732056,
"rewards/thinking_answer_ratio_reward/std": 0.013234787620604038,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0998252630233765,
"sampling/importance_sampling_ratio/min": 0.001202415325678885,
"sampling/sampling_logp_difference/max": 6.723423004150391,
"sampling/sampling_logp_difference/mean": 0.1619987189769745,
"step": 36
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.087890625,
"clip_ratio/low_mean": 0.3935546875,
"clip_ratio/low_min": 0.1640625,
"clip_ratio/region_mean": 0.4814453125,
"entropy": 0.3803216014057398,
"epoch": 0.625,
"grad_norm": 0.03413194790482521,
"learning_rate": 1e-05,
"loss": 0.0067,
"step": 40
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.263671875,
"clip_ratio/low_min": 0.0859375,
"clip_ratio/region_mean": 0.396484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1068.0,
"completions/max_terminated_length": 1068.0,
"completions/mean_length": 341.1298828125,
"completions/mean_terminated_length": 341.1298828125,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"entropy": 0.3850418608635664,
"epoch": 0.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031503826379776,
"learning_rate": 1e-05,
"loss": 0.0112,
"num_tokens": 13728531.0,
"reward": 4.011279106140137,
"reward_std": 0.06094657629728317,
"rewards/ngram_repetition2/mean": 0.824249267578125,
"rewards/ngram_repetition2/std": 0.043261680752038956,
"rewards/ngram_repetition3/mean": 0.9390015602111816,
"rewards/ngram_repetition3/std": 0.02868303656578064,
"rewards/symbolic_reward_accuracy/mean": 0.994140625,
"rewards/symbolic_reward_accuracy/std": 0.07634060829877853,
"rewards/symbolic_reward_partial_score/mean": 0.995849609375,
"rewards/symbolic_reward_partial_score/std": 0.05727367848157883,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9515953063964844,
"rewards/thinking_answer_ratio_reward/std": 0.01442283671349287,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1041243076324463,
"sampling/importance_sampling_ratio/min": 0.0016117201885208488,
"sampling/sampling_logp_difference/max": 6.430453300476074,
"sampling/sampling_logp_difference/mean": 0.1668510138988495,
"step": 44
},
{
"clip_ratio/high_max": 0.21875,
"clip_ratio/high_mean": 0.0908203125,
"clip_ratio/low_mean": 0.3994140625,
"clip_ratio/low_min": 0.1796875,
"clip_ratio/region_mean": 0.490234375,
"entropy": 0.3868873305618763,
"epoch": 0.75,
"grad_norm": 0.03029937855899334,
"learning_rate": 1e-05,
"loss": 0.009,
"step": 48
},
{
"clip_ratio/high_max": 0.1875,
"clip_ratio/high_mean": 0.09765625,
"clip_ratio/low_mean": 0.2685546875,
"clip_ratio/low_min": 0.0859375,
"clip_ratio/region_mean": 0.3662109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 562.0,
"completions/max_terminated_length": 562.0,
"completions/mean_length": 307.357421875,
"completions/mean_terminated_length": 307.357421875,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"entropy": 0.38150897435843945,
"epoch": 0.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.026803122833371162,
"learning_rate": 1e-05,
"loss": 0.0116,
"num_tokens": 15703535.0,
"reward": 4.009974479675293,
"reward_std": 0.060223549604415894,
"rewards/ngram_repetition2/mean": 0.8514289855957031,
"rewards/ngram_repetition2/std": 0.037876468151807785,
"rewards/ngram_repetition3/mean": 0.9560329914093018,
"rewards/ngram_repetition3/std": 0.02248253859579563,
"rewards/symbolic_reward_accuracy/mean": 0.99365234375,
"rewards/symbolic_reward_accuracy/std": 0.07943830639123917,
"rewards/symbolic_reward_partial_score/mean": 0.9951171875,
"rewards/symbolic_reward_partial_score/std": 0.06425390392541885,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9478025436401367,
"rewards/thinking_answer_ratio_reward/std": 0.014074806123971939,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1058030128479004,
"sampling/importance_sampling_ratio/min": 0.00232778606005013,
"sampling/sampling_logp_difference/max": 6.062837600708008,
"sampling/sampling_logp_difference/mean": 0.1683150827884674,
"step": 52
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.3798828125,
"clip_ratio/low_min": 0.171875,
"clip_ratio/region_mean": 0.4970703125,
"entropy": 0.3815920725464821,
"epoch": 0.875,
"grad_norm": 0.024431413039565086,
"learning_rate": 1e-05,
"loss": 0.0028,
"step": 56
},
{
"clip_ratio/high_max": 0.28125,
"clip_ratio/high_mean": 0.1259765625,
"clip_ratio/low_mean": 0.244140625,
"clip_ratio/low_min": 0.0859375,
"clip_ratio/region_mean": 0.3701171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 748.0,
"completions/max_terminated_length": 748.0,
"completions/mean_length": 303.3486328125,
"completions/mean_terminated_length": 303.3486328125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"entropy": 0.3807190824300051,
"epoch": 0.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.025611596181988716,
"learning_rate": 1e-05,
"loss": 0.0043,
"num_tokens": 17668665.0,
"reward": 4.0197319984436035,
"reward_std": 0.03265571966767311,
"rewards/ngram_repetition2/mean": 0.8663961887359619,
"rewards/ngram_repetition2/std": 0.035074710845947266,
"rewards/ngram_repetition3/mean": 0.9652769565582275,
"rewards/ngram_repetition3/std": 0.019182542338967323,
"rewards/symbolic_reward_accuracy/mean": 0.9970703125,
"rewards/symbolic_reward_accuracy/std": 0.0540604442358017,
"rewards/symbolic_reward_partial_score/mean": 0.997802734375,
"rewards/symbolic_reward_partial_score/std": 0.04274481162428856,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9471962451934814,
"rewards/thinking_answer_ratio_reward/std": 0.01360977441072464,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1089144945144653,
"sampling/importance_sampling_ratio/min": 0.0014806825201958418,
"sampling/sampling_logp_difference/max": 6.515252113342285,
"sampling/sampling_logp_difference/mean": 0.17024272680282593,
"step": 60
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.0966796875,
"clip_ratio/low_mean": 0.4091796875,
"clip_ratio/low_min": 0.1796875,
"clip_ratio/region_mean": 0.505859375,
"entropy": 0.38510454073548317,
"epoch": 1.0,
"grad_norm": 0.024223582819104195,
"learning_rate": 1e-05,
"loss": 0.0096,
"step": 64
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 461.2631578947368,
"eval_completions/max_terminated_length": 461.2631578947368,
"eval_completions/mean_length": 285.5579769736842,
"eval_completions/mean_terminated_length": 285.5579769736842,
"eval_completions/min_length": 171.5,
"eval_completions/min_terminated_length": 171.5,
"eval_entropy": 0.4054659785408723,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.0016032923012971878,
"eval_num_tokens": 17668665.0,
"eval_reward": 4.020004686556365,
"eval_reward_std": 0.03250175694962596,
"eval_rewards/ngram_repetition2/mean": 0.8798494354674691,
"eval_rewards/ngram_repetition2/std": 0.03326555988506267,
"eval_rewards/ngram_repetition3/mean": 0.9702528150458085,
"eval_rewards/ngram_repetition3/std": 0.01740601247078494,
"eval_rewards/symbolic_reward_accuracy/mean": 0.9971217105263158,
"eval_rewards/symbolic_reward_accuracy/std": 0.02106231843170367,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9977384868421053,
"eval_rewards/symbolic_reward_partial_score/std": 0.01612810790538788,
"eval_rewards/tag_count_reward/mean": 1.0,
"eval_rewards/tag_count_reward/std": 0.0,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9521772830109847,
"eval_rewards/thinking_answer_ratio_reward/std": 0.01325364425582321,
"eval_runtime": 1024.8679,
"eval_samples_per_second": 0.146,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.113611522473787,
"eval_sampling/importance_sampling_ratio/min": 0.013400349171685153,
"eval_sampling/sampling_logp_difference/max": 4.50554064700478,
"eval_sampling/sampling_logp_difference/mean": 0.17763970870720713,
"eval_steps_per_second": 0.003,
"step": 64
},
{
"clip_ratio/high_max": 0.2578125,
"clip_ratio/high_mean": 0.111328125,
"clip_ratio/low_mean": 0.275390625,
"clip_ratio/low_min": 0.0859375,
"clip_ratio/region_mean": 0.38671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 608.0,
"completions/max_terminated_length": 608.0,
"completions/mean_length": 289.3232421875,
"completions/mean_terminated_length": 289.3232421875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"entropy": 0.38059367053210735,
"epoch": 1.0625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.025241762399673462,
"learning_rate": 1e-05,
"loss": 0.0059,
"num_tokens": 19605071.0,
"reward": 4.018987655639648,
"reward_std": 0.03650522977113724,
"rewards/ngram_repetition2/mean": 0.883100688457489,
"rewards/ngram_repetition2/std": 0.031195858493447304,
"rewards/ngram_repetition3/mean": 0.9731463193893433,
"rewards/ngram_repetition3/std": 0.01614222675561905,
"rewards/symbolic_reward_accuracy/mean": 0.99658203125,
"rewards/symbolic_reward_accuracy/std": 0.05837765336036682,
"rewards/symbolic_reward_partial_score/mean": 0.997802734375,
"rewards/symbolic_reward_partial_score/std": 0.03978516161441803,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.945807933807373,
"rewards/thinking_answer_ratio_reward/std": 0.013301613740622997,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.109527587890625,
"sampling/importance_sampling_ratio/min": 0.0029354249127209187,
"sampling/sampling_logp_difference/max": 5.830903053283691,
"sampling/sampling_logp_difference/mean": 0.17033454775810242,
"step": 68
},
{
"clip_ratio/high_max": 0.2578125,
"clip_ratio/high_mean": 0.0927734375,
"clip_ratio/low_mean": 0.4150390625,
"clip_ratio/low_min": 0.171875,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.37396370619535446,
"epoch": 1.125,
"grad_norm": 0.029076889157295227,
"learning_rate": 1e-05,
"loss": 0.0047,
"step": 72
},
{
"clip_ratio/high_max": 0.234375,
"clip_ratio/high_mean": 0.0966796875,
"clip_ratio/low_mean": 0.2822265625,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.37890625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 584.0,
"completions/max_terminated_length": 584.0,
"completions/mean_length": 276.4482421875,
"completions/mean_terminated_length": 276.4482421875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"entropy": 0.3730292562395334,
"epoch": 1.1875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027809176594018936,
"learning_rate": 1e-05,
"loss": 0.0051,
"num_tokens": 21516773.0,
"reward": 4.028271198272705,
"reward_std": 0.0003417174448259175,
"rewards/ngram_repetition2/mean": 0.9015256762504578,
"rewards/ngram_repetition2/std": 0.027837282046675682,
"rewards/ngram_repetition3/mean": 0.9807107448577881,
"rewards/ngram_repetition3/std": 0.01325258519500494,
"rewards/symbolic_reward_accuracy/mean": 1.0,
"rewards/symbolic_reward_accuracy/std": 0.0,
"rewards/symbolic_reward_partial_score/mean": 1.0,
"rewards/symbolic_reward_partial_score/std": 0.0,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9448793530464172,
"rewards/thinking_answer_ratio_reward/std": 0.013124315068125725,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1104830503463745,
"sampling/importance_sampling_ratio/min": 0.0012398589169606566,
"sampling/sampling_logp_difference/max": 6.692757606506348,
"sampling/sampling_logp_difference/mean": 0.17016106843948364,
"step": 76
},
{
"clip_ratio/high_max": 0.2109375,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.44140625,
"clip_ratio/low_min": 0.1796875,
"clip_ratio/region_mean": 0.52734375,
"entropy": 0.3784319721162319,
"epoch": 1.25,
"grad_norm": 0.025980466976761818,
"learning_rate": 1e-05,
"loss": 0.0045,
"step": 80
},
{
"clip_ratio/high_max": 0.21875,
"clip_ratio/high_mean": 0.0986328125,
"clip_ratio/low_mean": 0.263671875,
"clip_ratio/low_min": 0.1171875,
"clip_ratio/region_mean": 0.3623046875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 579.0,
"completions/max_terminated_length": 579.0,
"completions/mean_length": 267.55810546875,
"completions/mean_terminated_length": 267.55810546875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"entropy": 0.3766753375530243,
"epoch": 1.3125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027696413919329643,
"learning_rate": 1e-05,
"loss": 0.0048,
"num_tokens": 23416924.0,
"reward": 4.016022682189941,
"reward_std": 0.05009516328573227,
"rewards/ngram_repetition2/mean": 0.9155223369598389,
"rewards/ngram_repetition2/std": 0.02600882574915886,
"rewards/ngram_repetition3/mean": 0.986021876335144,
"rewards/ngram_repetition3/std": 0.011384704150259495,
"rewards/symbolic_reward_accuracy/mean": 0.99560546875,
"rewards/symbolic_reward_accuracy/std": 0.06616159528493881,
"rewards/symbolic_reward_partial_score/mean": 0.996337890625,
"rewards/symbolic_reward_partial_score/std": 0.05730699002742767,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9458417892456055,
"rewards/thinking_answer_ratio_reward/std": 0.012686546891927719,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1121628284454346,
"sampling/importance_sampling_ratio/min": 0.0017338492907583714,
"sampling/sampling_logp_difference/max": 6.3574113845825195,
"sampling/sampling_logp_difference/mean": 0.17337316274642944,
"step": 84
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.0888671875,
"clip_ratio/low_mean": 0.3642578125,
"clip_ratio/low_min": 0.140625,
"clip_ratio/region_mean": 0.453125,
"entropy": 0.37970343604683876,
"epoch": 1.375,
"grad_norm": 0.024490008130669594,
"learning_rate": 1e-05,
"loss": 0.003,
"step": 88
},
{
"clip_ratio/high_max": 0.3046875,
"clip_ratio/high_mean": 0.1318359375,
"clip_ratio/low_mean": 0.2841796875,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.416015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 572.0,
"completions/max_terminated_length": 572.0,
"completions/mean_length": 270.39697265625,
"completions/mean_terminated_length": 270.39697265625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"entropy": 0.3762910068035126,
"epoch": 1.4375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.020992670208215714,
"learning_rate": 1e-05,
"loss": 0.0028,
"num_tokens": 25321225.0,
"reward": 4.025911808013916,
"reward_std": 0.011015485972166061,
"rewards/ngram_repetition2/mean": 0.9226169586181641,
"rewards/ngram_repetition2/std": 0.022972460836172104,
"rewards/ngram_repetition3/mean": 0.9887727499008179,
"rewards/ngram_repetition3/std": 0.009258674457669258,
"rewards/symbolic_reward_accuracy/mean": 0.9990234375,
"rewards/symbolic_reward_accuracy/std": 0.031242365017533302,
"rewards/symbolic_reward_partial_score/mean": 0.999267578125,
"rewards/symbolic_reward_partial_score/std": 0.02470046654343605,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9483402967453003,
"rewards/thinking_answer_ratio_reward/std": 0.011332533322274685,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1124824285507202,
"sampling/importance_sampling_ratio/min": 0.0016270694322884083,
"sampling/sampling_logp_difference/max": 6.4209747314453125,
"sampling/sampling_logp_difference/mean": 0.1726430356502533,
"step": 92
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.0869140625,
"clip_ratio/low_mean": 0.4208984375,
"clip_ratio/low_min": 0.1640625,
"clip_ratio/region_mean": 0.5078125,
"entropy": 0.3793158773332834,
"epoch": 1.5,
"grad_norm": 0.02606261894106865,
"learning_rate": 1e-05,
"loss": 0.0039,
"step": 96
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.1201171875,
"clip_ratio/low_mean": 0.248046875,
"clip_ratio/low_min": 0.09375,
"clip_ratio/region_mean": 0.3681640625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 538.0,
"completions/max_terminated_length": 538.0,
"completions/mean_length": 266.08642578125,
"completions/mean_terminated_length": 266.08642578125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"entropy": 0.3877852316945791,
"epoch": 1.5625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.023656491190195084,
"learning_rate": 1e-05,
"loss": 0.0022,
"num_tokens": 27206714.0,
"reward": 4.023347854614258,
"reward_std": 0.02174052968621254,
"rewards/ngram_repetition2/mean": 0.9308995008468628,
"rewards/ngram_repetition2/std": 0.022394709289073944,
"rewards/ngram_repetition3/mean": 0.990999698638916,
"rewards/ngram_repetition3/std": 0.008796615526080132,
"rewards/symbolic_reward_accuracy/mean": 0.998046875,
"rewards/symbolic_reward_accuracy/std": 0.044161777943372726,
"rewards/symbolic_reward_partial_score/mean": 0.99853515625,
"rewards/symbolic_reward_partial_score/std": 0.03491636738181114,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9500227570533752,
"rewards/thinking_answer_ratio_reward/std": 0.009935123845934868,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1162710189819336,
"sampling/importance_sampling_ratio/min": 0.0009795920923352242,
"sampling/sampling_logp_difference/max": 6.928374290466309,
"sampling/sampling_logp_difference/mean": 0.17734766006469727,
"step": 100
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.0986328125,
"clip_ratio/low_mean": 0.41015625,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.5087890625,
"entropy": 0.3830826133489609,
"epoch": 1.625,
"grad_norm": 0.020566586405038834,
"learning_rate": 1e-05,
"loss": 0.0053,
"step": 104
},
{
"clip_ratio/high_max": 0.25,
"clip_ratio/high_mean": 0.1044921875,
"clip_ratio/low_mean": 0.2587890625,
"clip_ratio/low_min": 0.0625,
"clip_ratio/region_mean": 0.36328125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 428.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 255.6845703125,
"completions/mean_terminated_length": 255.6845703125,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"entropy": 0.3835675735026598,
"epoch": 1.6875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02022464945912361,
"learning_rate": 1e-05,
"loss": 0.0024,
"num_tokens": 29075892.0,
"reward": 4.025917053222656,
"reward_std": 0.011935505084693432,
"rewards/ngram_repetition2/mean": 0.9410616159439087,
"rewards/ngram_repetition2/std": 0.019873203709721565,
"rewards/ngram_repetition3/mean": 0.9937294721603394,
"rewards/ngram_repetition3/std": 0.0068909707479178905,
"rewards/symbolic_reward_accuracy/mean": 0.9990234375,
"rewards/symbolic_reward_accuracy/std": 0.031242365017533302,
"rewards/symbolic_reward_partial_score/mean": 0.9990234375,
"rewards/symbolic_reward_partial_score/std": 0.031242365017533302,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.949914813041687,
"rewards/thinking_answer_ratio_reward/std": 0.009325054474174976,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1154059171676636,
"sampling/importance_sampling_ratio/min": 0.0033768429420888424,
"sampling/sampling_logp_difference/max": 5.690814018249512,
"sampling/sampling_logp_difference/mean": 0.1758098602294922,
"step": 108
},
{
"clip_ratio/high_max": 0.28125,
"clip_ratio/high_mean": 0.1044921875,
"clip_ratio/low_mean": 0.390625,
"clip_ratio/low_min": 0.1640625,
"clip_ratio/region_mean": 0.4951171875,
"entropy": 0.3785879872739315,
"epoch": 1.75,
"grad_norm": 0.018482210114598274,
"learning_rate": 1e-05,
"loss": 0.0032,
"step": 112
},
{
"clip_ratio/high_max": 0.2578125,
"clip_ratio/high_mean": 0.1142578125,
"clip_ratio/low_mean": 0.2529296875,
"clip_ratio/low_min": 0.09375,
"clip_ratio/region_mean": 0.3671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 250.443359375,
"completions/mean_terminated_length": 250.443359375,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"entropy": 0.37489572539925575,
"epoch": 1.8125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.024314837530255318,
"learning_rate": 1e-05,
"loss": 0.003,
"num_tokens": 30937664.0,
"reward": 4.023824691772461,
"reward_std": 0.020693320780992508,
"rewards/ngram_repetition2/mean": 0.9501452445983887,
"rewards/ngram_repetition2/std": 0.017883246764540672,
"rewards/ngram_repetition3/mean": 0.9955066442489624,
"rewards/ngram_repetition3/std": 0.0059976824559271336,
"rewards/symbolic_reward_accuracy/mean": 0.998046875,
"rewards/symbolic_reward_accuracy/std": 0.044161777943372726,
"rewards/symbolic_reward_partial_score/mean": 0.998779296875,
"rewards/symbolic_reward_partial_score/std": 0.029213331639766693,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9495062828063965,
"rewards/thinking_answer_ratio_reward/std": 0.008742393925786018,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1135070323944092,
"sampling/importance_sampling_ratio/min": 0.0015908645000308752,
"sampling/sampling_logp_difference/max": 6.443477630615234,
"sampling/sampling_logp_difference/mean": 0.1719757616519928,
"step": 116
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.357421875,
"clip_ratio/low_min": 0.109375,
"clip_ratio/region_mean": 0.466796875,
"entropy": 0.3766605220735073,
"epoch": 1.875,
"grad_norm": 0.020362574607133865,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 120
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.115234375,
"clip_ratio/low_mean": 0.228515625,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.34375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 250.9296875,
"completions/mean_terminated_length": 250.9296875,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"entropy": 0.36353896372020245,
"epoch": 1.9375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021443529054522514,
"learning_rate": 1e-05,
"loss": -0.0007,
"num_tokens": 32792112.0,
"reward": 4.0251312255859375,
"reward_std": 0.015792513266205788,
"rewards/ngram_repetition2/mean": 0.95717853307724,
"rewards/ngram_repetition2/std": 0.016464218497276306,
"rewards/ngram_repetition3/mean": 0.9966345429420471,
"rewards/ngram_repetition3/std": 0.004997830372303724,
"rewards/symbolic_reward_accuracy/mean": 0.99853515625,
"rewards/symbolic_reward_accuracy/std": 0.038254573941230774,
"rewards/symbolic_reward_partial_score/mean": 0.9990234375,
"rewards/symbolic_reward_partial_score/std": 0.027052273973822594,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9499266743659973,
"rewards/thinking_answer_ratio_reward/std": 0.008525022305548191,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1124534606933594,
"sampling/importance_sampling_ratio/min": 0.0008624744368717074,
"sampling/sampling_logp_difference/max": 7.0557050704956055,
"sampling/sampling_logp_difference/mean": 0.1690160632133484,
"step": 124
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.12890625,
"clip_ratio/low_mean": 0.3896484375,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.5185546875,
"entropy": 0.3609350845217705,
"epoch": 2.0,
"grad_norm": 0.019108088687062263,
"learning_rate": 1e-05,
"loss": 0.0032,
"step": 128
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 350.5,
"eval_completions/max_terminated_length": 350.5,
"eval_completions/mean_length": 250.32401315789474,
"eval_completions/mean_terminated_length": 250.32401315789474,
"eval_completions/min_length": 175.52631578947367,
"eval_completions/min_terminated_length": 175.52631578947367,
"eval_entropy": 0.3766616096622066,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.00011446899588918313,
"eval_num_tokens": 32792112.0,
"eval_reward": 4.021890213615016,
"eval_reward_std": 0.028941871161039575,
"eval_rewards/ngram_repetition2/mean": 0.9618394170936785,
"eval_rewards/ngram_repetition2/std": 0.015655246342679386,
"eval_rewards/ngram_repetition3/mean": 0.9969736039638519,
"eval_rewards/ngram_repetition3/std": 0.004860803239831799,
"eval_rewards/symbolic_reward_accuracy/mean": 0.9971217105263158,
"eval_rewards/symbolic_reward_accuracy/std": 0.023026315789473683,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9985608552631579,
"eval_rewards/symbolic_reward_partial_score/std": 0.011513157894736841,
"eval_rewards/tag_count_reward/mean": 1.0,
"eval_rewards/tag_count_reward/std": 0.0,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9497831416757483,
"eval_rewards/thinking_answer_ratio_reward/std": 0.008818138268237052,
"eval_runtime": 918.9382,
"eval_samples_per_second": 0.163,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.1145887280765332,
"eval_sampling/importance_sampling_ratio/min": 0.016234988169009357,
"eval_sampling/sampling_logp_difference/max": 4.472469405124062,
"eval_sampling/sampling_logp_difference/mean": 0.172603645214909,
"eval_steps_per_second": 0.003,
"step": 128
},
{
"epoch": 2.0,
"step": 128,
"total_flos": 0.0,
"train_loss": 0.014476574131549569,
"train_runtime": 14432.346,
"train_samples_per_second": 0.146,
"train_steps_per_second": 0.009
}
],
"logging_steps": 4,
"max_steps": 128,
"num_input_tokens_seen": 32792112,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}