| { |
| "best_global_step": 128, |
| "best_metric": 0.00011446899588918313, |
| "best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-14B-Staged-1/checkpoint-128", |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 128, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.005859375, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 2976.0, |
| "completions/mean_length": 593.9775390625, |
| "completions/mean_terminated_length": 579.372314453125, |
| "completions/min_length": 161.0, |
| "completions/min_terminated_length": 161.0, |
| "entropy": 0.21448766812682152, |
| "epoch": 0.015625, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15673470497131348, |
| "learning_rate": 1e-05, |
| "loss": 0.095, |
| "num_tokens": 2563666.0, |
| "reward": 4.010468482971191, |
| "reward_std": 0.056801095604896545, |
| "rewards/ngram_repetition2/mean": 0.672095000743866, |
| "rewards/ngram_repetition2/std": 0.10666719824075699, |
| "rewards/ngram_repetition3/mean": 0.8145524859428406, |
| "rewards/ngram_repetition3/std": 0.08704482018947601, |
| "rewards/symbolic_reward_accuracy/mean": 0.99609375, |
| "rewards/symbolic_reward_accuracy/std": 0.06239304319024086, |
| "rewards/symbolic_reward_partial_score/mean": 0.996826171875, |
| "rewards/symbolic_reward_partial_score/std": 0.05290473252534866, |
| "rewards/tag_count_reward/mean": 0.9970703125, |
| "rewards/tag_count_reward/std": 0.03817030414938927, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9518401622772217, |
| "rewards/thinking_answer_ratio_reward/std": 0.07770728319883347, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.0463709831237793, |
| "sampling/importance_sampling_ratio/min": 0.0010976478224620223, |
| "sampling/sampling_logp_difference/max": 6.8145856857299805, |
| "sampling/sampling_logp_difference/mean": 0.09316523373126984, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.3932291666666667, |
| "clip_ratio/low_min": 0.14583333333333334, |
| "clip_ratio/region_mean": 0.3932291666666667, |
| "entropy": 0.26243093982338905, |
| "epoch": 0.0625, |
| "grad_norm": 0.12268827110528946, |
| "learning_rate": 1e-05, |
| "loss": 0.0979, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.3955078125, |
| "clip_ratio/low_min": 0.1875, |
| "clip_ratio/region_mean": 0.3955078125, |
| "entropy": 0.32875449024140835, |
| "epoch": 0.125, |
| "grad_norm": 0.07022340595722198, |
| "learning_rate": 1e-05, |
| "loss": 0.0903, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.078125, |
| "clip_ratio/high_mean": 0.0224609375, |
| "clip_ratio/low_mean": 0.3046875, |
| "clip_ratio/low_min": 0.1328125, |
| "clip_ratio/region_mean": 0.3271484375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2691.0, |
| "completions/max_terminated_length": 2691.0, |
| "completions/mean_length": 550.14892578125, |
| "completions/mean_terminated_length": 550.14892578125, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 0.3310157172381878, |
| "epoch": 0.1875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.06514108180999756, |
| "learning_rate": 1e-05, |
| "loss": 0.0373, |
| "num_tokens": 5042563.0, |
| "reward": 4.015265464782715, |
| "reward_std": 0.03912237286567688, |
| "rewards/ngram_repetition2/mean": 0.683986246585846, |
| "rewards/ngram_repetition2/std": 0.06738097220659256, |
| "rewards/ngram_repetition3/mean": 0.8293638825416565, |
| "rewards/ngram_repetition3/std": 0.05390477925539017, |
| "rewards/symbolic_reward_accuracy/mean": 0.99658203125, |
| "rewards/symbolic_reward_accuracy/std": 0.05837765336036682, |
| "rewards/symbolic_reward_partial_score/mean": 0.997314453125, |
| "rewards/symbolic_reward_partial_score/std": 0.04809629172086716, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9653676748275757, |
| "rewards/thinking_answer_ratio_reward/std": 0.017185064032673836, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.079781413078308, |
| "sampling/importance_sampling_ratio/min": 0.0017149768536910415, |
| "sampling/sampling_logp_difference/max": 6.368355751037598, |
| "sampling/sampling_logp_difference/mean": 0.13660897314548492, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.1171875, |
| "clip_ratio/high_mean": 0.025390625, |
| "clip_ratio/low_mean": 0.4423828125, |
| "clip_ratio/low_min": 0.15625, |
| "clip_ratio/region_mean": 0.4677734375, |
| "entropy": 0.3334905654191971, |
| "epoch": 0.25, |
| "grad_norm": 0.05110664293169975, |
| "learning_rate": 1e-05, |
| "loss": 0.0353, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.171875, |
| "clip_ratio/high_mean": 0.068359375, |
| "clip_ratio/low_mean": 0.2587890625, |
| "clip_ratio/low_min": 0.0859375, |
| "clip_ratio/region_mean": 0.3271484375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1782.0, |
| "completions/max_terminated_length": 1782.0, |
| "completions/mean_length": 462.2470703125, |
| "completions/mean_terminated_length": 462.2470703125, |
| "completions/min_length": 184.0, |
| "completions/min_terminated_length": 184.0, |
| "entropy": 0.36026287637650967, |
| "epoch": 0.3125, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.04351092875003815, |
| "learning_rate": 1e-05, |
| "loss": 0.0274, |
| "num_tokens": 7343101.0, |
| "reward": 3.989396095275879, |
| "reward_std": 0.1385808140039444, |
| "rewards/ngram_repetition2/mean": 0.7290781736373901, |
| "rewards/ngram_repetition2/std": 0.061636194586753845, |
| "rewards/ngram_repetition3/mean": 0.8645428419113159, |
| "rewards/ngram_repetition3/std": 0.047739289700984955, |
| "rewards/symbolic_reward_accuracy/mean": 0.98681640625, |
| "rewards/symbolic_reward_accuracy/std": 0.11408830434083939, |
| "rewards/symbolic_reward_partial_score/mean": 0.990478515625, |
| "rewards/symbolic_reward_partial_score/std": 0.08719795942306519, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9592640399932861, |
| "rewards/thinking_answer_ratio_reward/std": 0.021742122247815132, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.0902045965194702, |
| "sampling/importance_sampling_ratio/min": 0.0015116139547899365, |
| "sampling/sampling_logp_difference/max": 6.494577407836914, |
| "sampling/sampling_logp_difference/mean": 0.15023121237754822, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.1640625, |
| "clip_ratio/high_mean": 0.046875, |
| "clip_ratio/low_mean": 0.3720703125, |
| "clip_ratio/low_min": 0.1875, |
| "clip_ratio/region_mean": 0.4189453125, |
| "entropy": 0.366955591365695, |
| "epoch": 0.375, |
| "grad_norm": 0.039240479469299316, |
| "learning_rate": 1e-05, |
| "loss": 0.0148, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.1953125, |
| "clip_ratio/high_mean": 0.0732421875, |
| "clip_ratio/low_mean": 0.2822265625, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.35546875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1998.0, |
| "completions/max_terminated_length": 1998.0, |
| "completions/mean_length": 429.3818359375, |
| "completions/mean_terminated_length": 429.3818359375, |
| "completions/min_length": 28.0, |
| "completions/min_terminated_length": 28.0, |
| "entropy": 0.37247131764888763, |
| "epoch": 0.4375, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.04849822074174881, |
| "learning_rate": 1e-05, |
| "loss": 0.0149, |
| "num_tokens": 9582987.0, |
| "reward": 4.010948181152344, |
| "reward_std": 0.06129944697022438, |
| "rewards/ngram_repetition2/mean": 0.7602318525314331, |
| "rewards/ngram_repetition2/std": 0.05239401385188103, |
| "rewards/ngram_repetition3/mean": 0.8901417255401611, |
| "rewards/ngram_repetition3/std": 0.039680566638708115, |
| "rewards/symbolic_reward_accuracy/mean": 0.99462890625, |
| "rewards/symbolic_reward_accuracy/std": 0.07310851663351059, |
| "rewards/symbolic_reward_partial_score/mean": 0.99560546875, |
| "rewards/symbolic_reward_partial_score/std": 0.06236054003238678, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9581317901611328, |
| "rewards/thinking_answer_ratio_reward/std": 0.03222493454813957, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.0975332260131836, |
| "sampling/importance_sampling_ratio/min": 0.002007565228268504, |
| "sampling/sampling_logp_difference/max": 6.210832595825195, |
| "sampling/sampling_logp_difference/mean": 0.15882712602615356, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.203125, |
| "clip_ratio/high_mean": 0.056640625, |
| "clip_ratio/low_mean": 0.435546875, |
| "clip_ratio/low_min": 0.1953125, |
| "clip_ratio/region_mean": 0.4921875, |
| "entropy": 0.3766433894634247, |
| "epoch": 0.5, |
| "grad_norm": 0.036272790282964706, |
| "learning_rate": 1e-05, |
| "loss": 0.0171, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.2109375, |
| "clip_ratio/high_mean": 0.08984375, |
| "clip_ratio/low_mean": 0.2900390625, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.3798828125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1418.0, |
| "completions/max_terminated_length": 1418.0, |
| "completions/mean_length": 380.4365234375, |
| "completions/mean_terminated_length": 380.4365234375, |
| "completions/min_length": 171.0, |
| "completions/min_terminated_length": 171.0, |
| "entropy": 0.37771076895296574, |
| "epoch": 0.5625, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.04128963500261307, |
| "learning_rate": 1e-05, |
| "loss": 0.0201, |
| "num_tokens": 11687689.0, |
| "reward": 4.007978439331055, |
| "reward_std": 0.0718117207288742, |
| "rewards/ngram_repetition2/mean": 0.7869799733161926, |
| "rewards/ngram_repetition2/std": 0.049044348299503326, |
| "rewards/ngram_repetition3/mean": 0.9102581143379211, |
| "rewards/ngram_repetition3/std": 0.03698350489139557, |
| "rewards/symbolic_reward_accuracy/mean": 0.9931640625, |
| "rewards/symbolic_reward_accuracy/std": 0.08241677284240723, |
| "rewards/symbolic_reward_partial_score/mean": 0.9951171875, |
| "rewards/symbolic_reward_partial_score/std": 0.06232419237494469, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9560877084732056, |
| "rewards/thinking_answer_ratio_reward/std": 0.013234787620604038, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.0998252630233765, |
| "sampling/importance_sampling_ratio/min": 0.001202415325678885, |
| "sampling/sampling_logp_difference/max": 6.723423004150391, |
| "sampling/sampling_logp_difference/mean": 0.1619987189769745, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.25, |
| "clip_ratio/high_mean": 0.087890625, |
| "clip_ratio/low_mean": 0.3935546875, |
| "clip_ratio/low_min": 0.1640625, |
| "clip_ratio/region_mean": 0.4814453125, |
| "entropy": 0.3803216014057398, |
| "epoch": 0.625, |
| "grad_norm": 0.03413194790482521, |
| "learning_rate": 1e-05, |
| "loss": 0.0067, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.25, |
| "clip_ratio/high_mean": 0.1328125, |
| "clip_ratio/low_mean": 0.263671875, |
| "clip_ratio/low_min": 0.0859375, |
| "clip_ratio/region_mean": 0.396484375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1068.0, |
| "completions/max_terminated_length": 1068.0, |
| "completions/mean_length": 341.1298828125, |
| "completions/mean_terminated_length": 341.1298828125, |
| "completions/min_length": 141.0, |
| "completions/min_terminated_length": 141.0, |
| "entropy": 0.3850418608635664, |
| "epoch": 0.6875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.031503826379776, |
| "learning_rate": 1e-05, |
| "loss": 0.0112, |
| "num_tokens": 13728531.0, |
| "reward": 4.011279106140137, |
| "reward_std": 0.06094657629728317, |
| "rewards/ngram_repetition2/mean": 0.824249267578125, |
| "rewards/ngram_repetition2/std": 0.043261680752038956, |
| "rewards/ngram_repetition3/mean": 0.9390015602111816, |
| "rewards/ngram_repetition3/std": 0.02868303656578064, |
| "rewards/symbolic_reward_accuracy/mean": 0.994140625, |
| "rewards/symbolic_reward_accuracy/std": 0.07634060829877853, |
| "rewards/symbolic_reward_partial_score/mean": 0.995849609375, |
| "rewards/symbolic_reward_partial_score/std": 0.05727367848157883, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9515953063964844, |
| "rewards/thinking_answer_ratio_reward/std": 0.01442283671349287, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1041243076324463, |
| "sampling/importance_sampling_ratio/min": 0.0016117201885208488, |
| "sampling/sampling_logp_difference/max": 6.430453300476074, |
| "sampling/sampling_logp_difference/mean": 0.1668510138988495, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.21875, |
| "clip_ratio/high_mean": 0.0908203125, |
| "clip_ratio/low_mean": 0.3994140625, |
| "clip_ratio/low_min": 0.1796875, |
| "clip_ratio/region_mean": 0.490234375, |
| "entropy": 0.3868873305618763, |
| "epoch": 0.75, |
| "grad_norm": 0.03029937855899334, |
| "learning_rate": 1e-05, |
| "loss": 0.009, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.1875, |
| "clip_ratio/high_mean": 0.09765625, |
| "clip_ratio/low_mean": 0.2685546875, |
| "clip_ratio/low_min": 0.0859375, |
| "clip_ratio/region_mean": 0.3662109375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 562.0, |
| "completions/max_terminated_length": 562.0, |
| "completions/mean_length": 307.357421875, |
| "completions/mean_terminated_length": 307.357421875, |
| "completions/min_length": 140.0, |
| "completions/min_terminated_length": 140.0, |
| "entropy": 0.38150897435843945, |
| "epoch": 0.8125, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.026803122833371162, |
| "learning_rate": 1e-05, |
| "loss": 0.0116, |
| "num_tokens": 15703535.0, |
| "reward": 4.009974479675293, |
| "reward_std": 0.060223549604415894, |
| "rewards/ngram_repetition2/mean": 0.8514289855957031, |
| "rewards/ngram_repetition2/std": 0.037876468151807785, |
| "rewards/ngram_repetition3/mean": 0.9560329914093018, |
| "rewards/ngram_repetition3/std": 0.02248253859579563, |
| "rewards/symbolic_reward_accuracy/mean": 0.99365234375, |
| "rewards/symbolic_reward_accuracy/std": 0.07943830639123917, |
| "rewards/symbolic_reward_partial_score/mean": 0.9951171875, |
| "rewards/symbolic_reward_partial_score/std": 0.06425390392541885, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9478025436401367, |
| "rewards/thinking_answer_ratio_reward/std": 0.014074806123971939, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1058030128479004, |
| "sampling/importance_sampling_ratio/min": 0.00232778606005013, |
| "sampling/sampling_logp_difference/max": 6.062837600708008, |
| "sampling/sampling_logp_difference/mean": 0.1683150827884674, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.1171875, |
| "clip_ratio/low_mean": 0.3798828125, |
| "clip_ratio/low_min": 0.171875, |
| "clip_ratio/region_mean": 0.4970703125, |
| "entropy": 0.3815920725464821, |
| "epoch": 0.875, |
| "grad_norm": 0.024431413039565086, |
| "learning_rate": 1e-05, |
| "loss": 0.0028, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.28125, |
| "clip_ratio/high_mean": 0.1259765625, |
| "clip_ratio/low_mean": 0.244140625, |
| "clip_ratio/low_min": 0.0859375, |
| "clip_ratio/region_mean": 0.3701171875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 748.0, |
| "completions/max_terminated_length": 748.0, |
| "completions/mean_length": 303.3486328125, |
| "completions/mean_terminated_length": 303.3486328125, |
| "completions/min_length": 150.0, |
| "completions/min_terminated_length": 150.0, |
| "entropy": 0.3807190824300051, |
| "epoch": 0.9375, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.025611596181988716, |
| "learning_rate": 1e-05, |
| "loss": 0.0043, |
| "num_tokens": 17668665.0, |
| "reward": 4.0197319984436035, |
| "reward_std": 0.03265571966767311, |
| "rewards/ngram_repetition2/mean": 0.8663961887359619, |
| "rewards/ngram_repetition2/std": 0.035074710845947266, |
| "rewards/ngram_repetition3/mean": 0.9652769565582275, |
| "rewards/ngram_repetition3/std": 0.019182542338967323, |
| "rewards/symbolic_reward_accuracy/mean": 0.9970703125, |
| "rewards/symbolic_reward_accuracy/std": 0.0540604442358017, |
| "rewards/symbolic_reward_partial_score/mean": 0.997802734375, |
| "rewards/symbolic_reward_partial_score/std": 0.04274481162428856, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9471962451934814, |
| "rewards/thinking_answer_ratio_reward/std": 0.01360977441072464, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1089144945144653, |
| "sampling/importance_sampling_ratio/min": 0.0014806825201958418, |
| "sampling/sampling_logp_difference/max": 6.515252113342285, |
| "sampling/sampling_logp_difference/mean": 0.17024272680282593, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.25, |
| "clip_ratio/high_mean": 0.0966796875, |
| "clip_ratio/low_mean": 0.4091796875, |
| "clip_ratio/low_min": 0.1796875, |
| "clip_ratio/region_mean": 0.505859375, |
| "entropy": 0.38510454073548317, |
| "epoch": 1.0, |
| "grad_norm": 0.024223582819104195, |
| "learning_rate": 1e-05, |
| "loss": 0.0096, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.0, |
| "eval_completions/max_length": 461.2631578947368, |
| "eval_completions/max_terminated_length": 461.2631578947368, |
| "eval_completions/mean_length": 285.5579769736842, |
| "eval_completions/mean_terminated_length": 285.5579769736842, |
| "eval_completions/min_length": 171.5, |
| "eval_completions/min_terminated_length": 171.5, |
| "eval_entropy": 0.4054659785408723, |
| "eval_frac_reward_zero_std": 0.0, |
| "eval_loss": 0.0016032923012971878, |
| "eval_num_tokens": 17668665.0, |
| "eval_reward": 4.020004686556365, |
| "eval_reward_std": 0.03250175694962596, |
| "eval_rewards/ngram_repetition2/mean": 0.8798494354674691, |
| "eval_rewards/ngram_repetition2/std": 0.03326555988506267, |
| "eval_rewards/ngram_repetition3/mean": 0.9702528150458085, |
| "eval_rewards/ngram_repetition3/std": 0.01740601247078494, |
| "eval_rewards/symbolic_reward_accuracy/mean": 0.9971217105263158, |
| "eval_rewards/symbolic_reward_accuracy/std": 0.02106231843170367, |
| "eval_rewards/symbolic_reward_partial_score/mean": 0.9977384868421053, |
| "eval_rewards/symbolic_reward_partial_score/std": 0.01612810790538788, |
| "eval_rewards/tag_count_reward/mean": 1.0, |
| "eval_rewards/tag_count_reward/std": 0.0, |
| "eval_rewards/thinking_answer_ratio_reward/mean": 0.9521772830109847, |
| "eval_rewards/thinking_answer_ratio_reward/std": 0.01325364425582321, |
| "eval_runtime": 1024.8679, |
| "eval_samples_per_second": 0.146, |
| "eval_sampling/importance_sampling_ratio/max": 2.0, |
| "eval_sampling/importance_sampling_ratio/mean": 1.113611522473787, |
| "eval_sampling/importance_sampling_ratio/min": 0.013400349171685153, |
| "eval_sampling/sampling_logp_difference/max": 4.50554064700478, |
| "eval_sampling/sampling_logp_difference/mean": 0.17763970870720713, |
| "eval_steps_per_second": 0.003, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.2578125, |
| "clip_ratio/high_mean": 0.111328125, |
| "clip_ratio/low_mean": 0.275390625, |
| "clip_ratio/low_min": 0.0859375, |
| "clip_ratio/region_mean": 0.38671875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 608.0, |
| "completions/max_terminated_length": 608.0, |
| "completions/mean_length": 289.3232421875, |
| "completions/mean_terminated_length": 289.3232421875, |
| "completions/min_length": 162.0, |
| "completions/min_terminated_length": 162.0, |
| "entropy": 0.38059367053210735, |
| "epoch": 1.0625, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.025241762399673462, |
| "learning_rate": 1e-05, |
| "loss": 0.0059, |
| "num_tokens": 19605071.0, |
| "reward": 4.018987655639648, |
| "reward_std": 0.03650522977113724, |
| "rewards/ngram_repetition2/mean": 0.883100688457489, |
| "rewards/ngram_repetition2/std": 0.031195858493447304, |
| "rewards/ngram_repetition3/mean": 0.9731463193893433, |
| "rewards/ngram_repetition3/std": 0.01614222675561905, |
| "rewards/symbolic_reward_accuracy/mean": 0.99658203125, |
| "rewards/symbolic_reward_accuracy/std": 0.05837765336036682, |
| "rewards/symbolic_reward_partial_score/mean": 0.997802734375, |
| "rewards/symbolic_reward_partial_score/std": 0.03978516161441803, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.945807933807373, |
| "rewards/thinking_answer_ratio_reward/std": 0.013301613740622997, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.109527587890625, |
| "sampling/importance_sampling_ratio/min": 0.0029354249127209187, |
| "sampling/sampling_logp_difference/max": 5.830903053283691, |
| "sampling/sampling_logp_difference/mean": 0.17033454775810242, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.2578125, |
| "clip_ratio/high_mean": 0.0927734375, |
| "clip_ratio/low_mean": 0.4150390625, |
| "clip_ratio/low_min": 0.171875, |
| "clip_ratio/region_mean": 0.5078125, |
| "entropy": 0.37396370619535446, |
| "epoch": 1.125, |
| "grad_norm": 0.029076889157295227, |
| "learning_rate": 1e-05, |
| "loss": 0.0047, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.234375, |
| "clip_ratio/high_mean": 0.0966796875, |
| "clip_ratio/low_mean": 0.2822265625, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.37890625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 584.0, |
| "completions/max_terminated_length": 584.0, |
| "completions/mean_length": 276.4482421875, |
| "completions/mean_terminated_length": 276.4482421875, |
| "completions/min_length": 138.0, |
| "completions/min_terminated_length": 138.0, |
| "entropy": 0.3730292562395334, |
| "epoch": 1.1875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.027809176594018936, |
| "learning_rate": 1e-05, |
| "loss": 0.0051, |
| "num_tokens": 21516773.0, |
| "reward": 4.028271198272705, |
| "reward_std": 0.0003417174448259175, |
| "rewards/ngram_repetition2/mean": 0.9015256762504578, |
| "rewards/ngram_repetition2/std": 0.027837282046675682, |
| "rewards/ngram_repetition3/mean": 0.9807107448577881, |
| "rewards/ngram_repetition3/std": 0.01325258519500494, |
| "rewards/symbolic_reward_accuracy/mean": 1.0, |
| "rewards/symbolic_reward_accuracy/std": 0.0, |
| "rewards/symbolic_reward_partial_score/mean": 1.0, |
| "rewards/symbolic_reward_partial_score/std": 0.0, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9448793530464172, |
| "rewards/thinking_answer_ratio_reward/std": 0.013124315068125725, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1104830503463745, |
| "sampling/importance_sampling_ratio/min": 0.0012398589169606566, |
| "sampling/sampling_logp_difference/max": 6.692757606506348, |
| "sampling/sampling_logp_difference/mean": 0.17016106843948364, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.2109375, |
| "clip_ratio/high_mean": 0.0859375, |
| "clip_ratio/low_mean": 0.44140625, |
| "clip_ratio/low_min": 0.1796875, |
| "clip_ratio/region_mean": 0.52734375, |
| "entropy": 0.3784319721162319, |
| "epoch": 1.25, |
| "grad_norm": 0.025980466976761818, |
| "learning_rate": 1e-05, |
| "loss": 0.0045, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.21875, |
| "clip_ratio/high_mean": 0.0986328125, |
| "clip_ratio/low_mean": 0.263671875, |
| "clip_ratio/low_min": 0.1171875, |
| "clip_ratio/region_mean": 0.3623046875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 579.0, |
| "completions/max_terminated_length": 579.0, |
| "completions/mean_length": 267.55810546875, |
| "completions/mean_terminated_length": 267.55810546875, |
| "completions/min_length": 142.0, |
| "completions/min_terminated_length": 142.0, |
| "entropy": 0.3766753375530243, |
| "epoch": 1.3125, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.027696413919329643, |
| "learning_rate": 1e-05, |
| "loss": 0.0048, |
| "num_tokens": 23416924.0, |
| "reward": 4.016022682189941, |
| "reward_std": 0.05009516328573227, |
| "rewards/ngram_repetition2/mean": 0.9155223369598389, |
| "rewards/ngram_repetition2/std": 0.02600882574915886, |
| "rewards/ngram_repetition3/mean": 0.986021876335144, |
| "rewards/ngram_repetition3/std": 0.011384704150259495, |
| "rewards/symbolic_reward_accuracy/mean": 0.99560546875, |
| "rewards/symbolic_reward_accuracy/std": 0.06616159528493881, |
| "rewards/symbolic_reward_partial_score/mean": 0.996337890625, |
| "rewards/symbolic_reward_partial_score/std": 0.05730699002742767, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9458417892456055, |
| "rewards/thinking_answer_ratio_reward/std": 0.012686546891927719, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1121628284454346, |
| "sampling/importance_sampling_ratio/min": 0.0017338492907583714, |
| "sampling/sampling_logp_difference/max": 6.3574113845825195, |
| "sampling/sampling_logp_difference/mean": 0.17337316274642944, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.25, |
| "clip_ratio/high_mean": 0.0888671875, |
| "clip_ratio/low_mean": 0.3642578125, |
| "clip_ratio/low_min": 0.140625, |
| "clip_ratio/region_mean": 0.453125, |
| "entropy": 0.37970343604683876, |
| "epoch": 1.375, |
| "grad_norm": 0.024490008130669594, |
| "learning_rate": 1e-05, |
| "loss": 0.003, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.3046875, |
| "clip_ratio/high_mean": 0.1318359375, |
| "clip_ratio/low_mean": 0.2841796875, |
| "clip_ratio/low_min": 0.1328125, |
| "clip_ratio/region_mean": 0.416015625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 572.0, |
| "completions/max_terminated_length": 572.0, |
| "completions/mean_length": 270.39697265625, |
| "completions/mean_terminated_length": 270.39697265625, |
| "completions/min_length": 159.0, |
| "completions/min_terminated_length": 159.0, |
| "entropy": 0.3762910068035126, |
| "epoch": 1.4375, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.020992670208215714, |
| "learning_rate": 1e-05, |
| "loss": 0.0028, |
| "num_tokens": 25321225.0, |
| "reward": 4.025911808013916, |
| "reward_std": 0.011015485972166061, |
| "rewards/ngram_repetition2/mean": 0.9226169586181641, |
| "rewards/ngram_repetition2/std": 0.022972460836172104, |
| "rewards/ngram_repetition3/mean": 0.9887727499008179, |
| "rewards/ngram_repetition3/std": 0.009258674457669258, |
| "rewards/symbolic_reward_accuracy/mean": 0.9990234375, |
| "rewards/symbolic_reward_accuracy/std": 0.031242365017533302, |
| "rewards/symbolic_reward_partial_score/mean": 0.999267578125, |
| "rewards/symbolic_reward_partial_score/std": 0.02470046654343605, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9483402967453003, |
| "rewards/thinking_answer_ratio_reward/std": 0.011332533322274685, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1124824285507202, |
| "sampling/importance_sampling_ratio/min": 0.0016270694322884083, |
| "sampling/sampling_logp_difference/max": 6.4209747314453125, |
| "sampling/sampling_logp_difference/mean": 0.1726430356502533, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.0869140625, |
| "clip_ratio/low_mean": 0.4208984375, |
| "clip_ratio/low_min": 0.1640625, |
| "clip_ratio/region_mean": 0.5078125, |
| "entropy": 0.3793158773332834, |
| "epoch": 1.5, |
| "grad_norm": 0.02606261894106865, |
| "learning_rate": 1e-05, |
| "loss": 0.0039, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.1201171875, |
| "clip_ratio/low_mean": 0.248046875, |
| "clip_ratio/low_min": 0.09375, |
| "clip_ratio/region_mean": 0.3681640625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 538.0, |
| "completions/max_terminated_length": 538.0, |
| "completions/mean_length": 266.08642578125, |
| "completions/mean_terminated_length": 266.08642578125, |
| "completions/min_length": 148.0, |
| "completions/min_terminated_length": 148.0, |
| "entropy": 0.3877852316945791, |
| "epoch": 1.5625, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.023656491190195084, |
| "learning_rate": 1e-05, |
| "loss": 0.0022, |
| "num_tokens": 27206714.0, |
| "reward": 4.023347854614258, |
| "reward_std": 0.02174052968621254, |
| "rewards/ngram_repetition2/mean": 0.9308995008468628, |
| "rewards/ngram_repetition2/std": 0.022394709289073944, |
| "rewards/ngram_repetition3/mean": 0.990999698638916, |
| "rewards/ngram_repetition3/std": 0.008796615526080132, |
| "rewards/symbolic_reward_accuracy/mean": 0.998046875, |
| "rewards/symbolic_reward_accuracy/std": 0.044161777943372726, |
| "rewards/symbolic_reward_partial_score/mean": 0.99853515625, |
| "rewards/symbolic_reward_partial_score/std": 0.03491636738181114, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9500227570533752, |
| "rewards/thinking_answer_ratio_reward/std": 0.009935123845934868, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1162710189819336, |
| "sampling/importance_sampling_ratio/min": 0.0009795920923352242, |
| "sampling/sampling_logp_difference/max": 6.928374290466309, |
| "sampling/sampling_logp_difference/mean": 0.17734766006469727, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.0986328125, |
| "clip_ratio/low_mean": 0.41015625, |
| "clip_ratio/low_min": 0.1328125, |
| "clip_ratio/region_mean": 0.5087890625, |
| "entropy": 0.3830826133489609, |
| "epoch": 1.625, |
| "grad_norm": 0.020566586405038834, |
| "learning_rate": 1e-05, |
| "loss": 0.0053, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.25, |
| "clip_ratio/high_mean": 0.1044921875, |
| "clip_ratio/low_mean": 0.2587890625, |
| "clip_ratio/low_min": 0.0625, |
| "clip_ratio/region_mean": 0.36328125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 428.0, |
| "completions/max_terminated_length": 428.0, |
| "completions/mean_length": 255.6845703125, |
| "completions/mean_terminated_length": 255.6845703125, |
| "completions/min_length": 144.0, |
| "completions/min_terminated_length": 144.0, |
| "entropy": 0.3835675735026598, |
| "epoch": 1.6875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02022464945912361, |
| "learning_rate": 1e-05, |
| "loss": 0.0024, |
| "num_tokens": 29075892.0, |
| "reward": 4.025917053222656, |
| "reward_std": 0.011935505084693432, |
| "rewards/ngram_repetition2/mean": 0.9410616159439087, |
| "rewards/ngram_repetition2/std": 0.019873203709721565, |
| "rewards/ngram_repetition3/mean": 0.9937294721603394, |
| "rewards/ngram_repetition3/std": 0.0068909707479178905, |
| "rewards/symbolic_reward_accuracy/mean": 0.9990234375, |
| "rewards/symbolic_reward_accuracy/std": 0.031242365017533302, |
| "rewards/symbolic_reward_partial_score/mean": 0.9990234375, |
| "rewards/symbolic_reward_partial_score/std": 0.031242365017533302, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.949914813041687, |
| "rewards/thinking_answer_ratio_reward/std": 0.009325054474174976, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1154059171676636, |
| "sampling/importance_sampling_ratio/min": 0.0033768429420888424, |
| "sampling/sampling_logp_difference/max": 5.690814018249512, |
| "sampling/sampling_logp_difference/mean": 0.1758098602294922, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.28125, |
| "clip_ratio/high_mean": 0.1044921875, |
| "clip_ratio/low_mean": 0.390625, |
| "clip_ratio/low_min": 0.1640625, |
| "clip_ratio/region_mean": 0.4951171875, |
| "entropy": 0.3785879872739315, |
| "epoch": 1.75, |
| "grad_norm": 0.018482210114598274, |
| "learning_rate": 1e-05, |
| "loss": 0.0032, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.2578125, |
| "clip_ratio/high_mean": 0.1142578125, |
| "clip_ratio/low_mean": 0.2529296875, |
| "clip_ratio/low_min": 0.09375, |
| "clip_ratio/region_mean": 0.3671875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 432.0, |
| "completions/max_terminated_length": 432.0, |
| "completions/mean_length": 250.443359375, |
| "completions/mean_terminated_length": 250.443359375, |
| "completions/min_length": 146.0, |
| "completions/min_terminated_length": 146.0, |
| "entropy": 0.37489572539925575, |
| "epoch": 1.8125, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.024314837530255318, |
| "learning_rate": 1e-05, |
| "loss": 0.003, |
| "num_tokens": 30937664.0, |
| "reward": 4.023824691772461, |
| "reward_std": 0.020693320780992508, |
| "rewards/ngram_repetition2/mean": 0.9501452445983887, |
| "rewards/ngram_repetition2/std": 0.017883246764540672, |
| "rewards/ngram_repetition3/mean": 0.9955066442489624, |
| "rewards/ngram_repetition3/std": 0.0059976824559271336, |
| "rewards/symbolic_reward_accuracy/mean": 0.998046875, |
| "rewards/symbolic_reward_accuracy/std": 0.044161777943372726, |
| "rewards/symbolic_reward_partial_score/mean": 0.998779296875, |
| "rewards/symbolic_reward_partial_score/std": 0.029213331639766693, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9495062828063965, |
| "rewards/thinking_answer_ratio_reward/std": 0.008742393925786018, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1135070323944092, |
| "sampling/importance_sampling_ratio/min": 0.0015908645000308752, |
| "sampling/sampling_logp_difference/max": 6.443477630615234, |
| "sampling/sampling_logp_difference/mean": 0.1719757616519928, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.109375, |
| "clip_ratio/low_mean": 0.357421875, |
| "clip_ratio/low_min": 0.109375, |
| "clip_ratio/region_mean": 0.466796875, |
| "entropy": 0.3766605220735073, |
| "epoch": 1.875, |
| "grad_norm": 0.020362574607133865, |
| "learning_rate": 1e-05, |
| "loss": 0.0003, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.115234375, |
| "clip_ratio/low_mean": 0.228515625, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.34375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 443.0, |
| "completions/max_terminated_length": 443.0, |
| "completions/mean_length": 250.9296875, |
| "completions/mean_terminated_length": 250.9296875, |
| "completions/min_length": 148.0, |
| "completions/min_terminated_length": 148.0, |
| "entropy": 0.36353896372020245, |
| "epoch": 1.9375, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.021443529054522514, |
| "learning_rate": 1e-05, |
| "loss": -0.0007, |
| "num_tokens": 32792112.0, |
| "reward": 4.0251312255859375, |
| "reward_std": 0.015792513266205788, |
| "rewards/ngram_repetition2/mean": 0.95717853307724, |
| "rewards/ngram_repetition2/std": 0.016464218497276306, |
| "rewards/ngram_repetition3/mean": 0.9966345429420471, |
| "rewards/ngram_repetition3/std": 0.004997830372303724, |
| "rewards/symbolic_reward_accuracy/mean": 0.99853515625, |
| "rewards/symbolic_reward_accuracy/std": 0.038254573941230774, |
| "rewards/symbolic_reward_partial_score/mean": 0.9990234375, |
| "rewards/symbolic_reward_partial_score/std": 0.027052273973822594, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9499266743659973, |
| "rewards/thinking_answer_ratio_reward/std": 0.008525022305548191, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1124534606933594, |
| "sampling/importance_sampling_ratio/min": 0.0008624744368717074, |
| "sampling/sampling_logp_difference/max": 7.0557050704956055, |
| "sampling/sampling_logp_difference/mean": 0.1690160632133484, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.12890625, |
| "clip_ratio/low_mean": 0.3896484375, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.5185546875, |
| "entropy": 0.3609350845217705, |
| "epoch": 2.0, |
| "grad_norm": 0.019108088687062263, |
| "learning_rate": 1e-05, |
| "loss": 0.0032, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.0, |
| "eval_completions/max_length": 350.5, |
| "eval_completions/max_terminated_length": 350.5, |
| "eval_completions/mean_length": 250.32401315789474, |
| "eval_completions/mean_terminated_length": 250.32401315789474, |
| "eval_completions/min_length": 175.52631578947367, |
| "eval_completions/min_terminated_length": 175.52631578947367, |
| "eval_entropy": 0.3766616096622066, |
| "eval_frac_reward_zero_std": 0.0, |
| "eval_loss": 0.00011446899588918313, |
| "eval_num_tokens": 32792112.0, |
| "eval_reward": 4.021890213615016, |
| "eval_reward_std": 0.028941871161039575, |
| "eval_rewards/ngram_repetition2/mean": 0.9618394170936785, |
| "eval_rewards/ngram_repetition2/std": 0.015655246342679386, |
| "eval_rewards/ngram_repetition3/mean": 0.9969736039638519, |
| "eval_rewards/ngram_repetition3/std": 0.004860803239831799, |
| "eval_rewards/symbolic_reward_accuracy/mean": 0.9971217105263158, |
| "eval_rewards/symbolic_reward_accuracy/std": 0.023026315789473683, |
| "eval_rewards/symbolic_reward_partial_score/mean": 0.9985608552631579, |
| "eval_rewards/symbolic_reward_partial_score/std": 0.011513157894736841, |
| "eval_rewards/tag_count_reward/mean": 1.0, |
| "eval_rewards/tag_count_reward/std": 0.0, |
| "eval_rewards/thinking_answer_ratio_reward/mean": 0.9497831416757483, |
| "eval_rewards/thinking_answer_ratio_reward/std": 0.008818138268237052, |
| "eval_runtime": 918.9382, |
| "eval_samples_per_second": 0.163, |
| "eval_sampling/importance_sampling_ratio/max": 2.0, |
| "eval_sampling/importance_sampling_ratio/mean": 1.1145887280765332, |
| "eval_sampling/importance_sampling_ratio/min": 0.016234988169009357, |
| "eval_sampling/sampling_logp_difference/max": 4.472469405124062, |
| "eval_sampling/sampling_logp_difference/mean": 0.172603645214909, |
| "eval_steps_per_second": 0.003, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 128, |
| "total_flos": 0.0, |
| "train_loss": 0.014476574131549569, |
| "train_runtime": 14432.346, |
| "train_samples_per_second": 0.146, |
| "train_steps_per_second": 0.009 |
| } |
| ], |
| "logging_steps": 4, |
| "max_steps": 128, |
| "num_input_tokens_seen": 32792112, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|