{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10161.0, "completions/max_terminated_length": 10161.0, "completions/mean_length": 1391.376953125, "completions/mean_terminated_length": 1391.376953125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.20091663300991058, "epoch": 0.002631578947368421, "frac_reward_zero_std": 0.625, "grad_norm": 0.014021246694028378, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 1116193.0, "reward": 0.8677713871002197, "reward_std": 0.09401699155569077, "rewards/progression_diversity/mean": -0.00020985309674870223, "rewards/progression_diversity/std": 0.0047484333626925945, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.970703125, "rewards/symbolic_reward_partial_score/std": 0.1583337038755417, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0430030822753906, "sampling/importance_sampling_ratio/min": 0.0076784128323197365, "sampling/sampling_logp_difference/max": 4.86934232711792, "sampling/sampling_logp_difference/mean": 0.08526837825775146, "step": 1 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2019985392689705, "epoch": 0.005263157894736842, "grad_norm": 0.014356282539665699, "learning_rate": 1e-06, "loss": 0.0029, "step": 2 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2056555300951004, "epoch": 0.007894736842105263, "grad_norm": 0.011324469931423664, "learning_rate": 1e-06, "loss": 0.0053, "step": 3 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.20365586876869202, "epoch": 0.010526315789473684, "grad_norm": 0.011997531168162823, "learning_rate": 1e-06, "loss": 0.0162, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10126.0, "completions/mean_length": 1447.798828125, "completions/mean_terminated_length": 1418.5694580078125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.20460012555122375, "epoch": 0.013157894736842105, "frac_reward_zero_std": 0.75, "grad_norm": 0.015437809750437737, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 2251930.0, "reward": 0.8787109851837158, "reward_std": 0.05945787578821182, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.984375, "rewards/symbolic_reward_partial_score/std": 0.10607754439115524, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432064533233643, "sampling/importance_sampling_ratio/min": 0.0042533609084784985, "sampling/sampling_logp_difference/max": 5.46004581451416, "sampling/sampling_logp_difference/mean": 0.0852469801902771, "step": 5 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.19932150095701218, "epoch": 0.015789473684210527, "grad_norm": 0.01119388360530138, "learning_rate": 1e-06, "loss": 0.0039, "step": 6 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20202334970235825, "epoch": 0.018421052631578946, "grad_norm": 0.00617096247151494, "learning_rate": 1e-06, "loss": 0.0015, "step": 7 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.20664908736944199, "epoch": 0.021052631578947368, "grad_norm": 0.005414201412349939, "learning_rate": 1e-06, "loss": -0.0008, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10477.0, "completions/mean_length": 1538.962890625, "completions/mean_terminated_length": 1509.911865234375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.20387286692857742, "epoch": 0.02368421052631579, "frac_reward_zero_std": 0.8125, "grad_norm": 0.005543245002627373, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 3432999.0, "reward": 0.8880859613418579, "reward_std": 0.04043589532375336, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9915364384651184, "rewards/symbolic_reward_partial_score/std": 0.0816628709435463, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043768048286438, "sampling/importance_sampling_ratio/min": 0.0021881461143493652, "sampling/sampling_logp_difference/max": 6.124700546264648, "sampling/sampling_logp_difference/mean": 0.08690699934959412, "step": 9 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2075653076171875, "epoch": 0.02631578947368421, "grad_norm": 0.01261059194803238, "learning_rate": 1e-06, "loss": 0.0142, "step": 10 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.20507482439279556, "epoch": 0.02894736842105263, "grad_norm": 0.013912621885538101, "learning_rate": 1e-06, "loss": -0.0051, "step": 11 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.20482902228832245, "epoch": 0.031578947368421054, "grad_norm": 0.0035958297085016966, "learning_rate": 1e-06, "loss": 0.0067, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7366.0, "completions/mean_length": 1636.892578125, "completions/mean_terminated_length": 1608.033203125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.20521555095911026, "epoch": 0.034210526315789476, "frac_reward_zero_std": 0.6875, "grad_norm": 0.01693623512983322, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 4665232.0, "reward": 0.8704585433006287, "reward_std": 0.07698770612478256, "rewards/progression_diversity/mean": -4.9206595576833934e-05, "rewards/progression_diversity/std": 0.0011134181404486299, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9803059697151184, "rewards/symbolic_reward_partial_score/std": 0.12284255772829056, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0436358451843262, "sampling/importance_sampling_ratio/min": 0.0026307811494916677, "sampling/sampling_logp_difference/max": 5.940474510192871, "sampling/sampling_logp_difference/mean": 0.08618403971195221, "step": 13 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20716875791549683, "epoch": 0.03684210526315789, "grad_norm": 0.012125448323786259, "learning_rate": 1e-06, "loss": 0.0074, "step": 14 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2059406116604805, "epoch": 0.039473684210526314, "grad_norm": 0.0121714873239398, "learning_rate": 1e-06, "loss": -0.0036, "step": 15 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.201951302587986, "epoch": 0.042105263157894736, "grad_norm": 0.015417957678437233, "learning_rate": 1e-06, "loss": 0.0116, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7433.0, "completions/mean_length": 1438.23046875, "completions/mean_terminated_length": 1379.61962890625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.20444045960903168, "epoch": 0.04473684210526316, "frac_reward_zero_std": 0.71875, "grad_norm": 0.004432106390595436, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 5808742.0, "reward": 0.8843259811401367, "reward_std": 0.051335833966732025, "rewards/progression_diversity/mean": -2.0671535821747966e-05, "rewards/progression_diversity/std": 0.00046774346265010536, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.99072265625, "rewards/symbolic_reward_partial_score/std": 0.07632413506507874, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0428900718688965, "sampling/importance_sampling_ratio/min": 0.012728744186460972, "sampling/sampling_logp_difference/max": 4.363892555236816, "sampling/sampling_logp_difference/mean": 0.08412274718284607, "step": 17 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.20181895792484283, "epoch": 0.04736842105263158, "grad_norm": 0.003477479564025998, "learning_rate": 1e-06, "loss": -0.0062, "step": 18 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.20018255710601807, "epoch": 0.05, "grad_norm": 0.01829039305448532, "learning_rate": 1e-06, "loss": 0.0141, "step": 19 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.20390576124191284, "epoch": 0.05263157894736842, "grad_norm": 0.004513983614742756, "learning_rate": 1e-06, "loss": 0.0315, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6317.0, "completions/max_terminated_length": 6317.0, "completions/mean_length": 1523.595703125, "completions/mean_terminated_length": 1523.595703125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.20335309207439423, "epoch": 0.05526315789473684, "frac_reward_zero_std": 0.75, "grad_norm": 0.01375944260507822, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 6993111.0, "reward": 0.8709961175918579, "reward_std": 0.06476965546607971, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9853515625, "rewards/symbolic_reward_partial_score/std": 0.09239538013935089, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0445630550384521, "sampling/importance_sampling_ratio/min": 0.00613828282803297, "sampling/sampling_logp_difference/max": 5.093210220336914, "sampling/sampling_logp_difference/mean": 0.08676405251026154, "step": 21 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.20985324680805206, "epoch": 0.05789473684210526, "grad_norm": 0.010196764953434467, "learning_rate": 1e-06, "loss": 0.0098, "step": 22 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20811310410499573, "epoch": 0.060526315789473685, "grad_norm": 0.019789736717939377, "learning_rate": 1e-06, "loss": 0.0053, "step": 23 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.20689492672681808, "epoch": 0.06315789473684211, "grad_norm": 0.015070080757141113, "learning_rate": 1e-06, "loss": 0.0017, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11945.0, "completions/mean_length": 1804.28515625, "completions/mean_terminated_length": 1747.10986328125, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.20920950174331665, "epoch": 0.06578947368421052, "frac_reward_zero_std": 0.6875, "grad_norm": 0.013051200658082962, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 8328393.0, "reward": 0.8593750596046448, "reward_std": 0.08293415606021881, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.947265625, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.970703125, "rewards/symbolic_reward_partial_score/std": 0.14404156804084778, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04477858543396, "sampling/importance_sampling_ratio/min": 0.00535097811371088, "sampling/sampling_logp_difference/max": 5.230475902557373, "sampling/sampling_logp_difference/mean": 0.088081955909729, "step": 25 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.20915643125772476, "epoch": 0.06842105263157895, "grad_norm": 0.019974825903773308, "learning_rate": 1e-06, "loss": 0.0271, "step": 26 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2107773870229721, "epoch": 0.07105263157894737, "grad_norm": 0.028451336547732353, "learning_rate": 1e-06, "loss": 0.0074, "step": 27 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.21294020861387253, "epoch": 0.07368421052631578, "grad_norm": 0.01962115429341793, "learning_rate": 1e-06, "loss": 0.008, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10846.0, "completions/mean_length": 1951.671875, "completions/mean_terminated_length": 1866.609130859375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.20540130138397217, "epoch": 0.07631578947368421, "frac_reward_zero_std": 0.78125, "grad_norm": 0.02834128402173519, "learning_rate": 1e-06, "loss": 0.0346, "num_tokens": 9728449.0, "reward": 0.8772948980331421, "reward_std": 0.04927993565797806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.970703125, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.9835611581802368, "rewards/symbolic_reward_partial_score/std": 0.10691188275814056, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0446946620941162, "sampling/importance_sampling_ratio/min": 0.007605313323438168, "sampling/sampling_logp_difference/max": 4.878908157348633, "sampling/sampling_logp_difference/mean": 0.08789953589439392, "step": 29 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2092168629169464, "epoch": 0.07894736842105263, "grad_norm": 0.013763340190052986, "learning_rate": 1e-06, "loss": -0.0003, "step": 30 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2100147306919098, "epoch": 0.08157894736842106, "grad_norm": 0.008466778323054314, "learning_rate": 1e-06, "loss": 0.0102, "step": 31 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.21328876912593842, "epoch": 0.08421052631578947, "grad_norm": 0.012747718952596188, "learning_rate": 1e-06, "loss": 0.0008, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8941.0, "completions/max_terminated_length": 8941.0, "completions/mean_length": 1582.947265625, "completions/mean_terminated_length": 1582.947265625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.21076196432113647, "epoch": 0.0868421052631579, "frac_reward_zero_std": 0.8125, "grad_norm": 0.005839271936565638, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 10936390.0, "reward": 0.8851074576377869, "reward_std": 0.04421120509505272, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98046875, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.9894205331802368, "rewards/symbolic_reward_partial_score/std": 0.0831495076417923, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0460643768310547, "sampling/importance_sampling_ratio/min": 0.004250554833561182, "sampling/sampling_logp_difference/max": 5.460705757141113, "sampling/sampling_logp_difference/mean": 0.09021396934986115, "step": 33 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2153472900390625, "epoch": 0.08947368421052632, "grad_norm": 0.031077047809958458, "learning_rate": 1e-06, "loss": 0.0122, "step": 34 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.21500591933727264, "epoch": 0.09210526315789473, "grad_norm": 0.00698199775069952, "learning_rate": 1e-06, "loss": -0.008, "step": 35 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.21618827432394028, "epoch": 0.09473684210526316, "grad_norm": 0.0022439216263592243, "learning_rate": 1e-06, "loss": 0.0075, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9080.0, "completions/max_terminated_length": 9080.0, "completions/mean_length": 1849.35546875, "completions/mean_terminated_length": 1849.35546875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.21958760172128677, "epoch": 0.09736842105263158, "frac_reward_zero_std": 0.71875, "grad_norm": 0.005718402564525604, "learning_rate": 1e-06, "loss": -0.0093, "num_tokens": 12260860.0, "reward": 0.881591796875, "reward_std": 0.06236070767045021, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9855142831802368, "rewards/symbolic_reward_partial_score/std": 0.10358394682407379, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0466870069503784, "sampling/importance_sampling_ratio/min": 0.007643694523721933, "sampling/sampling_logp_difference/max": 4.873874187469482, "sampling/sampling_logp_difference/mean": 0.09172279387712479, "step": 37 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.21241623163223267, "epoch": 0.1, "grad_norm": 0.014239384792745113, "learning_rate": 1e-06, "loss": 0.0023, "step": 38 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.216999851167202, "epoch": 0.10263157894736842, "grad_norm": 0.0052719274535775185, "learning_rate": 1e-06, "loss": 0.0018, "step": 39 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2182709276676178, "epoch": 0.10526315789473684, "grad_norm": 0.028353918343782425, "learning_rate": 1e-06, "loss": 0.0241, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11047.0, "completions/max_terminated_length": 11047.0, "completions/mean_length": 1883.564453125, "completions/mean_terminated_length": 1883.564453125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.21055693924427032, "epoch": 0.10789473684210527, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02785821631550789, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 13619549.0, "reward": 0.8826660513877869, "reward_std": 0.04418104887008667, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9890950918197632, "rewards/symbolic_reward_partial_score/std": 0.08212035894393921, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0455224514007568, "sampling/importance_sampling_ratio/min": 0.003889448009431362, "sampling/sampling_logp_difference/max": 5.549488067626953, "sampling/sampling_logp_difference/mean": 0.09000338613986969, "step": 41 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2140364646911621, "epoch": 0.11052631578947368, "grad_norm": 0.006537741981446743, "learning_rate": 1e-06, "loss": -0.0032, "step": 42 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2172713428735733, "epoch": 0.11315789473684211, "grad_norm": 0.0061754463240504265, "learning_rate": 1e-06, "loss": 0.012, "step": 43 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.21182067692279816, "epoch": 0.11578947368421053, "grad_norm": 0.014230622909963131, "learning_rate": 1e-06, "loss": -0.0035, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12173.0, "completions/mean_length": 1821.95703125, "completions/mean_terminated_length": 1793.4598388671875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "entropy": 0.20662643015384674, "epoch": 0.11842105263157894, "frac_reward_zero_std": 0.78125, "grad_norm": 0.011362558230757713, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 14959047.0, "reward": 0.882568359375, "reward_std": 0.05293341726064682, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.98486328125, "rewards/symbolic_reward_partial_score/std": 0.11010749638080597, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044746994972229, "sampling/importance_sampling_ratio/min": 0.011351748369634151, "sampling/sampling_logp_difference/max": 4.478383541107178, "sampling/sampling_logp_difference/mean": 0.08754965662956238, "step": 45 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2084914967417717, "epoch": 0.12105263157894737, "grad_norm": 0.008337480016052723, "learning_rate": 1e-06, "loss": 0.0155, "step": 46 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.20731748640537262, "epoch": 0.12368421052631579, "grad_norm": 0.015734924003481865, "learning_rate": 1e-06, "loss": -0.0034, "step": 47 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.20854239910840988, "epoch": 0.12631578947368421, "grad_norm": 0.02697189897298813, "learning_rate": 1e-06, "loss": 0.0068, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8576.0, "completions/max_terminated_length": 8576.0, "completions/mean_length": 1799.326171875, "completions/mean_terminated_length": 1799.326171875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.2124495506286621, "epoch": 0.12894736842105264, "frac_reward_zero_std": 0.875, "grad_norm": 0.02483353577554226, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 16278094.0, "reward": 0.8875000476837158, "reward_std": 0.03356994688510895, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9895833134651184, "rewards/symbolic_reward_partial_score/std": 0.09384167939424515, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046118974685669, "sampling/importance_sampling_ratio/min": 0.003369852202013135, "sampling/sampling_logp_difference/max": 5.6928863525390625, "sampling/sampling_logp_difference/mean": 0.09072060883045197, "step": 49 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.214863121509552, "epoch": 0.13157894736842105, "grad_norm": 0.013115070760250092, "learning_rate": 1e-06, "loss": -0.0025, "step": 50 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.21474382281303406, "epoch": 0.13421052631578947, "grad_norm": 0.01395503245294094, "learning_rate": 1e-06, "loss": -0.0036, "step": 51 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.21375417709350586, "epoch": 0.1368421052631579, "grad_norm": 0.0036448845639824867, "learning_rate": 1e-06, "loss": -0.0032, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13257.0, "completions/max_terminated_length": 13257.0, "completions/mean_length": 1848.3828125, "completions/mean_terminated_length": 1848.3828125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.21266036480665207, "epoch": 0.1394736842105263, "frac_reward_zero_std": 0.84375, "grad_norm": 0.008418217301368713, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 17631602.0, "reward": 0.8755859732627869, "reward_std": 0.042875953018665314, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.966796875, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.9850260019302368, "rewards/symbolic_reward_partial_score/std": 0.10527178645133972, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044999122619629, "sampling/importance_sampling_ratio/min": 0.0029879333451390266, "sampling/sampling_logp_difference/max": 5.813173294067383, "sampling/sampling_logp_difference/mean": 0.08819356560707092, "step": 53 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20696671307086945, "epoch": 0.14210526315789473, "grad_norm": 0.007748573552817106, "learning_rate": 1e-06, "loss": 0.0076, "step": 54 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20677582174539566, "epoch": 0.14473684210526316, "grad_norm": 0.03168824687600136, "learning_rate": 1e-06, "loss": 0.0157, "step": 55 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20693296194076538, "epoch": 0.14736842105263157, "grad_norm": 0.024818619713187218, "learning_rate": 1e-06, "loss": 0.0102, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7274.0, "completions/mean_length": 1957.873046875, "completions/mean_terminated_length": 1844.281494140625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.2097914069890976, "epoch": 0.15, "frac_reward_zero_std": 0.75, "grad_norm": 0.025202110409736633, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 19045361.0, "reward": 0.8901357054710388, "reward_std": 0.0394572876393795, "rewards/progression_diversity/mean": -0.00010399930033599958, "rewards/progression_diversity/std": 0.002353235613554716, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9931640625, "rewards/symbolic_reward_partial_score/std": 0.0773763433098793, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448851585388184, "sampling/importance_sampling_ratio/min": 0.006745446939021349, "sampling/sampling_logp_difference/max": 4.998887538909912, "sampling/sampling_logp_difference/mean": 0.08811356127262115, "step": 57 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.21365194022655487, "epoch": 0.15263157894736842, "grad_norm": 0.003246638458222151, "learning_rate": 1e-06, "loss": 0.0212, "step": 58 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2101447507739067, "epoch": 0.15526315789473685, "grad_norm": 0.030230272561311722, "learning_rate": 1e-06, "loss": 0.0657, "step": 59 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2098483145236969, "epoch": 0.15789473684210525, "grad_norm": 0.02699144557118416, "learning_rate": 1e-06, "loss": 0.0038, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8055.0, "completions/max_terminated_length": 8055.0, "completions/mean_length": 1772.138671875, "completions/mean_terminated_length": 1772.138671875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.20673604309558868, "epoch": 0.16052631578947368, "frac_reward_zero_std": 0.78125, "grad_norm": 0.016008341684937477, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 20353336.0, "reward": 0.8881348371505737, "reward_std": 0.04341777786612511, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.99169921875, "rewards/symbolic_reward_partial_score/std": 0.08109518140554428, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453189611434937, "sampling/importance_sampling_ratio/min": 0.0037225810810923576, "sampling/sampling_logp_difference/max": 5.5933380126953125, "sampling/sampling_logp_difference/mean": 0.08831556141376495, "step": 61 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.20994160324335098, "epoch": 0.1631578947368421, "grad_norm": 0.012515343725681305, "learning_rate": 1e-06, "loss": -0.0053, "step": 62 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.20736460387706757, "epoch": 0.16578947368421051, "grad_norm": 0.025071127340197563, "learning_rate": 1e-06, "loss": 0.008, "step": 63 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.21027550101280212, "epoch": 0.16842105263157894, "grad_norm": 0.003961838316172361, "learning_rate": 1e-06, "loss": 0.0034, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10275.0, "completions/mean_length": 2000.541015625, "completions/mean_terminated_length": 1972.393310546875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.21027490496635437, "epoch": 0.17105263157894737, "frac_reward_zero_std": 0.71875, "grad_norm": 0.030528031289577484, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 21810957.0, "reward": 0.8799805045127869, "reward_std": 0.061015717685222626, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9879556894302368, "rewards/symbolic_reward_partial_score/std": 0.08918653428554535, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044339895248413, "sampling/importance_sampling_ratio/min": 0.00023883562244009227, "sampling/sampling_logp_difference/max": 8.33973503112793, "sampling/sampling_logp_difference/mean": 0.08711807429790497, "step": 65 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20773346722126007, "epoch": 0.1736842105263158, "grad_norm": 0.00989668071269989, "learning_rate": 1e-06, "loss": 0.004, "step": 66 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.20601221174001694, "epoch": 0.1763157894736842, "grad_norm": 0.007690164726227522, "learning_rate": 1e-06, "loss": 0.0267, "step": 67 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20519739389419556, "epoch": 0.17894736842105263, "grad_norm": 0.005974603351205587, "learning_rate": 1e-06, "loss": 0.0012, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15165.0, "completions/mean_length": 1792.30078125, "completions/mean_terminated_length": 1763.74560546875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.21119263768196106, "epoch": 0.18157894736842106, "frac_reward_zero_std": 0.8125, "grad_norm": 0.01741369068622589, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 23118567.0, "reward": 0.8806641101837158, "reward_std": 0.04559167101979256, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.990234375, "rewards/symbolic_reward_partial_score/std": 0.07527661323547363, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0458157062530518, "sampling/importance_sampling_ratio/min": 0.004343151114881039, "sampling/sampling_logp_difference/max": 5.439155101776123, "sampling/sampling_logp_difference/mean": 0.08970936387777328, "step": 69 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.21382297575473785, "epoch": 0.18421052631578946, "grad_norm": 0.024715770035982132, "learning_rate": 1e-06, "loss": 0.0034, "step": 70 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.20969125628471375, "epoch": 0.1868421052631579, "grad_norm": 0.009425345808267593, "learning_rate": 1e-06, "loss": 0.0163, "step": 71 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.21333520859479904, "epoch": 0.18947368421052632, "grad_norm": 0.016832726076245308, "learning_rate": 1e-06, "loss": 0.0067, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6794.0, "completions/mean_length": 1582.59765625, "completions/mean_terminated_length": 1524.552978515625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.2081427499651909, "epoch": 0.19210526315789472, "frac_reward_zero_std": 0.875, "grad_norm": 0.003800163511186838, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 24329497.0, "reward": 0.8927246332168579, "reward_std": 0.025883354246616364, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9959309697151184, "rewards/symbolic_reward_partial_score/std": 0.05158122628927231, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448415279388428, "sampling/importance_sampling_ratio/min": 0.008288132958114147, "sampling/sampling_logp_difference/max": 4.792930603027344, "sampling/sampling_logp_difference/mean": 0.08850154280662537, "step": 73 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20644701272249222, "epoch": 0.19473684210526315, "grad_norm": 0.02778162993490696, "learning_rate": 1e-06, "loss": 0.0268, "step": 74 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2121978998184204, "epoch": 0.19736842105263158, "grad_norm": 0.0030854835640639067, "learning_rate": 1e-06, "loss": 0.0003, "step": 75 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.21026697009801865, "epoch": 0.2, "grad_norm": 0.009819623082876205, "learning_rate": 1e-06, "loss": -0.0028, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7570.0, "completions/mean_length": 1720.986328125, "completions/mean_terminated_length": 1692.2916259765625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.20008539408445358, "epoch": 0.2026315789473684, "frac_reward_zero_std": 0.8125, "grad_norm": 0.030614569783210754, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 25622130.0, "reward": 0.8901855945587158, "reward_std": 0.03925781324505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9913736581802368, "rewards/symbolic_reward_partial_score/std": 0.08904998004436493, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0445349216461182, "sampling/importance_sampling_ratio/min": 0.007543839979916811, "sampling/sampling_logp_difference/max": 4.88702392578125, "sampling/sampling_logp_difference/mean": 0.08748432993888855, "step": 77 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.20930125564336777, "epoch": 0.20526315789473684, "grad_norm": 0.022404422983527184, "learning_rate": 1e-06, "loss": -0.0003, "step": 78 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.20594292134046555, "epoch": 0.20789473684210527, "grad_norm": 0.0036063790321350098, "learning_rate": 1e-06, "loss": -0.008, "step": 79 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.20701226592063904, "epoch": 0.21052631578947367, "grad_norm": 0.002571528311818838, "learning_rate": 1e-06, "loss": 0.0118, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9040.0, "completions/mean_length": 1800.029296875, "completions/mean_terminated_length": 1771.4892578125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.21440183371305466, "epoch": 0.2131578947368421, "frac_reward_zero_std": 0.84375, "grad_norm": 0.00505083380267024, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 26956737.0, "reward": 0.8875488638877869, "reward_std": 0.03598622605204582, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9903970956802368, "rewards/symbolic_reward_partial_score/std": 0.08615581691265106, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044743537902832, "sampling/importance_sampling_ratio/min": 0.0041437954641878605, "sampling/sampling_logp_difference/max": 5.486143112182617, "sampling/sampling_logp_difference/mean": 0.08827289938926697, "step": 81 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20806460827589035, "epoch": 0.21578947368421053, "grad_norm": 0.005579834803938866, "learning_rate": 1e-06, "loss": 0.0046, "step": 82 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.20855534076690674, "epoch": 0.21842105263157896, "grad_norm": 0.005343756638467312, "learning_rate": 1e-06, "loss": -0.001, "step": 83 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2036062777042389, "epoch": 0.22105263157894736, "grad_norm": 0.039197273552417755, "learning_rate": 1e-06, "loss": 0.0353, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10938.0, "completions/mean_length": 1671.66015625, "completions/mean_terminated_length": 1642.868896484375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.2060195505619049, "epoch": 0.2236842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.01799774542450905, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 28200915.0, "reward": 0.8948242664337158, "reward_std": 0.01601562649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.06617584824562073, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0441960096359253, "sampling/importance_sampling_ratio/min": 0.003996466752141714, "sampling/sampling_logp_difference/max": 5.522344589233398, "sampling/sampling_logp_difference/mean": 0.0878855437040329, "step": 85 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20544211566448212, "epoch": 0.22631578947368422, "grad_norm": 0.0028636252973228693, "learning_rate": 1e-06, "loss": 0.0043, "step": 86 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20531418174505234, "epoch": 0.22894736842105262, "grad_norm": 0.0029805577360093594, "learning_rate": 1e-06, "loss": 0.0178, "step": 87 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20847825706005096, "epoch": 0.23157894736842105, "grad_norm": 0.001331424224190414, "learning_rate": 1e-06, "loss": -0.0033, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5967.0, "completions/max_terminated_length": 5967.0, "completions/mean_length": 1397.779296875, "completions/mean_terminated_length": 1397.779296875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.20501412451267242, "epoch": 0.23421052631578948, "frac_reward_zero_std": 0.90625, "grad_norm": 0.014097422361373901, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 29303842.0, "reward": 0.89501953125, "reward_std": 0.01992187649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.06617584824562073, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450979471206665, "sampling/importance_sampling_ratio/min": 0.006796242203563452, "sampling/sampling_logp_difference/max": 4.991385459899902, "sampling/sampling_logp_difference/mean": 0.08861608803272247, "step": 89 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.21127699315547943, "epoch": 0.23684210526315788, "grad_norm": 0.002064671367406845, "learning_rate": 1e-06, "loss": 0.006, "step": 90 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.20220820605754852, "epoch": 0.2394736842105263, "grad_norm": 0.0021278061904013157, "learning_rate": 1e-06, "loss": -0.0033, "step": 91 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2103341892361641, "epoch": 0.24210526315789474, "grad_norm": 0.002533518709242344, "learning_rate": 1e-06, "loss": -0.0042, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11535.0, "completions/mean_length": 1775.90234375, "completions/mean_terminated_length": 1747.3150634765625, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.20379643887281418, "epoch": 0.24473684210526317, "frac_reward_zero_std": 0.8125, "grad_norm": 0.011236423626542091, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 30601232.0, "reward": 0.8870605230331421, "reward_std": 0.03621069714426994, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.98876953125, "rewards/symbolic_reward_partial_score/std": 0.10026202350854874, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0442211627960205, "sampling/importance_sampling_ratio/min": 0.0072532459162175655, "sampling/sampling_logp_difference/max": 4.926306247711182, "sampling/sampling_logp_difference/mean": 0.08726542443037033, "step": 93 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.20621806383132935, "epoch": 0.24736842105263157, "grad_norm": 0.020073598250746727, "learning_rate": 1e-06, "loss": 0.0001, "step": 94 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.21164795756340027, "epoch": 0.25, "grad_norm": 0.00920848362147808, "learning_rate": 1e-06, "loss": 0.0001, "step": 95 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2064327448606491, "epoch": 0.25263157894736843, "grad_norm": 0.024632828310132027, "learning_rate": 1e-06, "loss": 0.0443, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9000.0, "completions/max_terminated_length": 9000.0, "completions/mean_length": 1691.830078125, "completions/mean_terminated_length": 1691.830078125, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.2086646407842636, "epoch": 0.25526315789473686, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020495187491178513, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 31869593.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450183153152466, "sampling/importance_sampling_ratio/min": 0.0031182451639324427, "sampling/sampling_logp_difference/max": 5.770484924316406, "sampling/sampling_logp_difference/mean": 0.08857271075248718, "step": 97 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20590169727802277, "epoch": 0.2578947368421053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0095, "step": 98 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2117590680718422, "epoch": 0.26052631578947366, "grad_norm": 0.0015901580918580294, "learning_rate": 1e-06, "loss": -0.0015, "step": 99 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20680592954158783, "epoch": 0.2631578947368421, "grad_norm": 0.0009834379889070988, "learning_rate": 1e-06, "loss": -0.001, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6940.0, "completions/max_terminated_length": 6940.0, "completions/mean_length": 1685.759765625, "completions/mean_terminated_length": 1685.759765625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.21084479242563248, "epoch": 0.2657894736842105, "frac_reward_zero_std": 0.875, "grad_norm": 0.024995367974042892, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 33142526.0, "reward": 0.8939453363418579, "reward_std": 0.02421875298023224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, "rewards/symbolic_reward_partial_score/std": 0.06325981765985489, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453993082046509, "sampling/importance_sampling_ratio/min": 0.00554183404892683, "sampling/sampling_logp_difference/max": 5.195429801940918, "sampling/sampling_logp_difference/mean": 0.0886693000793457, "step": 101 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2091396600008011, "epoch": 0.26842105263157895, "grad_norm": 0.001542137935757637, "learning_rate": 1e-06, "loss": 0.0017, "step": 102 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.21141375601291656, "epoch": 0.2710526315789474, "grad_norm": 0.002429763786494732, "learning_rate": 1e-06, "loss": -0.0044, "step": 103 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20988311618566513, "epoch": 0.2736842105263158, "grad_norm": 0.0027020061388611794, "learning_rate": 1e-06, "loss": 0.006, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12355.0, "completions/mean_length": 1652.708984375, "completions/mean_terminated_length": 1594.9393310546875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.20948264002799988, "epoch": 0.27631578947368424, "frac_reward_zero_std": 0.71875, "grad_norm": 0.012331722304224968, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 34397673.0, "reward": 0.8832969665527344, "reward_std": 0.056164130568504333, "rewards/progression_diversity/mean": -0.0003878828720189631, "rewards/progression_diversity/std": 0.008584557101130486, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9886067509651184, "rewards/symbolic_reward_partial_score/std": 0.09459387511014938, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448832511901855, "sampling/importance_sampling_ratio/min": 0.0026828704867511988, "sampling/sampling_logp_difference/max": 5.920867919921875, "sampling/sampling_logp_difference/mean": 0.08767466247081757, "step": 105 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.20856205374002457, "epoch": 0.2789473684210526, "grad_norm": 0.010227479040622711, "learning_rate": 1e-06, "loss": 0.0365, "step": 106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.20701056718826294, "epoch": 0.28157894736842104, "grad_norm": 0.024928631260991096, "learning_rate": 1e-06, "loss": 0.0412, "step": 107 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.21442615240812302, "epoch": 0.28421052631578947, "grad_norm": 0.00423438660800457, "learning_rate": 1e-06, "loss": -0.0068, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9787.0, "completions/max_terminated_length": 9787.0, "completions/mean_length": 1712.083984375, "completions/mean_terminated_length": 1712.083984375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.21213070303201675, "epoch": 0.2868421052631579, "frac_reward_zero_std": 0.84375, "grad_norm": 0.013260642066597939, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 35679412.0, "reward": 0.8808594346046448, "reward_std": 0.042662762105464935, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9869791269302368, "rewards/symbolic_reward_partial_score/std": 0.09409477561712265, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453011989593506, "sampling/importance_sampling_ratio/min": 0.0025328195188194513, "sampling/sampling_logp_difference/max": 5.978422164916992, "sampling/sampling_logp_difference/mean": 0.0887899324297905, "step": 109 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2099447175860405, "epoch": 0.2894736842105263, "grad_norm": 0.022086583077907562, "learning_rate": 1e-06, "loss": 0.0067, "step": 110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2074357271194458, "epoch": 0.29210526315789476, "grad_norm": 0.013523987494409084, "learning_rate": 1e-06, "loss": -0.0047, "step": 111 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.20638947188854218, "epoch": 0.29473684210526313, "grad_norm": 0.011643585748970509, "learning_rate": 1e-06, "loss": -0.0091, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8612.0, "completions/max_terminated_length": 8612.0, "completions/mean_length": 1587.6328125, "completions/mean_terminated_length": 1587.6328125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.21661588549613953, "epoch": 0.29736842105263156, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0032090393360704184, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 36894904.0, "reward": 0.8954101800918579, "reward_std": 0.01835937798023224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, "rewards/symbolic_reward_partial_score/std": 0.051485683768987656, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046404480934143, "sampling/importance_sampling_ratio/min": 0.003222900675609708, "sampling/sampling_logp_difference/max": 5.737473487854004, "sampling/sampling_logp_difference/mean": 0.09082243591547012, "step": 113 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2141873762011528, "epoch": 0.3, "grad_norm": 0.0027777093928307295, "learning_rate": 1e-06, "loss": -0.0039, "step": 114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2160024642944336, "epoch": 0.3026315789473684, "grad_norm": 0.020424243062734604, "learning_rate": 1e-06, "loss": 0.0027, "step": 115 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2109028697013855, "epoch": 0.30526315789473685, "grad_norm": 0.02847341261804104, "learning_rate": 1e-06, "loss": 0.0234, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10945.0, "completions/max_terminated_length": 10945.0, "completions/mean_length": 1894.62109375, "completions/mean_terminated_length": 1894.62109375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.2095939666032791, "epoch": 0.3078947368421053, "frac_reward_zero_std": 0.8125, "grad_norm": 0.029239900410175323, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 38290934.0, "reward": 0.88671875, "reward_std": 0.042625680565834045, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9908853769302368, "rewards/symbolic_reward_partial_score/std": 0.07801654189825058, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0455983877182007, "sampling/importance_sampling_ratio/min": 0.002526824129745364, "sampling/sampling_logp_difference/max": 5.980792045593262, "sampling/sampling_logp_difference/mean": 0.08917158842086792, "step": 117 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.21050098538398743, "epoch": 0.3105263157894737, "grad_norm": 0.01623697020113468, "learning_rate": 1e-06, "loss": 0.0063, "step": 118 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.21095051616430283, "epoch": 0.3131578947368421, "grad_norm": 0.019014954566955566, "learning_rate": 1e-06, "loss": -0.0061, "step": 119 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.21450290828943253, "epoch": 0.3157894736842105, "grad_norm": 0.005355321802198887, "learning_rate": 1e-06, "loss": -0.0078, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6974.0, "completions/max_terminated_length": 6974.0, "completions/mean_length": 1639.470703125, "completions/mean_terminated_length": 1639.470703125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.21340486407279968, "epoch": 0.31842105263157894, "frac_reward_zero_std": 0.84375, "grad_norm": 0.004862508270889521, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 39538663.0, "reward": 0.8884277939796448, "reward_std": 0.03546273708343506, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.99267578125, "rewards/symbolic_reward_partial_score/std": 0.07158216834068298, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0456147193908691, "sampling/importance_sampling_ratio/min": 0.006313262972980738, "sampling/sampling_logp_difference/max": 5.065102577209473, "sampling/sampling_logp_difference/mean": 0.08961638063192368, "step": 121 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20896470546722412, "epoch": 0.32105263157894737, "grad_norm": 0.008511355146765709, "learning_rate": 1e-06, "loss": -0.0044, "step": 122 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.21015820652246475, "epoch": 0.3236842105263158, "grad_norm": 0.025438793003559113, "learning_rate": 1e-06, "loss": 0.0153, "step": 123 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2102859541773796, "epoch": 0.3263157894736842, "grad_norm": 0.004443157464265823, "learning_rate": 1e-06, "loss": -0.0078, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8759.0, "completions/mean_length": 1866.654296875, "completions/mean_terminated_length": 1838.24462890625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.21435657143592834, "epoch": 0.32894736842105265, "frac_reward_zero_std": 0.875, "grad_norm": 0.004789907950907946, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 40883702.0, "reward": 0.89028000831604, "reward_std": 0.02699219435453415, "rewards/progression_diversity/mean": -0.0003213902818970382, "rewards/progression_diversity/std": 0.007272232323884964, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.99169921875, "rewards/symbolic_reward_partial_score/std": 0.08308180421590805, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045685052871704, "sampling/importance_sampling_ratio/min": 0.006483785808086395, "sampling/sampling_logp_difference/max": 5.038450717926025, "sampling/sampling_logp_difference/mean": 0.08934558928012848, "step": 125 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2118215411901474, "epoch": 0.33157894736842103, "grad_norm": 0.02653645910322666, "learning_rate": 1e-06, "loss": 0.012, "step": 126 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2117708995938301, "epoch": 0.33421052631578946, "grad_norm": 0.013068128377199173, "learning_rate": 1e-06, "loss": 0.0253, "step": 127 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.210942342877388, "epoch": 0.3368421052631579, "grad_norm": 0.004611820913851261, "learning_rate": 1e-06, "loss": -0.009, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6268.0, "completions/max_terminated_length": 6268.0, "completions/mean_length": 1521.951171875, "completions/mean_terminated_length": 1521.951171875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.20949231833219528, "epoch": 0.3394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0008793366723693907, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 42062077.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0461798906326294, "sampling/importance_sampling_ratio/min": 0.007740411441773176, "sampling/sampling_logp_difference/max": 4.861300468444824, "sampling/sampling_logp_difference/mean": 0.0897115021944046, "step": 129 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2133314609527588, "epoch": 0.34210526315789475, "grad_norm": 0.0008075212826952338, "learning_rate": 1e-06, "loss": -0.0006, "step": 130 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20833444595336914, "epoch": 0.3447368421052632, "grad_norm": 0.0006465095211751759, "learning_rate": 1e-06, "loss": -0.0008, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20756175369024277, "epoch": 0.3473684210526316, "grad_norm": 0.0012012607185170054, "learning_rate": 1e-06, "loss": -0.001, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6244.0, "completions/max_terminated_length": 6244.0, "completions/mean_length": 1593.48046875, "completions/mean_terminated_length": 1593.48046875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.2085244581103325, "epoch": 0.35, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017708096420392394, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 43280403.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450639724731445, "sampling/importance_sampling_ratio/min": 0.005164094734936953, "sampling/sampling_logp_difference/max": 5.266025543212891, "sampling/sampling_logp_difference/mean": 0.08883464336395264, "step": 133 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20980224758386612, "epoch": 0.3526315789473684, "grad_norm": 0.0018608415266498923, "learning_rate": 1e-06, "loss": -0.0016, "step": 134 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2056858092546463, "epoch": 0.35526315789473684, "grad_norm": 0.0015929264482110739, "learning_rate": 1e-06, "loss": 0.0073, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20565607398748398, "epoch": 0.35789473684210527, "grad_norm": 0.0017638880526646972, "learning_rate": 1e-06, "loss": -0.0017, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14162.0, "completions/max_terminated_length": 14162.0, "completions/mean_length": 1874.09765625, "completions/mean_terminated_length": 1874.09765625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.20833812654018402, "epoch": 0.3605263157894737, "frac_reward_zero_std": 0.875, "grad_norm": 0.0055350083857774734, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 44629733.0, "reward": 0.8904297351837158, "reward_std": 0.027488719671964645, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, "rewards/symbolic_reward_partial_score/std": 0.04242921993136406, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0446752309799194, "sampling/importance_sampling_ratio/min": 0.006580098997801542, "sampling/sampling_logp_difference/max": 5.02370548248291, "sampling/sampling_logp_difference/mean": 0.08842315524816513, "step": 137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2066892758011818, "epoch": 0.3631578947368421, "grad_norm": 0.003356419736519456, "learning_rate": 1e-06, "loss": 0.0146, "step": 138 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.20385056734085083, "epoch": 0.36578947368421055, "grad_norm": 0.029059309512376785, "learning_rate": 1e-06, "loss": 0.0064, "step": 139 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20674044638872147, "epoch": 0.3684210526315789, "grad_norm": 0.017651837319135666, "learning_rate": 1e-06, "loss": 0.0018, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6719.0, "completions/max_terminated_length": 6719.0, "completions/mean_length": 1592.986328125, "completions/mean_terminated_length": 1592.986328125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 0.20134814828634262, "epoch": 0.37105263157894736, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019348851637914777, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 45843294.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0445852279663086, "sampling/importance_sampling_ratio/min": 0.0025451150722801685, "sampling/sampling_logp_difference/max": 5.973579406738281, "sampling/sampling_logp_difference/mean": 0.08802568167448044, "step": 141 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20769786834716797, "epoch": 0.3736842105263158, "grad_norm": 0.0011362970108166337, "learning_rate": 1e-06, "loss": -0.0015, "step": 142 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2063521146774292, "epoch": 0.3763157894736842, "grad_norm": 0.001671973499469459, "learning_rate": 1e-06, "loss": -0.001, "step": 143 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20639032125473022, "epoch": 0.37894736842105264, "grad_norm": 0.01958642341196537, "learning_rate": 1e-06, "loss": 0.0043, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13829.0, "completions/mean_length": 1866.8828125, "completions/mean_terminated_length": 1809.9530029296875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.20035217702388763, "epoch": 0.3815789473684211, "frac_reward_zero_std": 0.78125, "grad_norm": 0.009615018032491207, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 47192258.0, "reward": 0.877197265625, "reward_std": 0.048984870314598083, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.98779296875, "rewards/symbolic_reward_partial_score/std": 0.08177053928375244, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043588638305664, "sampling/importance_sampling_ratio/min": 0.0023568610195070505, "sampling/sampling_logp_difference/max": 6.050424575805664, "sampling/sampling_logp_difference/mean": 0.08597667515277863, "step": 145 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2044568881392479, "epoch": 0.38421052631578945, "grad_norm": 0.007010954432189465, "learning_rate": 1e-06, "loss": 0.0077, "step": 146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.20459038019180298, "epoch": 0.3868421052631579, "grad_norm": 0.006047468166798353, "learning_rate": 1e-06, "loss": -0.0013, "step": 147 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.20103536546230316, "epoch": 0.3894736842105263, "grad_norm": 0.02023169957101345, "learning_rate": 1e-06, "loss": 0.041, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10930.0, "completions/max_terminated_length": 10930.0, "completions/mean_length": 1789.4453125, "completions/mean_terminated_length": 1789.4453125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.20401398837566376, "epoch": 0.39210526315789473, "frac_reward_zero_std": 0.9375, "grad_norm": 0.01833491213619709, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 48518278.0, "reward": 0.89599609375, "reward_std": 0.012571612372994423, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0439424514770508, "sampling/importance_sampling_ratio/min": 0.0017211800441145897, "sampling/sampling_logp_difference/max": 6.364745140075684, "sampling/sampling_logp_difference/mean": 0.08654054999351501, "step": 149 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20435316115617752, "epoch": 0.39473684210526316, "grad_norm": 0.002459852024912834, "learning_rate": 1e-06, "loss": -0.0026, "step": 150 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20143085718154907, "epoch": 0.3973684210526316, "grad_norm": 0.0016587390564382076, "learning_rate": 1e-06, "loss": -0.0028, "step": 151 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20314547419548035, "epoch": 0.4, "grad_norm": 0.0013358999276533723, "learning_rate": 1e-06, "loss": 0.013, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11794.0, "completions/max_terminated_length": 11794.0, "completions/mean_length": 1545.12109375, "completions/mean_terminated_length": 1545.12109375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.20236484706401825, "epoch": 0.4026315789473684, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0019950850401073694, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 49711364.0, "reward": 0.8971680402755737, "reward_std": 0.01132812537252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9983724355697632, "rewards/symbolic_reward_partial_score/std": 0.026533395051956177, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0447338819503784, "sampling/importance_sampling_ratio/min": 0.002444764832034707, "sampling/sampling_logp_difference/max": 6.013806343078613, "sampling/sampling_logp_difference/mean": 0.08731556683778763, "step": 153 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2040649652481079, "epoch": 0.4052631578947368, "grad_norm": 0.001786226173862815, "learning_rate": 1e-06, "loss": 0.0023, "step": 154 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20653317123651505, "epoch": 0.40789473684210525, "grad_norm": 0.0017548453761264682, "learning_rate": 1e-06, "loss": -0.0004, "step": 155 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.205593079328537, "epoch": 0.4105263157894737, "grad_norm": 0.0007879452896304429, "learning_rate": 1e-06, "loss": -0.0015, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6548.0, "completions/max_terminated_length": 6548.0, "completions/mean_length": 1599.859375, "completions/mean_terminated_length": 1599.859375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.20530398935079575, "epoch": 0.4131578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0027853313367813826, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 50939452.0, "reward": 0.89599609375, "reward_std": 0.01601562649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450772047042847, "sampling/importance_sampling_ratio/min": 0.001956768799573183, "sampling/sampling_logp_difference/max": 6.2364606857299805, "sampling/sampling_logp_difference/mean": 0.08825993537902832, "step": 157 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2067273110151291, "epoch": 0.41578947368421054, "grad_norm": 0.022896897047758102, "learning_rate": 1e-06, "loss": 0.0046, "step": 158 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.20823199301958084, "epoch": 0.41842105263157897, "grad_norm": 0.02219291776418686, "learning_rate": 1e-06, "loss": 0.0033, "step": 159 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2033417671918869, "epoch": 0.42105263157894735, "grad_norm": 0.009670994244515896, "learning_rate": 1e-06, "loss": -0.0021, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11035.0, "completions/max_terminated_length": 11035.0, "completions/mean_length": 1671.208984375, "completions/mean_terminated_length": 1671.208984375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.20134872198104858, "epoch": 0.4236842105263158, "frac_reward_zero_std": 0.90625, "grad_norm": 0.01667424663901329, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 52197095.0, "reward": 0.8926758170127869, "reward_std": 0.02172715961933136, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.05642101168632507, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0440382957458496, "sampling/importance_sampling_ratio/min": 0.0072457268834114075, "sampling/sampling_logp_difference/max": 4.927343368530273, "sampling/sampling_logp_difference/mean": 0.0872282162308693, "step": 161 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20248983800411224, "epoch": 0.4263157894736842, "grad_norm": 0.00344076263718307, "learning_rate": 1e-06, "loss": 0.0037, "step": 162 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.20415233820676804, "epoch": 0.42894736842105263, "grad_norm": 0.0032204578164964914, "learning_rate": 1e-06, "loss": 0.0118, "step": 163 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20045092701911926, "epoch": 0.43157894736842106, "grad_norm": 0.0043795425444841385, "learning_rate": 1e-06, "loss": -0.0081, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6054.0, "completions/max_terminated_length": 6054.0, "completions/mean_length": 1695.689453125, "completions/mean_terminated_length": 1695.689453125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.20333505421876907, "epoch": 0.4342105263157895, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0022764981258660555, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 53465608.0, "reward": 0.8974609375, "reward_std": 0.010156250558793545, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0437192916870117, "sampling/importance_sampling_ratio/min": 0.0021089445799589157, "sampling/sampling_logp_difference/max": 6.161567687988281, "sampling/sampling_logp_difference/mean": 0.08602581918239594, "step": 165 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.19720236212015152, "epoch": 0.4368421052631579, "grad_norm": 0.0020800495985895395, "learning_rate": 1e-06, "loss": -0.0022, "step": 166 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20310265570878983, "epoch": 0.4394736842105263, "grad_norm": 0.002397719770669937, "learning_rate": 1e-06, "loss": -0.002, "step": 167 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2030971497297287, "epoch": 0.4421052631578947, "grad_norm": 0.020433863624930382, "learning_rate": 1e-06, "loss": 0.0033, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6246.0, "completions/max_terminated_length": 6246.0, "completions/mean_length": 1551.830078125, "completions/mean_terminated_length": 1551.830078125, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.2052939310669899, "epoch": 0.44473684210526315, "frac_reward_zero_std": 0.9375, "grad_norm": 0.018735572695732117, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 54644401.0, "reward": 0.896484375, "reward_std": 0.01406250149011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0444538593292236, "sampling/importance_sampling_ratio/min": 0.0012084310874342918, "sampling/sampling_logp_difference/max": 6.718432426452637, "sampling/sampling_logp_difference/mean": 0.08754122257232666, "step": 169 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20626305788755417, "epoch": 0.4473684210526316, "grad_norm": 0.0018479041755199432, "learning_rate": 1e-06, "loss": -0.0022, "step": 170 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20383583009243011, "epoch": 0.45, "grad_norm": 0.0016645521391183138, "learning_rate": 1e-06, "loss": 0.0044, "step": 171 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20584166795015335, "epoch": 0.45263157894736844, "grad_norm": 0.0020347461104393005, "learning_rate": 1e-06, "loss": -0.0018, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11650.0, "completions/max_terminated_length": 11650.0, "completions/mean_length": 1625.888671875, "completions/mean_terminated_length": 1625.888671875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.2089935690164566, "epoch": 0.45526315789473687, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0025170159060508013, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 55855800.0, "reward": 0.89697265625, "reward_std": 0.01210937649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0446631908416748, "sampling/importance_sampling_ratio/min": 0.0064276158809661865, "sampling/sampling_logp_difference/max": 5.047151565551758, "sampling/sampling_logp_difference/mean": 0.0878274068236351, "step": 173 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20580630004405975, "epoch": 0.45789473684210524, "grad_norm": 0.03141406178474426, "learning_rate": 1e-06, "loss": 0.0196, "step": 174 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.202674500644207, "epoch": 0.4605263157894737, "grad_norm": 0.0014878485817462206, "learning_rate": 1e-06, "loss": 0.0055, "step": 175 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2056107148528099, "epoch": 0.4631578947368421, "grad_norm": 0.002387217478826642, "learning_rate": 1e-06, "loss": -0.0027, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6994.0, "completions/max_terminated_length": 6994.0, "completions/mean_length": 1598.962890625, "completions/mean_terminated_length": 1598.962890625, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.1980273425579071, "epoch": 0.46578947368421053, "frac_reward_zero_std": 0.9375, "grad_norm": 0.01680873893201351, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 57079301.0, "reward": 0.8973633050918579, "reward_std": 0.010546875186264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.015609703958034515, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0433744192123413, "sampling/importance_sampling_ratio/min": 0.007755798753350973, "sampling/sampling_logp_difference/max": 4.859314441680908, "sampling/sampling_logp_difference/mean": 0.08567874133586884, "step": 177 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20319605618715286, "epoch": 0.46842105263157896, "grad_norm": 0.0013190533500164747, "learning_rate": 1e-06, "loss": -0.0015, "step": 178 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.19863566756248474, "epoch": 0.4710526315789474, "grad_norm": 0.0011027466971427202, "learning_rate": 1e-06, "loss": -0.0017, "step": 179 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20039895921945572, "epoch": 0.47368421052631576, "grad_norm": 0.0016516863834112883, "learning_rate": 1e-06, "loss": 0.0036, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8289.0, "completions/max_terminated_length": 8289.0, "completions/mean_length": 1768.48046875, "completions/mean_terminated_length": 1768.48046875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.20320740342140198, "epoch": 0.4763157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001373225124552846, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 58398075.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0436054468154907, "sampling/importance_sampling_ratio/min": 0.006883922498673201, "sampling/sampling_logp_difference/max": 4.978566646575928, "sampling/sampling_logp_difference/mean": 0.08568958193063736, "step": 181 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20068252086639404, "epoch": 0.4789473684210526, "grad_norm": 0.0008993051596917212, "learning_rate": 1e-06, "loss": -0.0007, "step": 182 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20189565420150757, "epoch": 0.48157894736842105, "grad_norm": 0.0008057448430918157, "learning_rate": 1e-06, "loss": 0.0013, "step": 183 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20442000031471252, "epoch": 0.4842105263157895, "grad_norm": 0.0015741715906187892, "learning_rate": 1e-06, "loss": -0.001, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8313.0, "completions/max_terminated_length": 8313.0, "completions/mean_length": 1583.791015625, "completions/mean_terminated_length": 1583.791015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.20106445252895355, "epoch": 0.4868421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0010945764370262623, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 59623632.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04358971118927, "sampling/importance_sampling_ratio/min": 0.004502002149820328, "sampling/sampling_logp_difference/max": 5.403233051300049, "sampling/sampling_logp_difference/mean": 0.08590994775295258, "step": 185 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.1990531086921692, "epoch": 0.48947368421052634, "grad_norm": 0.0008449859451502562, "learning_rate": 1e-06, "loss": -0.0009, "step": 186 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20007263123989105, "epoch": 0.4921052631578947, "grad_norm": 0.0009163685026578605, "learning_rate": 1e-06, "loss": -0.0008, "step": 187 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.19802528619766235, "epoch": 0.49473684210526314, "grad_norm": 0.0008740195771679282, "learning_rate": 1e-06, "loss": -0.0008, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12552.0, "completions/max_terminated_length": 12552.0, "completions/mean_length": 1765.1875, "completions/mean_terminated_length": 1765.1875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.2067388892173767, "epoch": 0.49736842105263157, "frac_reward_zero_std": 0.90625, "grad_norm": 0.010071384720504284, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 60930032.0, "reward": 0.89599609375, "reward_std": 0.01601562649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044545292854309, "sampling/importance_sampling_ratio/min": 0.0025527041871100664, "sampling/sampling_logp_difference/max": 5.970602035522461, "sampling/sampling_logp_difference/mean": 0.08827649056911469, "step": 189 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.20628813654184341, "epoch": 0.5, "grad_norm": 0.029791492968797684, "learning_rate": 1e-06, "loss": 0.0154, "step": 190 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.21144938468933105, "epoch": 0.5026315789473684, "grad_norm": 0.01595800742506981, "learning_rate": 1e-06, "loss": 0.0003, "step": 191 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2084823176264763, "epoch": 0.5052631578947369, "grad_norm": 0.0016642897389829159, "learning_rate": 1e-06, "loss": -0.003, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6630.0, "completions/max_terminated_length": 6630.0, "completions/mean_length": 1644.80078125, "completions/mean_terminated_length": 1644.80078125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.1987621933221817, "epoch": 0.5078947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 62171466.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0439753532409668, "sampling/importance_sampling_ratio/min": 0.005273656919598579, "sampling/sampling_logp_difference/max": 5.245031356811523, "sampling/sampling_logp_difference/mean": 0.08630666136741638, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2056480348110199, "epoch": 0.5105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20121751725673676, "epoch": 0.5131578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2004585713148117, "epoch": 0.5157894736842106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8394.0, "completions/max_terminated_length": 8394.0, "completions/mean_length": 1475.658203125, "completions/mean_terminated_length": 1475.658203125, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.2078559324145317, "epoch": 0.5184210526315789, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0013712873915210366, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 63330811.0, "reward": 0.8969238996505737, "reward_std": 0.01230468787252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.045533329248428345, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0454661846160889, "sampling/importance_sampling_ratio/min": 0.002436209237203002, "sampling/sampling_logp_difference/max": 6.017312049865723, "sampling/sampling_logp_difference/mean": 0.08809937536716461, "step": 197 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2093820720911026, "epoch": 0.5210526315789473, "grad_norm": 0.0011987773468717933, "learning_rate": 1e-06, "loss": -0.0009, "step": 198 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20546579360961914, "epoch": 0.5236842105263158, "grad_norm": 0.026652248576283455, "learning_rate": 1e-06, "loss": 0.0147, "step": 199 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20495006442070007, "epoch": 0.5263157894736842, "grad_norm": 0.0006486320053227246, "learning_rate": 1e-06, "loss": -0.0011, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7120.0, "completions/max_terminated_length": 7120.0, "completions/mean_length": 1734.412109375, "completions/mean_terminated_length": 1734.412109375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.20813734084367752, "epoch": 0.5289473684210526, "frac_reward_zero_std": 0.9375, "grad_norm": 0.01946190744638443, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 64636334.0, "reward": 0.896191418170929, "reward_std": 0.01201616507023573, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451641082763672, "sampling/importance_sampling_ratio/min": 0.004545527510344982, "sampling/sampling_logp_difference/max": 5.393611431121826, "sampling/sampling_logp_difference/mean": 0.0885031446814537, "step": 201 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.20639535784721375, "epoch": 0.531578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0042, "step": 202 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.21026064455509186, "epoch": 0.5342105263157895, "grad_norm": 0.0026174013037234545, "learning_rate": 1e-06, "loss": -0.0032, "step": 203 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20871484279632568, "epoch": 0.5368421052631579, "grad_norm": 0.0027984855696558952, "learning_rate": 1e-06, "loss": -0.0029, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8250.0, "completions/max_terminated_length": 8250.0, "completions/mean_length": 1774.5390625, "completions/mean_terminated_length": 1774.5390625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "entropy": 0.20492591708898544, "epoch": 0.5394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0011379069183021784, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 65957890.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04496431350708, "sampling/importance_sampling_ratio/min": 0.0022515549790114164, "sampling/sampling_logp_difference/max": 6.096134185791016, "sampling/sampling_logp_difference/mean": 0.0886443629860878, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20791493356227875, "epoch": 0.5421052631578948, "grad_norm": 0.0014517066301777959, "learning_rate": 1e-06, "loss": -0.0006, "step": 206 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20718786120414734, "epoch": 0.5447368421052632, "grad_norm": 0.0006582405767403543, "learning_rate": 1e-06, "loss": -0.0005, "step": 207 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21040219068527222, "epoch": 0.5473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0033, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5315.0, "completions/max_terminated_length": 5315.0, "completions/mean_length": 1542.2421875, "completions/mean_terminated_length": 1542.2421875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.21437843143939972, "epoch": 0.55, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0035406171809881926, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 67129790.0, "reward": 0.8929687738418579, "reward_std": 0.02812500111758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.04957897961139679, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0469050407409668, "sampling/importance_sampling_ratio/min": 0.008794832043349743, "sampling/sampling_logp_difference/max": 4.733591079711914, "sampling/sampling_logp_difference/mean": 0.09154820442199707, "step": 209 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.21689823269844055, "epoch": 0.5526315789473685, "grad_norm": 0.022885171696543694, "learning_rate": 1e-06, "loss": 0.0136, "step": 210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.21222791820764542, "epoch": 0.5552631578947368, "grad_norm": 0.002514797495678067, "learning_rate": 1e-06, "loss": -0.0045, "step": 211 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.21504241228103638, "epoch": 0.5578947368421052, "grad_norm": 0.0030841731932014227, "learning_rate": 1e-06, "loss": 0.0016, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8602.0, "completions/max_terminated_length": 8602.0, "completions/mean_length": 1863.544921875, "completions/mean_terminated_length": 1863.544921875, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.2134803980588913, "epoch": 0.5605263157894737, "frac_reward_zero_std": 0.84375, "grad_norm": 0.01821248233318329, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 68479733.0, "reward": 0.8914550542831421, "reward_std": 0.03037816472351551, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9949544072151184, "rewards/symbolic_reward_partial_score/std": 0.05604410544037819, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0458984375, "sampling/importance_sampling_ratio/min": 0.008836278691887856, "sampling/sampling_logp_difference/max": 4.728889465332031, "sampling/sampling_logp_difference/mean": 0.09042114019393921, "step": 213 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2126595377922058, "epoch": 0.5631578947368421, "grad_norm": 0.004523898474872112, "learning_rate": 1e-06, "loss": -0.0081, "step": 214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2155783772468567, "epoch": 0.5657894736842105, "grad_norm": 0.024723783135414124, "learning_rate": 1e-06, "loss": 0.0109, "step": 215 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20893920958042145, "epoch": 0.5684210526315789, "grad_norm": 0.022363588213920593, "learning_rate": 1e-06, "loss": -0.0014, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6386.0, "completions/max_terminated_length": 6386.0, "completions/mean_length": 1714.8359375, "completions/mean_terminated_length": 1714.8359375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "entropy": 0.21516434848308563, "epoch": 0.5710526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.018323076888918877, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 69759873.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0460017919540405, "sampling/importance_sampling_ratio/min": 0.005587624851614237, "sampling/sampling_logp_difference/max": 5.187201023101807, "sampling/sampling_logp_difference/mean": 0.09048692137002945, "step": 217 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21278921514749527, "epoch": 0.5736842105263158, "grad_norm": 0.001296954695135355, "learning_rate": 1e-06, "loss": -0.0008, "step": 218 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2114720195531845, "epoch": 0.5763157894736842, "grad_norm": 0.0012835001107305288, "learning_rate": 1e-06, "loss": -0.001, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2083267718553543, "epoch": 0.5789473684210527, "grad_norm": 0.0017823954112827778, "learning_rate": 1e-06, "loss": -0.0011, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8512.0, "completions/max_terminated_length": 8512.0, "completions/mean_length": 1752.11328125, "completions/mean_terminated_length": 1752.11328125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 0.2101665735244751, "epoch": 0.5815789473684211, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0014260360039770603, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 71069627.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04522705078125, "sampling/importance_sampling_ratio/min": 0.0064138080924749374, "sampling/sampling_logp_difference/max": 5.049302101135254, "sampling/sampling_logp_difference/mean": 0.08903295546770096, "step": 221 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2055136188864708, "epoch": 0.5842105263157895, "grad_norm": 0.0009096140274778008, "learning_rate": 1e-06, "loss": -0.0005, "step": 222 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21155240386724472, "epoch": 0.5868421052631579, "grad_norm": 0.001043591764755547, "learning_rate": 1e-06, "loss": -0.0006, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21102114021778107, "epoch": 0.5894736842105263, "grad_norm": 0.020602818578481674, "learning_rate": 1e-06, "loss": 0.0037, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9736.0, "completions/max_terminated_length": 9736.0, "completions/mean_length": 1842.458984375, "completions/mean_terminated_length": 1842.458984375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 0.2156578153371811, "epoch": 0.5921052631578947, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0008444565464742482, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 72374726.0, "reward": 0.8999991416931152, "reward_std": 3.7672618873330066e-06, "rewards/progression_diversity/mean": -9.418398985872045e-05, "rewards/progression_diversity/std": 0.0021311405580490828, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0471808910369873, "sampling/importance_sampling_ratio/min": 0.0061684283427894115, "sampling/sampling_logp_difference/max": 5.088311195373535, "sampling/sampling_logp_difference/mean": 0.09226832538843155, "step": 225 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21982311457395554, "epoch": 0.5947368421052631, "grad_norm": 0.0005437855143100023, "learning_rate": 1e-06, "loss": -0.0003, "step": 226 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21570423990488052, "epoch": 0.5973684210526315, "grad_norm": 0.01244170218706131, "learning_rate": 1e-06, "loss": 0.0094, "step": 227 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.21865856647491455, "epoch": 0.6, "grad_norm": 0.0010427986271679401, "learning_rate": 1e-06, "loss": -0.0005, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6001.0, "completions/max_terminated_length": 6001.0, "completions/mean_length": 1712.8515625, "completions/mean_terminated_length": 1712.8515625, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.21149300783872604, "epoch": 0.6026315789473684, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002495717955753207, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 73636826.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451886653900146, "sampling/importance_sampling_ratio/min": 0.0031846752390265465, "sampling/sampling_logp_difference/max": 5.7494049072265625, "sampling/sampling_logp_difference/mean": 0.08880776911973953, "step": 229 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2117486447095871, "epoch": 0.6052631578947368, "grad_norm": 0.0016441460466012359, "learning_rate": 1e-06, "loss": -0.002, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2056504786014557, "epoch": 0.6078947368421053, "grad_norm": 0.0019054630538448691, "learning_rate": 1e-06, "loss": -0.0014, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.204166479408741, "epoch": 0.6105263157894737, "grad_norm": 0.009160654619336128, "learning_rate": 1e-06, "loss": 0.0022, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10948.0, "completions/max_terminated_length": 10948.0, "completions/mean_length": 1679.716796875, "completions/mean_terminated_length": 1679.716796875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.20356527715921402, "epoch": 0.6131578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003294473048299551, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 74897321.0, "reward": 0.8961914777755737, "reward_std": 0.01201616507023573, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0435457229614258, "sampling/importance_sampling_ratio/min": 0.006915641948580742, "sampling/sampling_logp_difference/max": 4.973969459533691, "sampling/sampling_logp_difference/mean": 0.0868978351354599, "step": 233 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20163266360759735, "epoch": 0.6157894736842106, "grad_norm": 0.026085246354341507, "learning_rate": 1e-06, "loss": 0.0103, "step": 234 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.20436248183250427, "epoch": 0.618421052631579, "grad_norm": 0.003504726802930236, "learning_rate": 1e-06, "loss": -0.0038, "step": 235 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2010190784931183, "epoch": 0.6210526315789474, "grad_norm": 0.011520801112055779, "learning_rate": 1e-06, "loss": -0.0034, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8684.0, "completions/max_terminated_length": 8684.0, "completions/mean_length": 1731.49609375, "completions/mean_terminated_length": 1731.49609375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 0.20273616909980774, "epoch": 0.6236842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.011561810038983822, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 76190823.0, "reward": 0.8974121809005737, "reward_std": 0.01035156287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9991861581802368, "rewards/symbolic_reward_partial_score/std": 0.013266698457300663, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0434190034866333, "sampling/importance_sampling_ratio/min": 0.003309565130621195, "sampling/sampling_logp_difference/max": 5.710938453674316, "sampling/sampling_logp_difference/mean": 0.08634347468614578, "step": 237 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.19939197599887848, "epoch": 0.6263157894736842, "grad_norm": 0.002393747214227915, "learning_rate": 1e-06, "loss": -0.0028, "step": 238 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.19980376213788986, "epoch": 0.6289473684210526, "grad_norm": 0.002126500476151705, "learning_rate": 1e-06, "loss": -0.0006, "step": 239 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20721298456192017, "epoch": 0.631578947368421, "grad_norm": 0.0021553891710937023, "learning_rate": 1e-06, "loss": -0.0021, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6395.0, "completions/max_terminated_length": 6395.0, "completions/mean_length": 1841.615234375, "completions/mean_terminated_length": 1841.615234375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "entropy": 0.20705539733171463, "epoch": 0.6342105263157894, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0024814207572489977, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 77514498.0, "reward": 0.8967773914337158, "reward_std": 0.01289062574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453053712844849, "sampling/importance_sampling_ratio/min": 0.004106632433831692, "sampling/sampling_logp_difference/max": 5.495151996612549, "sampling/sampling_logp_difference/mean": 0.0892755463719368, "step": 241 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.21014024317264557, "epoch": 0.6368421052631579, "grad_norm": 0.0027103442698717117, "learning_rate": 1e-06, "loss": 0.0059, "step": 242 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2095521092414856, "epoch": 0.6394736842105263, "grad_norm": 0.0022136298939585686, "learning_rate": 1e-06, "loss": -0.0025, "step": 243 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2114603966474533, "epoch": 0.6421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0073, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7414.0, "completions/max_terminated_length": 7414.0, "completions/mean_length": 1889.2265625, "completions/mean_terminated_length": 1889.2265625, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "entropy": 0.2019776701927185, "epoch": 0.6447368421052632, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0015179699985310435, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 78899446.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0435829162597656, "sampling/importance_sampling_ratio/min": 0.0036078530829399824, "sampling/sampling_logp_difference/max": 5.624642372131348, "sampling/sampling_logp_difference/mean": 0.08568894863128662, "step": 245 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2012660875916481, "epoch": 0.6473684210526316, "grad_norm": 0.0017207972705364227, "learning_rate": 1e-06, "loss": -0.0017, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19967712461948395, "epoch": 0.65, "grad_norm": 0.0012365675065666437, "learning_rate": 1e-06, "loss": -0.0009, "step": 247 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20087897777557373, "epoch": 0.6526315789473685, "grad_norm": 0.0017120677512139082, "learning_rate": 1e-06, "loss": 0.0052, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12530.0, "completions/max_terminated_length": 12530.0, "completions/mean_length": 1972.412109375, "completions/mean_terminated_length": 1972.412109375, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.20324888825416565, "epoch": 0.6552631578947369, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016805394552648067, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 80328649.0, "reward": 0.8984375, "reward_std": 0.0062500000931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9986979365348816, "rewards/symbolic_reward_partial_score/std": 0.0294627845287323, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0433330535888672, "sampling/importance_sampling_ratio/min": 0.007705639582127333, "sampling/sampling_logp_difference/max": 4.865802764892578, "sampling/sampling_logp_difference/mean": 0.08573935180902481, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2015375792980194, "epoch": 0.6578947368421053, "grad_norm": 0.034747153520584106, "learning_rate": 1e-06, "loss": 0.0202, "step": 250 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20250868797302246, "epoch": 0.6605263157894737, "grad_norm": 0.001513694878667593, "learning_rate": 1e-06, "loss": -0.0011, "step": 251 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.20309287309646606, "epoch": 0.6631578947368421, "grad_norm": 0.0011689492966979742, "learning_rate": 1e-06, "loss": -0.0012, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12100.0, "completions/max_terminated_length": 12100.0, "completions/mean_length": 2008.337890625, "completions/mean_terminated_length": 2008.337890625, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.2088308185338974, "epoch": 0.6657894736842105, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00338501064106822, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 81765078.0, "reward": 0.8958984613418579, "reward_std": 0.008836532942950726, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.027517499402165413, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453110933303833, "sampling/importance_sampling_ratio/min": 0.008764754980802536, "sampling/sampling_logp_difference/max": 4.737016677856445, "sampling/sampling_logp_difference/mean": 0.08917136490345001, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21205566078424454, "epoch": 0.6684210526315789, "grad_norm": 0.004588665906339884, "learning_rate": 1e-06, "loss": 0.0005, "step": 254 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21336667984724045, "epoch": 0.6710526315789473, "grad_norm": 0.0032944870181381702, "learning_rate": 1e-06, "loss": -0.0031, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2149658203125, "epoch": 0.6736842105263158, "grad_norm": 0.004138459451496601, "learning_rate": 1e-06, "loss": 0.0065, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6211.0, "completions/max_terminated_length": 6211.0, "completions/mean_length": 1841.73828125, "completions/mean_terminated_length": 1841.73828125, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.21085526794195175, "epoch": 0.6763157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 83110352.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0461163520812988, "sampling/importance_sampling_ratio/min": 0.0036083245649933815, "sampling/sampling_logp_difference/max": 5.62451171875, "sampling/sampling_logp_difference/mean": 0.09053587913513184, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21499212086200714, "epoch": 0.6789473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21483533084392548, "epoch": 0.6815789473684211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21817511320114136, "epoch": 0.6842105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6017.0, "completions/max_terminated_length": 6017.0, "completions/mean_length": 1733.3125, "completions/mean_terminated_length": 1733.3125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.22190633416175842, "epoch": 0.6868421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0022445686627179384, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 84379600.0, "reward": 0.8969238996505737, "reward_std": 0.008503163233399391, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.045533329248428345, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048039197921753, "sampling/importance_sampling_ratio/min": 0.005418260581791401, "sampling/sampling_logp_difference/max": 5.21798038482666, "sampling/sampling_logp_difference/mean": 0.09296934306621552, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21962124109268188, "epoch": 0.6894736842105263, "grad_norm": 0.016535937786102295, "learning_rate": 1e-06, "loss": 0.0045, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22264452278614044, "epoch": 0.6921052631578948, "grad_norm": 0.0030597581062465906, "learning_rate": 1e-06, "loss": -0.0021, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22010182589292526, "epoch": 0.6947368421052632, "grad_norm": 0.002245362149551511, "learning_rate": 1e-06, "loss": -0.0016, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9783.0, "completions/max_terminated_length": 9783.0, "completions/mean_length": 2136.318359375, "completions/mean_terminated_length": 2136.318359375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "entropy": 0.22025451809167862, "epoch": 0.6973684210526315, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0035949989687651396, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 85883539.0, "reward": 0.8972656726837158, "reward_std": 0.007471735123544931, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9986979365348816, "rewards/symbolic_reward_partial_score/std": 0.02081293798983097, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0478429794311523, "sampling/importance_sampling_ratio/min": 0.004194667097181082, "sampling/sampling_logp_difference/max": 5.473941326141357, "sampling/sampling_logp_difference/mean": 0.09309262037277222, "step": 265 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.21724402904510498, "epoch": 0.7, "grad_norm": 0.0031054310966283083, "learning_rate": 1e-06, "loss": -0.0028, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.21982571482658386, "epoch": 0.7026315789473684, "grad_norm": 0.003461057087406516, "learning_rate": 1e-06, "loss": 0.0135, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2251833751797676, "epoch": 0.7052631578947368, "grad_norm": 0.003466780995950103, "learning_rate": 1e-06, "loss": -0.0033, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7831.0, "completions/max_terminated_length": 7831.0, "completions/mean_length": 2102.580078125, "completions/mean_terminated_length": 2102.580078125, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.23076550662517548, "epoch": 0.7078947368421052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 87329340.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0496454238891602, "sampling/importance_sampling_ratio/min": 0.005962003022432327, "sampling/sampling_logp_difference/max": 5.122348785400391, "sampling/sampling_logp_difference/mean": 0.09642425179481506, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2311522662639618, "epoch": 0.7105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22953198850154877, "epoch": 0.7131578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23017492145299911, "epoch": 0.7157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7945.0, "completions/max_terminated_length": 7945.0, "completions/mean_length": 2125.33984375, "completions/mean_terminated_length": 2125.33984375, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "entropy": 0.2236594781279564, "epoch": 0.718421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019094354938715696, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 88837002.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0481009483337402, "sampling/importance_sampling_ratio/min": 0.007017483469098806, "sampling/sampling_logp_difference/max": 4.9593505859375, "sampling/sampling_logp_difference/mean": 0.09402163326740265, "step": 273 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2232005000114441, "epoch": 0.7210526315789474, "grad_norm": 0.0018970274832099676, "learning_rate": 1e-06, "loss": -0.0014, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2264328896999359, "epoch": 0.7236842105263158, "grad_norm": 0.0015384983271360397, "learning_rate": 1e-06, "loss": -0.0011, "step": 275 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.225415401160717, "epoch": 0.7263157894736842, "grad_norm": 0.0010754374088719487, "learning_rate": 1e-06, "loss": -0.001, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11061.0, "completions/max_terminated_length": 11061.0, "completions/mean_length": 2468.31640625, "completions/mean_terminated_length": 2468.31640625, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.21986736357212067, "epoch": 0.7289473684210527, "frac_reward_zero_std": 0.875, "grad_norm": 0.004219180904328823, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 90535788.0, "reward": 0.8944336175918579, "reward_std": 0.02226562798023224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.045950260013341904, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0480650663375854, "sampling/importance_sampling_ratio/min": 0.006456881761550903, "sampling/sampling_logp_difference/max": 5.042608737945557, "sampling/sampling_logp_difference/mean": 0.09354418516159058, "step": 277 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.22399064153432846, "epoch": 0.7315789473684211, "grad_norm": 0.003046881640329957, "learning_rate": 1e-06, "loss": 0.0017, "step": 278 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.22623611986637115, "epoch": 0.7342105263157894, "grad_norm": 0.003085202304646373, "learning_rate": 1e-06, "loss": 0.0172, "step": 279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2218364030122757, "epoch": 0.7368421052631579, "grad_norm": 0.0031208612490445375, "learning_rate": 1e-06, "loss": -0.0092, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10669.0, "completions/max_terminated_length": 10669.0, "completions/mean_length": 2234.390625, "completions/mean_terminated_length": 2234.390625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.22583410143852234, "epoch": 0.7394736842105263, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003318504663184285, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 92103956.0, "reward": 0.8955078125, "reward_std": 0.014524737372994423, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9967447519302368, "rewards/symbolic_reward_partial_score/std": 0.04989916458725929, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0479704141616821, "sampling/importance_sampling_ratio/min": 0.002618646016344428, "sampling/sampling_logp_difference/max": 5.945097923278809, "sampling/sampling_logp_difference/mean": 0.09308265149593353, "step": 281 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.22214125841856003, "epoch": 0.7421052631578947, "grad_norm": 0.0023426981642842293, "learning_rate": 1e-06, "loss": -0.0011, "step": 282 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.22179365158081055, "epoch": 0.7447368421052631, "grad_norm": 0.0018474400276318192, "learning_rate": 1e-06, "loss": -0.0042, "step": 283 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.22396309673786163, "epoch": 0.7473684210526316, "grad_norm": 0.019540395587682724, "learning_rate": 1e-06, "loss": 0.0032, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12723.0, "completions/mean_length": 2388.96875, "completions/mean_terminated_length": 2361.581298828125, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.22346467524766922, "epoch": 0.75, "frac_reward_zero_std": 0.875, "grad_norm": 0.013754500076174736, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 93741444.0, "reward": 0.8922852277755737, "reward_std": 0.027415363118052483, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9944661259651184, "rewards/symbolic_reward_partial_score/std": 0.0669415220618248, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0481053590774536, "sampling/importance_sampling_ratio/min": 0.006317873951047659, "sampling/sampling_logp_difference/max": 5.064372539520264, "sampling/sampling_logp_difference/mean": 0.09325209259986877, "step": 285 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.22469305992126465, "epoch": 0.7526315789473684, "grad_norm": 0.0036299778148531914, "learning_rate": 1e-06, "loss": 0.0228, "step": 286 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.21972357481718063, "epoch": 0.7552631578947369, "grad_norm": 0.004400473088026047, "learning_rate": 1e-06, "loss": -0.01, "step": 287 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.22767239809036255, "epoch": 0.7578947368421053, "grad_norm": 0.018934298306703568, "learning_rate": 1e-06, "loss": 0.0165, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7534.0, "completions/max_terminated_length": 7534.0, "completions/mean_length": 2019.125, "completions/mean_terminated_length": 2019.125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.22602290660142899, "epoch": 0.7605263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0012526485370472074, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 95188388.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0487861633300781, "sampling/importance_sampling_ratio/min": 0.00233937194570899, "sampling/sampling_logp_difference/max": 6.057872772216797, "sampling/sampling_logp_difference/mean": 0.0950511246919632, "step": 289 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2291054129600525, "epoch": 0.7631578947368421, "grad_norm": 0.0007504570530727506, "learning_rate": 1e-06, "loss": -0.0005, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.22907574474811554, "epoch": 0.7657894736842106, "grad_norm": 0.0006523334886878729, "learning_rate": 1e-06, "loss": 0.0051, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2235885113477707, "epoch": 0.7684210526315789, "grad_norm": 0.0011142125586047769, "learning_rate": 1e-06, "loss": -0.0007, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9810.0, "completions/max_terminated_length": 9810.0, "completions/mean_length": 2139.501953125, "completions/mean_terminated_length": 2139.501953125, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "entropy": 0.22799117863178253, "epoch": 0.7710526315789473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 96685637.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0489470958709717, "sampling/importance_sampling_ratio/min": 0.0006613240111619234, "sampling/sampling_logp_difference/max": 7.3212666511535645, "sampling/sampling_logp_difference/mean": 0.09484447538852692, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22720052301883698, "epoch": 0.7736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22484512627124786, "epoch": 0.7763157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22535771131515503, "epoch": 0.7789473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8716.0, "completions/max_terminated_length": 8716.0, "completions/mean_length": 2135.27734375, "completions/mean_terminated_length": 2135.27734375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "entropy": 0.2249126136302948, "epoch": 0.781578947368421, "frac_reward_zero_std": 0.875, "grad_norm": 0.0038070736918598413, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 98187539.0, "reward": 0.892285168170929, "reward_std": 0.027144351974129677, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9938150644302368, "rewards/symbolic_reward_partial_score/std": 0.07005997002124786, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0485694408416748, "sampling/importance_sampling_ratio/min": 0.005549346096813679, "sampling/sampling_logp_difference/max": 5.194075107574463, "sampling/sampling_logp_difference/mean": 0.09457787871360779, "step": 297 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.224434956908226, "epoch": 0.7842105263157895, "grad_norm": 0.004503680858761072, "learning_rate": 1e-06, "loss": 0.0045, "step": 298 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.22570159286260605, "epoch": 0.7868421052631579, "grad_norm": 0.0022635413333773613, "learning_rate": 1e-06, "loss": -0.0038, "step": 299 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.225453682243824, "epoch": 0.7894736842105263, "grad_norm": 0.01593317836523056, "learning_rate": 1e-06, "loss": 0.0001, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10320.0, "completions/max_terminated_length": 10320.0, "completions/mean_length": 2006.96484375, "completions/mean_terminated_length": 2006.96484375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.2217450886964798, "epoch": 0.7921052631578948, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016080178320407867, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 99638945.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0480682849884033, "sampling/importance_sampling_ratio/min": 0.007076449226588011, "sampling/sampling_logp_difference/max": 4.950983047485352, "sampling/sampling_logp_difference/mean": 0.09347077459096909, "step": 301 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.22661878913640976, "epoch": 0.7947368421052632, "grad_norm": 0.0017253260593861341, "learning_rate": 1e-06, "loss": -0.0011, "step": 302 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.22114606946706772, "epoch": 0.7973684210526316, "grad_norm": 0.0008624455076642334, "learning_rate": 1e-06, "loss": -0.0007, "step": 303 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2234853431582451, "epoch": 0.8, "grad_norm": 0.001347598503343761, "learning_rate": 1e-06, "loss": 0.0008, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9895.0, "completions/max_terminated_length": 9895.0, "completions/mean_length": 2228.990234375, "completions/mean_terminated_length": 2228.990234375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "entropy": 0.23081833124160767, "epoch": 0.8026315789473685, "frac_reward_zero_std": 0.9375, "grad_norm": 0.00201100273989141, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 101188668.0, "reward": 0.8967773914337158, "reward_std": 0.01289062574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049689531326294, "sampling/importance_sampling_ratio/min": 0.0004419268516357988, "sampling/sampling_logp_difference/max": 7.724366188049316, "sampling/sampling_logp_difference/mean": 0.09700985252857208, "step": 305 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2303386703133583, "epoch": 0.8052631578947368, "grad_norm": 0.0014438509242609143, "learning_rate": 1e-06, "loss": -0.0003, "step": 306 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.23390042781829834, "epoch": 0.8078947368421052, "grad_norm": 0.0014085390139371157, "learning_rate": 1e-06, "loss": -0.0016, "step": 307 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23198667913675308, "epoch": 0.8105263157894737, "grad_norm": 0.001618702313862741, "learning_rate": 1e-06, "loss": -0.0024, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14858.0, "completions/max_terminated_length": 14858.0, "completions/mean_length": 2356.30078125, "completions/mean_terminated_length": 2356.30078125, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "entropy": 0.2375488206744194, "epoch": 0.8131578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0034801126457750797, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 102786230.0, "reward": 0.89697265625, "reward_std": 0.008394349366426468, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506583452224731, "sampling/importance_sampling_ratio/min": 0.00634022755548358, "sampling/sampling_logp_difference/max": 5.060840606689453, "sampling/sampling_logp_difference/mean": 0.09814760833978653, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23480720818042755, "epoch": 0.8157894736842105, "grad_norm": 0.023176204413175583, "learning_rate": 1e-06, "loss": 0.0143, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23360728472471237, "epoch": 0.8184210526315789, "grad_norm": 0.0038372529670596123, "learning_rate": 1e-06, "loss": -0.0048, "step": 311 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23397377878427505, "epoch": 0.8210526315789474, "grad_norm": 0.0036800107918679714, "learning_rate": 1e-06, "loss": -0.0048, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12802.0, "completions/max_terminated_length": 12802.0, "completions/mean_length": 2314.001953125, "completions/mean_terminated_length": 2314.001953125, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.2241261526942253, "epoch": 0.8236842105263158, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017415963113307953, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 104380983.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.010406470857560635, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0495615005493164, "sampling/importance_sampling_ratio/min": 0.002217768458649516, "sampling/sampling_logp_difference/max": 6.11125373840332, "sampling/sampling_logp_difference/mean": 0.09616326540708542, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23093115538358688, "epoch": 0.8263157894736842, "grad_norm": 0.0028665196150541306, "learning_rate": 1e-06, "loss": -0.0038, "step": 314 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.22975538671016693, "epoch": 0.8289473684210527, "grad_norm": 0.0031550289131700993, "learning_rate": 1e-06, "loss": -0.0046, "step": 315 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23307201266288757, "epoch": 0.8315789473684211, "grad_norm": 0.020213622599840164, "learning_rate": 1e-06, "loss": 0.0123, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8564.0, "completions/max_terminated_length": 8564.0, "completions/mean_length": 2387.298828125, "completions/mean_terminated_length": 2387.298828125, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.23519239574670792, "epoch": 0.8342105263157895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 106025776.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0499684810638428, "sampling/importance_sampling_ratio/min": 0.005841280799359083, "sampling/sampling_logp_difference/max": 5.142805099487305, "sampling/sampling_logp_difference/mean": 0.09669101238250732, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23080847412347794, "epoch": 0.8368421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23335885256528854, "epoch": 0.8394736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.232472762465477, "epoch": 0.8421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7714.0, "completions/max_terminated_length": 7714.0, "completions/mean_length": 2196.029296875, "completions/mean_terminated_length": 2196.029296875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.2395249679684639, "epoch": 0.8447368421052631, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 107538591.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512621402740479, "sampling/importance_sampling_ratio/min": 0.0038059144280850887, "sampling/sampling_logp_difference/max": 5.5711989402771, "sampling/sampling_logp_difference/mean": 0.09992579370737076, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24127565324306488, "epoch": 0.8473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23968005180358887, "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23994791507720947, "epoch": 0.8526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12491.0, "completions/mean_length": 2393.998046875, "completions/mean_terminated_length": 2339.135498046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "entropy": 0.2380136027932167, "epoch": 0.8552631578947368, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0029275177512317896, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 109177310.0, "reward": 0.8962891101837158, "reward_std": 0.014843751676380634, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0509185791015625, "sampling/importance_sampling_ratio/min": 0.005248794797807932, "sampling/sampling_logp_difference/max": 5.249756813049316, "sampling/sampling_logp_difference/mean": 0.09781131148338318, "step": 325 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23577817529439926, "epoch": 0.8578947368421053, "grad_norm": 0.03142658993601799, "learning_rate": 1e-06, "loss": 0.0248, "step": 326 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2361370250582695, "epoch": 0.8605263157894737, "grad_norm": 0.028767094016075134, "learning_rate": 1e-06, "loss": 0.0495, "step": 327 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23896943032741547, "epoch": 0.8631578947368421, "grad_norm": 0.002342435298487544, "learning_rate": 1e-06, "loss": -0.0051, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10039.0, "completions/mean_length": 2652.3203125, "completions/mean_terminated_length": 2598.470703125, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "entropy": 0.23856234550476074, "epoch": 0.8657894736842106, "frac_reward_zero_std": 0.9375, "grad_norm": 0.024437906220555305, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 110934434.0, "reward": 0.8996094465255737, "reward_std": 0.0015625003725290298, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0511343479156494, "sampling/importance_sampling_ratio/min": 0.002698114374652505, "sampling/sampling_logp_difference/max": 5.9152021408081055, "sampling/sampling_logp_difference/mean": 0.09833870828151703, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24087556451559067, "epoch": 0.868421052631579, "grad_norm": 0.0028070711996406317, "learning_rate": 1e-06, "loss": -0.0035, "step": 330 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.24100051820278168, "epoch": 0.8710526315789474, "grad_norm": 0.003012464614585042, "learning_rate": 1e-06, "loss": 0.0265, "step": 331 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2416038140654564, "epoch": 0.8736842105263158, "grad_norm": 0.0026663094758987427, "learning_rate": 1e-06, "loss": -0.0028, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8142.0, "completions/mean_length": 2450.986328125, "completions/mean_terminated_length": 2423.72021484375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.2466694340109825, "epoch": 0.8763157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019038430182263255, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 112593147.0, "reward": 0.8998047113418579, "reward_std": 0.0007812501862645149, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0524094104766846, "sampling/importance_sampling_ratio/min": 0.0038816439919173717, "sampling/sampling_logp_difference/max": 5.551496505737305, "sampling/sampling_logp_difference/mean": 0.10092766582965851, "step": 333 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24384784698486328, "epoch": 0.8789473684210526, "grad_norm": 0.0009518684819340706, "learning_rate": 1e-06, "loss": -0.0013, "step": 334 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24513080716133118, "epoch": 0.881578947368421, "grad_norm": 0.02787371724843979, "learning_rate": 1e-06, "loss": 0.0292, "step": 335 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24995265156030655, "epoch": 0.8842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0016, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7755.0, "completions/max_terminated_length": 7755.0, "completions/mean_length": 2148.58203125, "completions/mean_terminated_length": 2148.58203125, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "entropy": 0.25536780059337616, "epoch": 0.8868421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 114091493.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.054577350616455, "sampling/importance_sampling_ratio/min": 0.005718884989619255, "sampling/sampling_logp_difference/max": 5.1639814376831055, "sampling/sampling_logp_difference/mean": 0.10422345995903015, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2523108869791031, "epoch": 0.8894736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2559451460838318, "epoch": 0.8921052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25245994329452515, "epoch": 0.8947368421052632, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7512.0, "completions/max_terminated_length": 7512.0, "completions/mean_length": 2270.498046875, "completions/mean_terminated_length": 2270.498046875, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.2544011175632477, "epoch": 0.8973684210526316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 115651620.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05623197555542, "sampling/importance_sampling_ratio/min": 0.001793963136151433, "sampling/sampling_logp_difference/max": 6.323328018188477, "sampling/sampling_logp_difference/mean": 0.10619939863681793, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.262422651052475, "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26385945081710815, "epoch": 0.9026315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26096177101135254, "epoch": 0.9052631578947369, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7878.0, "completions/max_terminated_length": 7878.0, "completions/mean_length": 2223.076171875, "completions/mean_terminated_length": 2223.076171875, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "entropy": 0.2525327652692795, "epoch": 0.9078947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 117218507.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0553967952728271, "sampling/importance_sampling_ratio/min": 0.0015314362244680524, "sampling/sampling_logp_difference/max": 6.481549263000488, "sampling/sampling_logp_difference/mean": 0.10489603132009506, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25677473843097687, "epoch": 0.9105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26054495573043823, "epoch": 0.9131578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25792329013347626, "epoch": 0.9157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7940.0, "completions/max_terminated_length": 7940.0, "completions/mean_length": 2171.408203125, "completions/mean_terminated_length": 2171.408203125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.26010720431804657, "epoch": 0.9184210526315789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 118728060.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0559751987457275, "sampling/importance_sampling_ratio/min": 0.0019339441787451506, "sampling/sampling_logp_difference/max": 6.248193740844727, "sampling/sampling_logp_difference/mean": 0.10623260587453842, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25828756392002106, "epoch": 0.9210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25984811782836914, "epoch": 0.9236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2607768476009369, "epoch": 0.9263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9770.0, "completions/max_terminated_length": 9770.0, "completions/mean_length": 2271.884765625, "completions/mean_terminated_length": 2271.884765625, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "entropy": 0.2634229362010956, "epoch": 0.9289473684210526, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001863920479081571, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 120299905.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0567827224731445, "sampling/importance_sampling_ratio/min": 0.001876403228379786, "sampling/sampling_logp_difference/max": 6.278398513793945, "sampling/sampling_logp_difference/mean": 0.10805174708366394, "step": 353 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.26522062718868256, "epoch": 0.9315789473684211, "grad_norm": 0.0010967060225084424, "learning_rate": 1e-06, "loss": 0.0021, "step": 354 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.26312026381492615, "epoch": 0.9342105263157895, "grad_norm": 0.0011214044643566012, "learning_rate": 1e-06, "loss": -0.001, "step": 355 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2659604549407959, "epoch": 0.9368421052631579, "grad_norm": 0.001248165383003652, "learning_rate": 1e-06, "loss": -0.0017, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 10414.0, "completions/mean_length": 2434.859375, "completions/mean_terminated_length": 2407.5615234375, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "entropy": 0.2705574333667755, "epoch": 0.9394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0029057427309453487, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 121950521.0, "reward": 0.8973633050918579, "reward_std": 0.007210533134639263, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.016457298770546913, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0575973987579346, "sampling/importance_sampling_ratio/min": 0.004899222403764725, "sampling/sampling_logp_difference/max": 5.318678855895996, "sampling/sampling_logp_difference/mean": 0.10986323654651642, "step": 357 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.26986919343471527, "epoch": 0.9421052631578948, "grad_norm": 0.0018621939234435558, "learning_rate": 1e-06, "loss": -0.004, "step": 358 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2656458765268326, "epoch": 0.9447368421052632, "grad_norm": 0.026229264214634895, "learning_rate": 1e-06, "loss": 0.0178, "step": 359 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27086082100868225, "epoch": 0.9473684210526315, "grad_norm": 0.0021815821528434753, "learning_rate": 1e-06, "loss": -0.0049, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7569.0, "completions/max_terminated_length": 7569.0, "completions/mean_length": 2336.83984375, "completions/mean_terminated_length": 2336.83984375, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.27485327422618866, "epoch": 0.95, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016712540527805686, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 123533127.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0588257312774658, "sampling/importance_sampling_ratio/min": 0.005976428277790546, "sampling/sampling_logp_difference/max": 5.119932174682617, "sampling/sampling_logp_difference/mean": 0.11021911352872849, "step": 361 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2705720067024231, "epoch": 0.9526315789473684, "grad_norm": 0.0009871588554233313, "learning_rate": 1e-06, "loss": -0.0013, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.26575684547424316, "epoch": 0.9552631578947368, "grad_norm": 0.0014662131434306502, "learning_rate": 1e-06, "loss": 0.0065, "step": 363 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.274458646774292, "epoch": 0.9578947368421052, "grad_norm": 0.0019163885153830051, "learning_rate": 1e-06, "loss": -0.0012, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8514.0, "completions/max_terminated_length": 8514.0, "completions/mean_length": 2526.6328125, "completions/mean_terminated_length": 2526.6328125, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.27694442868232727, "epoch": 0.9605263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0018505159532651305, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 125219243.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0588514804840088, "sampling/importance_sampling_ratio/min": 0.004635735414922237, "sampling/sampling_logp_difference/max": 5.373960494995117, "sampling/sampling_logp_difference/mean": 0.1100192666053772, "step": 365 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2708867937326431, "epoch": 0.9631578947368421, "grad_norm": 0.0007901579374447465, "learning_rate": 1e-06, "loss": -0.0015, "step": 366 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27127164602279663, "epoch": 0.9657894736842105, "grad_norm": 0.002608968410640955, "learning_rate": 1e-06, "loss": 0.0046, "step": 367 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27101272344589233, "epoch": 0.968421052631579, "grad_norm": 0.001521129161119461, "learning_rate": 1e-06, "loss": -0.0016, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10071.0, "completions/max_terminated_length": 10071.0, "completions/mean_length": 2368.806640625, "completions/mean_terminated_length": 2368.806640625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.26611191034317017, "epoch": 0.9710526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017959439428523183, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 126838888.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0575342178344727, "sampling/importance_sampling_ratio/min": 0.0019048165995627642, "sampling/sampling_logp_difference/max": 6.263369560241699, "sampling/sampling_logp_difference/mean": 0.10833673179149628, "step": 369 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.26864004135131836, "epoch": 0.9736842105263158, "grad_norm": 0.001969138393178582, "learning_rate": 1e-06, "loss": -0.0021, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26397430896759033, "epoch": 0.9763157894736842, "grad_norm": 0.026864636689424515, "learning_rate": 1e-06, "loss": 0.0115, "step": 371 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.26963819563388824, "epoch": 0.9789473684210527, "grad_norm": 0.0014100598637014627, "learning_rate": 1e-06, "loss": -0.0028, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6822.0, "completions/max_terminated_length": 6822.0, "completions/mean_length": 2196.564453125, "completions/mean_terminated_length": 2196.564453125, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "entropy": 0.2688636928796768, "epoch": 0.9815789473684211, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016775439726188779, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 128361321.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0584321022033691, "sampling/importance_sampling_ratio/min": 0.004710399080067873, "sampling/sampling_logp_difference/max": 5.357982635498047, "sampling/sampling_logp_difference/mean": 0.10981157422065735, "step": 373 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.274547815322876, "epoch": 0.9842105263157894, "grad_norm": 0.001210524351336062, "learning_rate": 1e-06, "loss": -0.0017, "step": 374 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2660208195447922, "epoch": 0.9868421052631579, "grad_norm": 0.0016719524282962084, "learning_rate": 1e-06, "loss": -0.002, "step": 375 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27128584682941437, "epoch": 0.9894736842105263, "grad_norm": 0.019210757687687874, "learning_rate": 1e-06, "loss": 0.0045, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15169.0, "completions/max_terminated_length": 15169.0, "completions/mean_length": 2193.333984375, "completions/mean_terminated_length": 2193.333984375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.2721587121486664, "epoch": 0.9921052631578947, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0013233161298558116, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 129891924.0, "reward": 0.8984375, "reward_std": 0.0062500000931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9986979365348816, "rewards/symbolic_reward_partial_score/std": 0.0294627845287323, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0585943460464478, "sampling/importance_sampling_ratio/min": 0.002915969118475914, "sampling/sampling_logp_difference/max": 5.837553024291992, "sampling/sampling_logp_difference/mean": 0.11031664907932281, "step": 377 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2689424753189087, "epoch": 0.9947368421052631, "grad_norm": 0.0009457448613829911, "learning_rate": 1e-06, "loss": -0.0013, "step": 378 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2722350209951401, "epoch": 0.9973684210526316, "grad_norm": 0.0007782538887113333, "learning_rate": 1e-06, "loss": -0.0016, "step": 379 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2723027318716049, "epoch": 1.0, "grad_norm": 0.001802962739020586, "learning_rate": 1e-06, "loss": -0.0013, "step": 380 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00048828125, "eval_completions/max_length": 6804.09375, "eval_completions/max_terminated_length": 6673.8125, "eval_completions/mean_length": 1785.35693359375, "eval_completions/mean_terminated_length": 1778.2728424072266, "eval_completions/min_length": 520.0, "eval_completions/min_terminated_length": 520.0, "eval_entropy": 0.27594462037086487, "eval_frac_reward_zero_std": 0.9765625, "eval_loss": 0.0009819812839850783, "eval_num_tokens": 129891924.0, "eval_reward": 0.8987668994814157, "eval_reward_std": 0.003836137172754661, "eval_rewards/progression_diversity/mean": -2.2440133761847392e-05, "eval_rewards/progression_diversity/std": 0.0002538811240810901, "eval_rewards/symbolic_reward_accuracy/mean": 0.998291015625, "eval_rewards/symbolic_reward_accuracy/std": 0.014161451952531934, "eval_rewards/symbolic_reward_partial_score/mean": 0.9994710274040699, "eval_rewards/symbolic_reward_partial_score/std": 0.004156619019340724, "eval_rewards/tag_count_reward/mean": -0.00048828125, "eval_rewards/tag_count_reward/std": 0.003890840569511056, "eval_runtime": 3294.0903, "eval_samples_per_second": 0.076, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0596566386520863, "eval_sampling/importance_sampling_ratio/min": 0.010347805728088133, "eval_sampling/sampling_logp_difference/max": 4.666457310318947, "eval_sampling/sampling_logp_difference/mean": 0.11182636907324195, "eval_steps_per_second": 0.001, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11190.0, "completions/max_terminated_length": 11190.0, "completions/mean_length": 2478.263671875, "completions/mean_terminated_length": 2478.263671875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.2696339935064316, "epoch": 1.0026315789473683, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021782810799777508, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 131583131.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0590980052947998, "sampling/importance_sampling_ratio/min": 0.004538754932582378, "sampling/sampling_logp_difference/max": 5.395102500915527, "sampling/sampling_logp_difference/mean": 0.11058938503265381, "step": 381 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27230630815029144, "epoch": 1.0052631578947369, "grad_norm": 0.0011016832431778312, "learning_rate": 1e-06, "loss": -0.0019, "step": 382 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27085432410240173, "epoch": 1.0078947368421052, "grad_norm": 0.0021980858873575926, "learning_rate": 1e-06, "loss": -0.0025, "step": 383 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2782110571861267, "epoch": 1.0105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0031, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12789.0, "completions/max_terminated_length": 12789.0, "completions/mean_length": 2632.734375, "completions/mean_terminated_length": 2632.734375, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "entropy": 0.26923768222332, "epoch": 1.013157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003790569957345724, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 133345907.0, "reward": 0.89599609375, "reward_std": 0.012797415256500244, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0587010383605957, "sampling/importance_sampling_ratio/min": 0.007694330997765064, "sampling/sampling_logp_difference/max": 4.867271423339844, "sampling/sampling_logp_difference/mean": 0.11019694805145264, "step": 385 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.27567151188850403, "epoch": 1.0157894736842106, "grad_norm": 0.002176067791879177, "learning_rate": 1e-06, "loss": 0.0113, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27289701998233795, "epoch": 1.018421052631579, "grad_norm": 0.0036155260168015957, "learning_rate": 1e-06, "loss": -0.0046, "step": 387 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.27020932734012604, "epoch": 1.0210526315789474, "grad_norm": 0.029289431869983673, "learning_rate": 1e-06, "loss": 0.0317, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10453.0, "completions/max_terminated_length": 10453.0, "completions/mean_length": 2388.123046875, "completions/mean_terminated_length": 2388.123046875, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "entropy": 0.2755410969257355, "epoch": 1.0236842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 134960082.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0601894855499268, "sampling/importance_sampling_ratio/min": 0.0016817081486806273, "sampling/sampling_logp_difference/max": 6.387945175170898, "sampling/sampling_logp_difference/mean": 0.11273640394210815, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27650390565395355, "epoch": 1.0263157894736843, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28008225560188293, "epoch": 1.0289473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2785145789384842, "epoch": 1.0315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15523.0, "completions/max_terminated_length": 15523.0, "completions/mean_length": 2476.703125, "completions/mean_terminated_length": 2476.703125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.2723507732152939, "epoch": 1.0342105263157895, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0035871483851224184, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 136631962.0, "reward": 0.89599609375, "reward_std": 0.01601562649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0591776371002197, "sampling/importance_sampling_ratio/min": 0.0016026663361117244, "sampling/sampling_logp_difference/max": 6.436086654663086, "sampling/sampling_logp_difference/mean": 0.11075634509325027, "step": 393 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2732235938310623, "epoch": 1.0368421052631578, "grad_norm": 0.0025476044975221157, "learning_rate": 1e-06, "loss": 0.003, "step": 394 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2743808627128601, "epoch": 1.0394736842105263, "grad_norm": 0.04132957383990288, "learning_rate": 1e-06, "loss": 0.03, "step": 395 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2754429578781128, "epoch": 1.0421052631578946, "grad_norm": 0.0038282752502709627, "learning_rate": 1e-06, "loss": -0.0074, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7617.0, "completions/max_terminated_length": 7617.0, "completions/mean_length": 2224.314453125, "completions/mean_terminated_length": 2224.314453125, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "entropy": 0.27194325625896454, "epoch": 1.0447368421052632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 138171771.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059114933013916, "sampling/importance_sampling_ratio/min": 0.001705495873466134, "sampling/sampling_logp_difference/max": 6.373899459838867, "sampling/sampling_logp_difference/mean": 0.11046701669692993, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2725122720003128, "epoch": 1.0473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2705809473991394, "epoch": 1.05, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2737552225589752, "epoch": 1.0526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7558.0, "completions/max_terminated_length": 7558.0, "completions/mean_length": 2577.232421875, "completions/mean_terminated_length": 2577.232421875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.27660855650901794, "epoch": 1.055263157894737, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0024011079221963882, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 139885618.0, "reward": 0.8971680402755737, "reward_std": 0.01132812537252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9983724355697632, "rewards/symbolic_reward_partial_score/std": 0.026533395051956177, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0601541996002197, "sampling/importance_sampling_ratio/min": 0.004088182467967272, "sampling/sampling_logp_difference/max": 5.499654769897461, "sampling/sampling_logp_difference/mean": 0.11233454942703247, "step": 401 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2764456570148468, "epoch": 1.0578947368421052, "grad_norm": 0.02229779213666916, "learning_rate": 1e-06, "loss": 0.0113, "step": 402 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28018976747989655, "epoch": 1.0605263157894738, "grad_norm": 0.0016946468967944384, "learning_rate": 1e-06, "loss": -0.0027, "step": 403 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.274475559592247, "epoch": 1.063157894736842, "grad_norm": 0.0024285921826958656, "learning_rate": 1e-06, "loss": -0.0043, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8829.0, "completions/max_terminated_length": 8829.0, "completions/mean_length": 2668.482421875, "completions/mean_terminated_length": 2668.482421875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.2707562744617462, "epoch": 1.0657894736842106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 141666537.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058478593826294, "sampling/importance_sampling_ratio/min": 0.0013187599834054708, "sampling/sampling_logp_difference/max": 6.631063461303711, "sampling/sampling_logp_difference/mean": 0.10924874246120453, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27069367468357086, "epoch": 1.068421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2708383947610855, "epoch": 1.0710526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2638653963804245, "epoch": 1.0736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11984.0, "completions/max_terminated_length": 11984.0, "completions/mean_length": 2919.5078125, "completions/mean_terminated_length": 2919.5078125, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.27394281327724457, "epoch": 1.0763157894736841, "frac_reward_zero_std": 0.9375, "grad_norm": 0.02310675010085106, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 143566637.0, "reward": 0.897412121295929, "reward_std": 0.01035156287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9991861581802368, "rewards/symbolic_reward_partial_score/std": 0.013266698457300663, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0586392879486084, "sampling/importance_sampling_ratio/min": 0.005263695493340492, "sampling/sampling_logp_difference/max": 5.246922016143799, "sampling/sampling_logp_difference/mean": 0.10996603965759277, "step": 409 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2724384367465973, "epoch": 1.0789473684210527, "grad_norm": 0.001800637342967093, "learning_rate": 1e-06, "loss": -0.0061, "step": 410 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2698005735874176, "epoch": 1.081578947368421, "grad_norm": 0.0021911198273301125, "learning_rate": 1e-06, "loss": -0.0056, "step": 411 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2690558433532715, "epoch": 1.0842105263157895, "grad_norm": 0.027593424543738365, "learning_rate": 1e-06, "loss": 0.0075, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8554.0, "completions/max_terminated_length": 8554.0, "completions/mean_length": 2559.966796875, "completions/mean_terminated_length": 2559.966796875, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.2713516354560852, "epoch": 1.0868421052631578, "frac_reward_zero_std": 0.96875, "grad_norm": 0.011653747409582138, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 145287644.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0594277381896973, "sampling/importance_sampling_ratio/min": 0.006852603517472744, "sampling/sampling_logp_difference/max": 4.983126640319824, "sampling/sampling_logp_difference/mean": 0.11137549579143524, "step": 413 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2771768271923065, "epoch": 1.0894736842105264, "grad_norm": 0.0011253054253757, "learning_rate": 1e-06, "loss": -0.0014, "step": 414 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27702221274375916, "epoch": 1.0921052631578947, "grad_norm": 0.0014809310669079423, "learning_rate": 1e-06, "loss": -0.0013, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2784002870321274, "epoch": 1.0947368421052632, "grad_norm": 0.0015354289207607508, "learning_rate": 1e-06, "loss": -0.0022, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9950.0, "completions/max_terminated_length": 9950.0, "completions/mean_length": 2360.125, "completions/mean_terminated_length": 2360.125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.27533894777297974, "epoch": 1.0973684210526315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 146893980.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05975341796875, "sampling/importance_sampling_ratio/min": 0.002090906724333763, "sampling/sampling_logp_difference/max": 6.170157432556152, "sampling/sampling_logp_difference/mean": 0.11152775585651398, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27772340178489685, "epoch": 1.1, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27275168895721436, "epoch": 1.1026315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27689535915851593, "epoch": 1.1052631578947367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9654.0, "completions/max_terminated_length": 9654.0, "completions/mean_length": 2394.55859375, "completions/mean_terminated_length": 2394.55859375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.2810089439153671, "epoch": 1.1078947368421053, "frac_reward_zero_std": 0.96875, "grad_norm": 0.01735091581940651, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 148503450.0, "reward": 0.8970703482627869, "reward_std": 0.008005430921912193, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.03121940791606903, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609241724014282, "sampling/importance_sampling_ratio/min": 0.003710841527208686, "sampling/sampling_logp_difference/max": 5.59649658203125, "sampling/sampling_logp_difference/mean": 0.11337482184171677, "step": 421 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2789938300848007, "epoch": 1.1105263157894736, "grad_norm": 0.003588972147554159, "learning_rate": 1e-06, "loss": -0.0041, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27997761964797974, "epoch": 1.1131578947368421, "grad_norm": 0.0028675824869424105, "learning_rate": 1e-06, "loss": -0.0027, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2786268889904022, "epoch": 1.1157894736842104, "grad_norm": 0.00302469776943326, "learning_rate": 1e-06, "loss": 0.0052, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8525.0, "completions/max_terminated_length": 8525.0, "completions/mean_length": 2491.763671875, "completions/mean_terminated_length": 2491.763671875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 0.27415645122528076, "epoch": 1.118421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 150184385.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0598957538604736, "sampling/importance_sampling_ratio/min": 0.005324083846062422, "sampling/sampling_logp_difference/max": 5.2355146408081055, "sampling/sampling_logp_difference/mean": 0.11099953949451447, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2769442945718765, "epoch": 1.1210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27085237205028534, "epoch": 1.1236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27177655696868896, "epoch": 1.1263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11694.0, "completions/max_terminated_length": 11694.0, "completions/mean_length": 2519.423828125, "completions/mean_terminated_length": 2519.423828125, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.276354119181633, "epoch": 1.1289473684210527, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0040257154032588005, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 151857946.0, "reward": 0.8954590559005737, "reward_std": 0.014120899140834808, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99658203125, "rewards/symbolic_reward_partial_score/std": 0.050564687699079514, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0611881017684937, "sampling/importance_sampling_ratio/min": 0.004735460504889488, "sampling/sampling_logp_difference/max": 5.3526763916015625, "sampling/sampling_logp_difference/mean": 0.11390456557273865, "step": 429 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28108280897140503, "epoch": 1.131578947368421, "grad_norm": 0.0031915975268930197, "learning_rate": 1e-06, "loss": -0.0059, "step": 430 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2809026688337326, "epoch": 1.1342105263157896, "grad_norm": 0.0033858264796435833, "learning_rate": 1e-06, "loss": -0.0051, "step": 431 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2850513160228729, "epoch": 1.1368421052631579, "grad_norm": 0.0030346557032316923, "learning_rate": 1e-06, "loss": 0.0103, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8445.0, "completions/max_terminated_length": 8445.0, "completions/mean_length": 2323.69140625, "completions/mean_terminated_length": 2323.69140625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.28349485993385315, "epoch": 1.1394736842105262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 153432956.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0624750852584839, "sampling/importance_sampling_ratio/min": 0.0017669523367658257, "sampling/sampling_logp_difference/max": 6.338499069213867, "sampling/sampling_logp_difference/mean": 0.11573706567287445, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2890060096979141, "epoch": 1.1421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2869379222393036, "epoch": 1.1447368421052633, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2835257053375244, "epoch": 1.1473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12955.0, "completions/mean_length": 2845.474609375, "completions/mean_terminated_length": 2792.382568359375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.2841552346944809, "epoch": 1.15, "frac_reward_zero_std": 0.9375, "grad_norm": 0.020851343870162964, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 155279471.0, "reward": 0.8956055045127869, "reward_std": 0.013065746054053307, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.02205861359834671, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061306357383728, "sampling/importance_sampling_ratio/min": 0.004384350962936878, "sampling/sampling_logp_difference/max": 5.429713726043701, "sampling/sampling_logp_difference/mean": 0.11378782242536545, "step": 437 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2806518077850342, "epoch": 1.1526315789473685, "grad_norm": 0.0024340874515473843, "learning_rate": 1e-06, "loss": 0.0112, "step": 438 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2841126322746277, "epoch": 1.1552631578947368, "grad_norm": 0.0109141506254673, "learning_rate": 1e-06, "loss": -0.0014, "step": 439 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2834381014108658, "epoch": 1.1578947368421053, "grad_norm": 0.001393682323396206, "learning_rate": 1e-06, "loss": -0.0034, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6951.0, "completions/max_terminated_length": 6951.0, "completions/mean_length": 2337.494140625, "completions/mean_terminated_length": 2337.494140625, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.2747457027435303, "epoch": 1.1605263157894736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 156903116.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600950717926025, "sampling/importance_sampling_ratio/min": 0.0014589769998565316, "sampling/sampling_logp_difference/max": 6.530019760131836, "sampling/sampling_logp_difference/mean": 0.11164205521345139, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2758783847093582, "epoch": 1.1631578947368422, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27189917862415314, "epoch": 1.1657894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27697035670280457, "epoch": 1.168421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7579.0, "completions/max_terminated_length": 7579.0, "completions/mean_length": 2509.4921875, "completions/mean_terminated_length": 2509.4921875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "entropy": 0.2784166634082794, "epoch": 1.1710526315789473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 158582440.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0605262517929077, "sampling/importance_sampling_ratio/min": 0.00345178646966815, "sampling/sampling_logp_difference/max": 5.668863296508789, "sampling/sampling_logp_difference/mean": 0.11265573650598526, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2815673053264618, "epoch": 1.1736842105263159, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27738696336746216, "epoch": 1.1763157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2767847776412964, "epoch": 1.1789473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7679.0, "completions/max_terminated_length": 7679.0, "completions/mean_length": 2524.8203125, "completions/mean_terminated_length": 2524.8203125, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "entropy": 0.2820829302072525, "epoch": 1.181578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 160282444.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0618802309036255, "sampling/importance_sampling_ratio/min": 5.3814914281247184e-05, "sampling/sampling_logp_difference/max": 9.829959869384766, "sampling/sampling_logp_difference/mean": 0.11452903598546982, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27996405959129333, "epoch": 1.1842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28499583899974823, "epoch": 1.186842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2814007103443146, "epoch": 1.1894736842105262, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9028.0, "completions/max_terminated_length": 9028.0, "completions/mean_length": 2743.072265625, "completions/mean_terminated_length": 2743.072265625, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.28498663008213043, "epoch": 1.1921052631578948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 162095217.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0614476203918457, "sampling/importance_sampling_ratio/min": 0.00492549454793334, "sampling/sampling_logp_difference/max": 5.31333065032959, "sampling/sampling_logp_difference/mean": 0.11436395347118378, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28602689504623413, "epoch": 1.194736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2808579057455063, "epoch": 1.1973684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2764871418476105, "epoch": 1.2, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8113.0, "completions/max_terminated_length": 8113.0, "completions/mean_length": 2667.134765625, "completions/mean_terminated_length": 2667.134765625, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "entropy": 0.2816062271595001, "epoch": 1.2026315789473685, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001902776537463069, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 163877430.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0612181425094604, "sampling/importance_sampling_ratio/min": 0.002987224142998457, "sampling/sampling_logp_difference/max": 5.813410758972168, "sampling/sampling_logp_difference/mean": 0.11340761184692383, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2775525152683258, "epoch": 1.2052631578947368, "grad_norm": 0.0017164315795525908, "learning_rate": 1e-06, "loss": -0.001, "step": 458 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2844899892807007, "epoch": 1.2078947368421054, "grad_norm": 0.0007408350938931108, "learning_rate": 1e-06, "loss": -0.0012, "step": 459 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27909520268440247, "epoch": 1.2105263157894737, "grad_norm": 0.0013833829434588552, "learning_rate": 1e-06, "loss": -0.0013, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14522.0, "completions/max_terminated_length": 14522.0, "completions/mean_length": 2442.359375, "completions/mean_terminated_length": 2442.359375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.2670242190361023, "epoch": 1.2131578947368422, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004239482339471579, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 165545902.0, "reward": 0.89599609375, "reward_std": 0.012797415256500244, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600000619888306, "sampling/importance_sampling_ratio/min": 0.003759420942515135, "sampling/sampling_logp_difference/max": 5.583490371704102, "sampling/sampling_logp_difference/mean": 0.11157608032226562, "step": 461 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2760328948497772, "epoch": 1.2157894736842105, "grad_norm": 0.002046706387773156, "learning_rate": 1e-06, "loss": -0.0028, "step": 462 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2781810760498047, "epoch": 1.2184210526315788, "grad_norm": 0.022850554436445236, "learning_rate": 1e-06, "loss": 0.022, "step": 463 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2733093500137329, "epoch": 1.2210526315789474, "grad_norm": 0.0027880375273525715, "learning_rate": 1e-06, "loss": 0.016, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9999.0, "completions/max_terminated_length": 9999.0, "completions/mean_length": 2639.869140625, "completions/mean_terminated_length": 2639.869140625, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.281954750418663, "epoch": 1.2236842105263157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 167301483.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0615510940551758, "sampling/importance_sampling_ratio/min": 0.0061265043914318085, "sampling/sampling_logp_difference/max": 5.095130920410156, "sampling/sampling_logp_difference/mean": 0.11390291154384613, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28203193843364716, "epoch": 1.2263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2877836972475052, "epoch": 1.2289473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28174304962158203, "epoch": 1.231578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8424.0, "completions/max_terminated_length": 8424.0, "completions/mean_length": 2653.875, "completions/mean_terminated_length": 2653.875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 0.2743367850780487, "epoch": 1.2342105263157894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 169093451.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0606870651245117, "sampling/importance_sampling_ratio/min": 0.0023666657507419586, "sampling/sampling_logp_difference/max": 6.046273231506348, "sampling/sampling_logp_difference/mean": 0.11223700642585754, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2774965465068817, "epoch": 1.236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27688202261924744, "epoch": 1.2394736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28171488642692566, "epoch": 1.2421052631578948, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14300.0, "completions/max_terminated_length": 14300.0, "completions/mean_length": 2640.60546875, "completions/mean_terminated_length": 2640.60546875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.2770412266254425, "epoch": 1.2447368421052631, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023528176825493574, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 170852257.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059995412826538, "sampling/importance_sampling_ratio/min": 0.0034695856738835573, "sampling/sampling_logp_difference/max": 5.66372013092041, "sampling/sampling_logp_difference/mean": 0.11187373846769333, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27555151283741, "epoch": 1.2473684210526317, "grad_norm": 0.002396463416516781, "learning_rate": 1e-06, "loss": -0.003, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2751545011997223, "epoch": 1.25, "grad_norm": 0.0025512168649584055, "learning_rate": 1e-06, "loss": -0.0037, "step": 475 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27497801184654236, "epoch": 1.2526315789473683, "grad_norm": 0.0016162459505721927, "learning_rate": 1e-06, "loss": 0.008, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9699.0, "completions/max_terminated_length": 9699.0, "completions/mean_length": 2468.681640625, "completions/mean_terminated_length": 2468.681640625, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 0.28437162935733795, "epoch": 1.2552631578947369, "frac_reward_zero_std": 0.90625, "grad_norm": 0.003380117006599903, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 172507998.0, "reward": 0.8956055045127869, "reward_std": 0.01757812686264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.038198307156562805, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0626821517944336, "sampling/importance_sampling_ratio/min": 0.0009619927732273936, "sampling/sampling_logp_difference/max": 6.946503639221191, "sampling/sampling_logp_difference/mean": 0.11581763625144958, "step": 477 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28655223548412323, "epoch": 1.2578947368421054, "grad_norm": 0.0023962240666151047, "learning_rate": 1e-06, "loss": 0.0087, "step": 478 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28648948669433594, "epoch": 1.2605263157894737, "grad_norm": 0.0027322375681251287, "learning_rate": 1e-06, "loss": -0.0046, "step": 479 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.28735925257205963, "epoch": 1.263157894736842, "grad_norm": 0.002800721675157547, "learning_rate": 1e-06, "loss": -0.0043, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9964.0, "completions/max_terminated_length": 9964.0, "completions/mean_length": 2686.6484375, "completions/mean_terminated_length": 2686.6484375, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "entropy": 0.28087250888347626, "epoch": 1.2657894736842106, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023496008943766356, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 174287530.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061124563217163, "sampling/importance_sampling_ratio/min": 0.006342970300465822, "sampling/sampling_logp_difference/max": 5.060408115386963, "sampling/sampling_logp_difference/mean": 0.11321946978569031, "step": 481 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2758321464061737, "epoch": 1.268421052631579, "grad_norm": 0.002617916092276573, "learning_rate": 1e-06, "loss": -0.0036, "step": 482 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27740998566150665, "epoch": 1.2710526315789474, "grad_norm": 0.0022106911055743694, "learning_rate": 1e-06, "loss": -0.0035, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28228427469730377, "epoch": 1.2736842105263158, "grad_norm": 0.002715718001127243, "learning_rate": 1e-06, "loss": 0.012, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13338.0, "completions/max_terminated_length": 13338.0, "completions/mean_length": 2429.994140625, "completions/mean_terminated_length": 2429.994140625, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "entropy": 0.2907012850046158, "epoch": 1.2763157894736843, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0018957426073029637, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 175913959.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0628280639648438, "sampling/importance_sampling_ratio/min": 0.0052523077465593815, "sampling/sampling_logp_difference/max": 5.249087810516357, "sampling/sampling_logp_difference/mean": 0.11685489863157272, "step": 485 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28757444024086, "epoch": 1.2789473684210526, "grad_norm": 0.0019598272629082203, "learning_rate": 1e-06, "loss": 0.0048, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2879467010498047, "epoch": 1.2815789473684212, "grad_norm": 0.0018528875662013888, "learning_rate": 1e-06, "loss": -0.0014, "step": 487 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.287949874997139, "epoch": 1.2842105263157895, "grad_norm": 0.001262790523469448, "learning_rate": 1e-06, "loss": -0.0017, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9274.0, "completions/max_terminated_length": 9274.0, "completions/mean_length": 2448.8828125, "completions/mean_terminated_length": 2448.8828125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.2807082086801529, "epoch": 1.2868421052631578, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0029067937284708023, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 177568267.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0616849660873413, "sampling/importance_sampling_ratio/min": 0.0026855815667659044, "sampling/sampling_logp_difference/max": 5.919857978820801, "sampling/sampling_logp_difference/mean": 0.11421144753694534, "step": 489 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2836399972438812, "epoch": 1.2894736842105263, "grad_norm": 0.002467453945428133, "learning_rate": 1e-06, "loss": 0.0095, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28005798161029816, "epoch": 1.2921052631578949, "grad_norm": 0.002244848059490323, "learning_rate": 1e-06, "loss": -0.0027, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2828148454427719, "epoch": 1.2947368421052632, "grad_norm": 0.002484746277332306, "learning_rate": 1e-06, "loss": -0.0027, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13885.0, "completions/max_terminated_length": 13885.0, "completions/mean_length": 2793.986328125, "completions/mean_terminated_length": 2793.986328125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.2845616787672043, "epoch": 1.2973684210526315, "frac_reward_zero_std": 0.875, "grad_norm": 0.006213765125721693, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 179405764.0, "reward": 0.8916992545127869, "reward_std": 0.026417125016450882, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.041818004101514816, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062483549118042, "sampling/importance_sampling_ratio/min": 0.00047210551565513015, "sampling/sampling_logp_difference/max": 7.658308029174805, "sampling/sampling_logp_difference/mean": 0.11538608372211456, "step": 493 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28666482865810394, "epoch": 1.3, "grad_norm": 0.0324806272983551, "learning_rate": 1e-06, "loss": 0.0198, "step": 494 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28866955637931824, "epoch": 1.3026315789473684, "grad_norm": 0.022287368774414062, "learning_rate": 1e-06, "loss": -0.0037, "step": 495 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28040313720703125, "epoch": 1.305263157894737, "grad_norm": 0.01724497228860855, "learning_rate": 1e-06, "loss": -0.005, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7579.0, "completions/mean_length": 2827.302734375, "completions/mean_terminated_length": 2800.77294921875, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.2923204004764557, "epoch": 1.3078947368421052, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002580299274995923, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 181255647.0, "reward": 0.8983398675918579, "reward_std": 0.0066406261175870895, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0639028549194336, "sampling/importance_sampling_ratio/min": 0.003443553578108549, "sampling/sampling_logp_difference/max": 5.67125129699707, "sampling/sampling_logp_difference/mean": 0.11685192584991455, "step": 497 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2928921580314636, "epoch": 1.3105263157894738, "grad_norm": 0.002244093921035528, "learning_rate": 1e-06, "loss": 0.0301, "step": 498 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2899506986141205, "epoch": 1.313157894736842, "grad_norm": 0.0024237188044935465, "learning_rate": 1e-06, "loss": -0.0028, "step": 499 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.294723778963089, "epoch": 1.3157894736842106, "grad_norm": 0.001790754497051239, "learning_rate": 1e-06, "loss": -0.0024, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8561.0, "completions/max_terminated_length": 8561.0, "completions/mean_length": 2586.064453125, "completions/mean_terminated_length": 2586.064453125, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 0.29522904753685, "epoch": 1.318421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0018887402256950736, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 182983520.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0631163120269775, "sampling/importance_sampling_ratio/min": 0.002711938926950097, "sampling/sampling_logp_difference/max": 5.910091400146484, "sampling/sampling_logp_difference/mean": 0.1166718378663063, "step": 501 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28757698833942413, "epoch": 1.3210526315789473, "grad_norm": 0.0015428501646965742, "learning_rate": 1e-06, "loss": -0.0018, "step": 502 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29205331206321716, "epoch": 1.3236842105263158, "grad_norm": 0.0018179051112383604, "learning_rate": 1e-06, "loss": -0.0016, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2899082452058792, "epoch": 1.3263157894736843, "grad_norm": 0.001933448831550777, "learning_rate": 1e-06, "loss": -0.0014, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8693.0, "completions/max_terminated_length": 8693.0, "completions/mean_length": 2777.216796875, "completions/mean_terminated_length": 2777.216796875, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.28895828127861023, "epoch": 1.3289473684210527, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0037044554483145475, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 184842127.0, "reward": 0.8958008289337158, "reward_std": 0.01679687574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.03205668181180954, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062931776046753, "sampling/importance_sampling_ratio/min": 0.0019438152667135, "sampling/sampling_logp_difference/max": 6.243102550506592, "sampling/sampling_logp_difference/mean": 0.11610420048236847, "step": 505 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.28731192648410797, "epoch": 1.331578947368421, "grad_norm": 0.0023044426925480366, "learning_rate": 1e-06, "loss": -0.0024, "step": 506 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2895679622888565, "epoch": 1.3342105263157895, "grad_norm": 0.0030420811381191015, "learning_rate": 1e-06, "loss": -0.0019, "step": 507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2895590662956238, "epoch": 1.3368421052631578, "grad_norm": 0.002375815762206912, "learning_rate": 1e-06, "loss": 0.0062, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8394.0, "completions/max_terminated_length": 8394.0, "completions/mean_length": 2718.5859375, "completions/mean_terminated_length": 2718.5859375, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "entropy": 0.29500146210193634, "epoch": 1.3394736842105264, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021614436991512775, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 186634843.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0652493238449097, "sampling/importance_sampling_ratio/min": 0.003036715090274811, "sampling/sampling_logp_difference/max": 5.796978950500488, "sampling/sampling_logp_difference/mean": 0.11945419013500214, "step": 509 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29637689888477325, "epoch": 1.3421052631578947, "grad_norm": 0.0013739810092374682, "learning_rate": 1e-06, "loss": -0.0018, "step": 510 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29799486696720123, "epoch": 1.3447368421052632, "grad_norm": 0.001422446803189814, "learning_rate": 1e-06, "loss": 0.0099, "step": 511 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2994650602340698, "epoch": 1.3473684210526315, "grad_norm": 0.0012910488294437528, "learning_rate": 1e-06, "loss": -0.001, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9282.0, "completions/max_terminated_length": 9282.0, "completions/mean_length": 2966.93359375, "completions/mean_terminated_length": 2966.93359375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.30382074415683746, "epoch": 1.35, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004820153117179871, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 188536185.0, "reward": 0.8935546875, "reward_std": 0.017157409340143204, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.994140625, "rewards/symbolic_reward_partial_score/std": 0.06969913095235825, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0672197341918945, "sampling/importance_sampling_ratio/min": 0.0033419285900890827, "sampling/sampling_logp_difference/max": 5.701207160949707, "sampling/sampling_logp_difference/mean": 0.12278600037097931, "step": 513 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3057977557182312, "epoch": 1.3526315789473684, "grad_norm": 0.01580539159476757, "learning_rate": 1e-06, "loss": 0.0019, "step": 514 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31199172139167786, "epoch": 1.3552631578947367, "grad_norm": 0.0020922357216477394, "learning_rate": 1e-06, "loss": -0.0002, "step": 515 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31119248270988464, "epoch": 1.3578947368421053, "grad_norm": 0.016849294304847717, "learning_rate": 1e-06, "loss": 0.0142, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8227.0, "completions/max_terminated_length": 8227.0, "completions/mean_length": 2764.107421875, "completions/mean_terminated_length": 2764.107421875, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "entropy": 0.30137281119823456, "epoch": 1.3605263157894738, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019240904366597533, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 190345712.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0664430856704712, "sampling/importance_sampling_ratio/min": 0.0019446952501311898, "sampling/sampling_logp_difference/max": 6.242650032043457, "sampling/sampling_logp_difference/mean": 0.12135472893714905, "step": 517 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30609750747680664, "epoch": 1.3631578947368421, "grad_norm": 0.0019408866064622998, "learning_rate": 1e-06, "loss": -0.0015, "step": 518 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30156731605529785, "epoch": 1.3657894736842104, "grad_norm": 0.0014759511686861515, "learning_rate": 1e-06, "loss": -0.0014, "step": 519 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3068056106567383, "epoch": 1.368421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0052, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10992.0, "completions/max_terminated_length": 10992.0, "completions/mean_length": 3066.171875, "completions/mean_terminated_length": 3066.171875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.3038928359746933, "epoch": 1.3710526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.005096717271953821, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 192313224.0, "reward": 0.89599609375, "reward_std": 0.008633313700556755, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0668981075286865, "sampling/importance_sampling_ratio/min": 0.0017107150051742792, "sampling/sampling_logp_difference/max": 6.370843887329102, "sampling/sampling_logp_difference/mean": 0.12223061174154282, "step": 521 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3054998815059662, "epoch": 1.3736842105263158, "grad_norm": 0.003519161371514201, "learning_rate": 1e-06, "loss": -0.0001, "step": 522 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3063057065010071, "epoch": 1.3763157894736842, "grad_norm": 0.003637876594439149, "learning_rate": 1e-06, "loss": -0.0068, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3064233660697937, "epoch": 1.3789473684210527, "grad_norm": 0.01453381311148405, "learning_rate": 1e-06, "loss": 0.0032, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9371.0, "completions/mean_length": 3110.828125, "completions/mean_terminated_length": 3084.853271484375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "entropy": 0.30350251495838165, "epoch": 1.381578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019116182811558247, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 194330288.0, "reward": 0.8998047113418579, "reward_std": 0.0007812501862645149, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0664637088775635, "sampling/importance_sampling_ratio/min": 0.0036207668017596006, "sampling/sampling_logp_difference/max": 5.621069431304932, "sampling/sampling_logp_difference/mean": 0.12134721875190735, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30916810035705566, "epoch": 1.3842105263157896, "grad_norm": 0.0016269630286842585, "learning_rate": 1e-06, "loss": -0.0016, "step": 526 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3063466101884842, "epoch": 1.3868421052631579, "grad_norm": 0.0020754123106598854, "learning_rate": 1e-06, "loss": -0.0021, "step": 527 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3053359389305115, "epoch": 1.3894736842105262, "grad_norm": 0.03356393426656723, "learning_rate": 1e-06, "loss": 0.0293, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10903.0, "completions/max_terminated_length": 10903.0, "completions/mean_length": 3395.791015625, "completions/mean_terminated_length": 3395.791015625, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "entropy": 0.30794358253479004, "epoch": 1.3921052631578947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003206896362826228, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 196484773.0, "reward": 0.8972656726837158, "reward_std": 0.01093750074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9986978769302368, "rewards/symbolic_reward_partial_score/std": 0.023278694599866867, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0677857398986816, "sampling/importance_sampling_ratio/min": 0.002018261468037963, "sampling/sampling_logp_difference/max": 6.20551872253418, "sampling/sampling_logp_difference/mean": 0.12277509272098541, "step": 529 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3103633373975754, "epoch": 1.3947368421052633, "grad_norm": 0.0024154309649020433, "learning_rate": 1e-06, "loss": 0.0166, "step": 530 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3153703063726425, "epoch": 1.3973684210526316, "grad_norm": 0.015842389315366745, "learning_rate": 1e-06, "loss": -0.0001, "step": 531 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30964767932891846, "epoch": 1.4, "grad_norm": 0.0032626211177557707, "learning_rate": 1e-06, "loss": -0.0056, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11778.0, "completions/max_terminated_length": 11778.0, "completions/mean_length": 3217.375, "completions/mean_terminated_length": 3217.375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "entropy": 0.3212677985429764, "epoch": 1.4026315789473685, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0038719417061656713, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 198543877.0, "reward": 0.8956055045127869, "reward_std": 0.017578125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.038198307156562805, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0691947937011719, "sampling/importance_sampling_ratio/min": 0.003097565844655037, "sampling/sampling_logp_difference/max": 5.777138710021973, "sampling/sampling_logp_difference/mean": 0.1256110817193985, "step": 533 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31328117847442627, "epoch": 1.4052631578947368, "grad_norm": 0.02439163438975811, "learning_rate": 1e-06, "loss": 0.0036, "step": 534 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.31977179646492004, "epoch": 1.4078947368421053, "grad_norm": 0.0023795654997229576, "learning_rate": 1e-06, "loss": 0.0076, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31871090829372406, "epoch": 1.4105263157894736, "grad_norm": 0.002489138161763549, "learning_rate": 1e-06, "loss": 0.0055, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10252.0, "completions/max_terminated_length": 10252.0, "completions/mean_length": 3165.794921875, "completions/mean_terminated_length": 3165.794921875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.32612091302871704, "epoch": 1.4131578947368422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 200556540.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.072479486465454, "sampling/importance_sampling_ratio/min": 0.0008607304189354181, "sampling/sampling_logp_difference/max": 7.057729244232178, "sampling/sampling_logp_difference/mean": 0.1306590884923935, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3371126800775528, "epoch": 1.4157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3325609713792801, "epoch": 1.418421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33244629204273224, "epoch": 1.4210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12630.0, "completions/max_terminated_length": 12630.0, "completions/mean_length": 2772.06640625, "completions/mean_terminated_length": 2772.06640625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.3347853571176529, "epoch": 1.4236842105263157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 202370622.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0723438262939453, "sampling/importance_sampling_ratio/min": 0.002322677057236433, "sampling/sampling_logp_difference/max": 6.065034866333008, "sampling/sampling_logp_difference/mean": 0.13011279702186584, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33342115581035614, "epoch": 1.4263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3246258795261383, "epoch": 1.4289473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3314095586538315, "epoch": 1.431578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9095.0, "completions/max_terminated_length": 9095.0, "completions/mean_length": 2907.65234375, "completions/mean_terminated_length": 2907.65234375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.3414568305015564, "epoch": 1.4342105263157894, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021175374276936054, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 204238284.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0743600130081177, "sampling/importance_sampling_ratio/min": 0.0015632578870281577, "sampling/sampling_logp_difference/max": 6.4609832763671875, "sampling/sampling_logp_difference/mean": 0.13287842273712158, "step": 545 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3357093930244446, "epoch": 1.436842105263158, "grad_norm": 0.001350675243884325, "learning_rate": 1e-06, "loss": 0.0155, "step": 546 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33649541437625885, "epoch": 1.4394736842105262, "grad_norm": 0.0015491022495552897, "learning_rate": 1e-06, "loss": -0.0019, "step": 547 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34173116087913513, "epoch": 1.4421052631578948, "grad_norm": 0.0010022064670920372, "learning_rate": 1e-06, "loss": -0.0016, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9493.0, "completions/max_terminated_length": 9493.0, "completions/mean_length": 3305.783203125, "completions/mean_terminated_length": 3305.783203125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.3298810124397278, "epoch": 1.444736842105263, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002437557326629758, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 206360861.0, "reward": 0.8970703482627869, "reward_std": 0.01171875, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.03121940791606903, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0723556280136108, "sampling/importance_sampling_ratio/min": 0.0022042400669306517, "sampling/sampling_logp_difference/max": 6.117372512817383, "sampling/sampling_logp_difference/mean": 0.1293145716190338, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3315814286470413, "epoch": 1.4473684210526316, "grad_norm": 0.0027905437164008617, "learning_rate": 1e-06, "loss": -0.0029, "step": 550 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3288280963897705, "epoch": 1.45, "grad_norm": 0.0008437609649263322, "learning_rate": 1e-06, "loss": 0.0138, "step": 551 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3283464014530182, "epoch": 1.4526315789473685, "grad_norm": 0.0019002691842615604, "learning_rate": 1e-06, "loss": -0.003, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14233.0, "completions/max_terminated_length": 14233.0, "completions/mean_length": 2983.265625, "completions/mean_terminated_length": 2983.265625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.34424005448818207, "epoch": 1.4552631578947368, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0026063474360853434, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 208285925.0, "reward": 0.8970703482627869, "reward_std": 0.011718750931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.03121940791606903, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074610948562622, "sampling/importance_sampling_ratio/min": 0.001703870017081499, "sampling/sampling_logp_difference/max": 6.374853134155273, "sampling/sampling_logp_difference/mean": 0.13378797471523285, "step": 553 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3416339308023453, "epoch": 1.4578947368421051, "grad_norm": 0.002811652608215809, "learning_rate": 1e-06, "loss": -0.0032, "step": 554 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3438931852579117, "epoch": 1.4605263157894737, "grad_norm": 0.001903617288917303, "learning_rate": 1e-06, "loss": -0.0028, "step": 555 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33544501662254333, "epoch": 1.4631578947368422, "grad_norm": 0.0029551892075687647, "learning_rate": 1e-06, "loss": -0.0044, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10386.0, "completions/max_terminated_length": 10386.0, "completions/mean_length": 3202.09765625, "completions/mean_terminated_length": 3202.09765625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 0.34248383343219757, "epoch": 1.4657894736842105, "frac_reward_zero_std": 0.84375, "grad_norm": 0.006875012069940567, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 210318519.0, "reward": 0.8882812857627869, "reward_std": 0.0350315123796463, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.062070440500974655, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0747203826904297, "sampling/importance_sampling_ratio/min": 0.004089592024683952, "sampling/sampling_logp_difference/max": 5.49931001663208, "sampling/sampling_logp_difference/mean": 0.13306158781051636, "step": 557 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.339934766292572, "epoch": 1.4684210526315788, "grad_norm": 0.007019612472504377, "learning_rate": 1e-06, "loss": -0.0123, "step": 558 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33450521528720856, "epoch": 1.4710526315789474, "grad_norm": 0.006292671896517277, "learning_rate": 1e-06, "loss": -0.0076, "step": 559 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3440406769514084, "epoch": 1.4736842105263157, "grad_norm": 0.03406362235546112, "learning_rate": 1e-06, "loss": 0.0321, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9966.0, "completions/max_terminated_length": 9966.0, "completions/mean_length": 3291.5859375, "completions/mean_terminated_length": 3291.5859375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.33509014546871185, "epoch": 1.4763157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002940047299489379, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 212401283.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0741732120513916, "sampling/importance_sampling_ratio/min": 0.0023123137652873993, "sampling/sampling_logp_difference/max": 6.069506645202637, "sampling/sampling_logp_difference/mean": 0.132564976811409, "step": 561 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.339825302362442, "epoch": 1.4789473684210526, "grad_norm": 0.003128638956695795, "learning_rate": 1e-06, "loss": -0.0034, "step": 562 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3358955830335617, "epoch": 1.481578947368421, "grad_norm": 0.0021051180083304644, "learning_rate": 1e-06, "loss": -0.0039, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34036052227020264, "epoch": 1.4842105263157894, "grad_norm": 0.0035713145043700933, "learning_rate": 1e-06, "loss": 0.0167, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10341.0, "completions/max_terminated_length": 10341.0, "completions/mean_length": 3385.42578125, "completions/mean_terminated_length": 3385.42578125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.33899839222431183, "epoch": 1.486842105263158, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004635068587958813, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 214538749.0, "reward": 0.8931640982627869, "reward_std": 0.020123392343521118, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9967447519302368, "rewards/symbolic_reward_partial_score/std": 0.03597240895032883, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0741487741470337, "sampling/importance_sampling_ratio/min": 0.0013679899275302887, "sampling/sampling_logp_difference/max": 6.594412803649902, "sampling/sampling_logp_difference/mean": 0.13286757469177246, "step": 565 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3365507125854492, "epoch": 1.4894736842105263, "grad_norm": 0.02782021090388298, "learning_rate": 1e-06, "loss": 0.0081, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34424707293510437, "epoch": 1.4921052631578946, "grad_norm": 0.0049812812358140945, "learning_rate": 1e-06, "loss": -0.0081, "step": 567 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3341289311647415, "epoch": 1.4947368421052631, "grad_norm": 0.002292440040037036, "learning_rate": 1e-06, "loss": 0.0017, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12403.0, "completions/max_terminated_length": 12403.0, "completions/mean_length": 3555.328125, "completions/mean_terminated_length": 3555.328125, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.3392559736967087, "epoch": 1.4973684210526317, "frac_reward_zero_std": 0.84375, "grad_norm": 0.03446955978870392, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 216775077.0, "reward": 0.8941400051116943, "reward_std": 0.023439956828951836, "rewards/progression_diversity/mean": -6.141091580502689e-05, "rewards/progression_diversity/std": 0.001389570184983313, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.04406425356864929, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0739307403564453, "sampling/importance_sampling_ratio/min": 0.003255876014009118, "sampling/sampling_logp_difference/max": 5.727293968200684, "sampling/sampling_logp_difference/mean": 0.13247054815292358, "step": 569 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.343242347240448, "epoch": 1.5, "grad_norm": 0.003873140085488558, "learning_rate": 1e-06, "loss": 0.0046, "step": 570 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.33468693494796753, "epoch": 1.5026315789473683, "grad_norm": 0.009148257784545422, "learning_rate": 1e-06, "loss": 0.0007, "step": 571 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34035204350948334, "epoch": 1.5052631578947369, "grad_norm": 0.003910501021891832, "learning_rate": 1e-06, "loss": -0.0108, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12896.0, "completions/max_terminated_length": 12896.0, "completions/mean_length": 3400.837890625, "completions/mean_terminated_length": 3400.837890625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.33666369318962097, "epoch": 1.5078947368421054, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003032868728041649, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 218914098.0, "reward": 0.8971680402755737, "reward_std": 0.01132812537252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9983724355697632, "rewards/symbolic_reward_partial_score/std": 0.026533395051956177, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074427604675293, "sampling/importance_sampling_ratio/min": 0.0018630468985065818, "sampling/sampling_logp_difference/max": 6.285542011260986, "sampling/sampling_logp_difference/mean": 0.13294216990470886, "step": 573 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33754871785640717, "epoch": 1.5105263157894737, "grad_norm": 0.0022206089925020933, "learning_rate": 1e-06, "loss": -0.0062, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3425910621881485, "epoch": 1.513157894736842, "grad_norm": 0.0033709751442074776, "learning_rate": 1e-06, "loss": -0.0057, "step": 575 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33938461542129517, "epoch": 1.5157894736842106, "grad_norm": 0.002217524219304323, "learning_rate": 1e-06, "loss": -0.0057, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10453.0, "completions/max_terminated_length": 10453.0, "completions/mean_length": 3602.17578125, "completions/mean_terminated_length": 3602.17578125, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "entropy": 0.33259499073028564, "epoch": 1.518421052631579, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0055699474178254604, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 221185260.0, "reward": 0.8941406607627869, "reward_std": 0.015307335183024406, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.04406425356864929, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0728031396865845, "sampling/importance_sampling_ratio/min": 0.0003546196676325053, "sampling/sampling_logp_difference/max": 7.944464683532715, "sampling/sampling_logp_difference/mean": 0.13009676337242126, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33135366439819336, "epoch": 1.5210526315789474, "grad_norm": 0.004805116914212704, "learning_rate": 1e-06, "loss": -0.0085, "step": 578 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3311533033847809, "epoch": 1.5236842105263158, "grad_norm": 0.005762310698628426, "learning_rate": 1e-06, "loss": -0.0019, "step": 579 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3338763266801834, "epoch": 1.526315789473684, "grad_norm": 0.013107489794492722, "learning_rate": 1e-06, "loss": -0.0018, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10354.0, "completions/max_terminated_length": 10354.0, "completions/mean_length": 3139.427734375, "completions/mean_terminated_length": 3139.427734375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.341913104057312, "epoch": 1.5289473684210526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 223187431.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0753002166748047, "sampling/importance_sampling_ratio/min": 0.0029896635096520185, "sampling/sampling_logp_difference/max": 5.812594413757324, "sampling/sampling_logp_difference/mean": 0.13463395833969116, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34040696918964386, "epoch": 1.5315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34305983781814575, "epoch": 1.5342105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34769493341445923, "epoch": 1.5368421052631578, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8713.0, "completions/max_terminated_length": 8713.0, "completions/mean_length": 3208.552734375, "completions/mean_terminated_length": 3208.552734375, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "entropy": 0.33174076676368713, "epoch": 1.5394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0015670978464186192, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 225257538.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0726796388626099, "sampling/importance_sampling_ratio/min": 0.00024072466476354748, "sampling/sampling_logp_difference/max": 8.331856727600098, "sampling/sampling_logp_difference/mean": 0.13068369030952454, "step": 585 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3343939632177353, "epoch": 1.5421052631578949, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0031, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3326832950115204, "epoch": 1.5447368421052632, "grad_norm": 0.0014766680542379618, "learning_rate": 1e-06, "loss": -0.001, "step": 587 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33043473958969116, "epoch": 1.5473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0007, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8420.0, "completions/max_terminated_length": 8420.0, "completions/mean_length": 3067.419921875, "completions/mean_terminated_length": 3067.419921875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.3476197272539139, "epoch": 1.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 227211513.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766195058822632, "sampling/importance_sampling_ratio/min": 0.0019623679108917713, "sampling/sampling_logp_difference/max": 6.233603477478027, "sampling/sampling_logp_difference/mean": 0.13657987117767334, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3495018184185028, "epoch": 1.5526315789473686, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34551526606082916, "epoch": 1.555263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3482108563184738, "epoch": 1.5578947368421052, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9909.0, "completions/max_terminated_length": 9909.0, "completions/mean_length": 3565.55078125, "completions/mean_terminated_length": 3565.55078125, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.3355472683906555, "epoch": 1.5605263157894735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 229445235.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07401442527771, "sampling/importance_sampling_ratio/min": 0.002402241574600339, "sampling/sampling_logp_difference/max": 6.031352996826172, "sampling/sampling_logp_difference/mean": 0.13260497152805328, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3372339904308319, "epoch": 1.563157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34043624997138977, "epoch": 1.5657894736842106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3385699391365051, "epoch": 1.568421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8710.0, "completions/max_terminated_length": 8710.0, "completions/mean_length": 3447.640625, "completions/mean_terminated_length": 3447.640625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.34544624388217926, "epoch": 1.5710526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0025073899887502193, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 231617083.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0748827457427979, "sampling/importance_sampling_ratio/min": 0.00485796295106411, "sampling/sampling_logp_difference/max": 5.327136039733887, "sampling/sampling_logp_difference/mean": 0.13470253348350525, "step": 597 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34406325221061707, "epoch": 1.5736842105263158, "grad_norm": 0.0018366898875683546, "learning_rate": 1e-06, "loss": -0.0033, "step": 598 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3406994640827179, "epoch": 1.5763157894736843, "grad_norm": 0.0024943349417299032, "learning_rate": 1e-06, "loss": -0.0031, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34323662519454956, "epoch": 1.5789473684210527, "grad_norm": 0.0022790487855672836, "learning_rate": 1e-06, "loss": 0.012, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8785.0, "completions/max_terminated_length": 8785.0, "completions/mean_length": 3447.328125, "completions/mean_terminated_length": 3447.328125, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "entropy": 0.3342888504266739, "epoch": 1.581578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 233809283.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0735173225402832, "sampling/importance_sampling_ratio/min": 0.002092229202389717, "sampling/sampling_logp_difference/max": 6.169525146484375, "sampling/sampling_logp_difference/mean": 0.13161174952983856, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.336413249373436, "epoch": 1.5842105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34070438146591187, "epoch": 1.586842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3335687518119812, "epoch": 1.5894736842105264, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10874.0, "completions/max_terminated_length": 10874.0, "completions/mean_length": 3652.05859375, "completions/mean_terminated_length": 3652.05859375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.34368671476840973, "epoch": 1.5921052631578947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0033029625192284584, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 236070433.0, "reward": 0.8967773914337158, "reward_std": 0.012890626676380634, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0755150318145752, "sampling/importance_sampling_ratio/min": 0.005488465074449778, "sampling/sampling_logp_difference/max": 5.205106735229492, "sampling/sampling_logp_difference/mean": 0.1346893161535263, "step": 605 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3443053811788559, "epoch": 1.594736842105263, "grad_norm": 0.0026250272057950497, "learning_rate": 1e-06, "loss": -0.0059, "step": 606 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34095072746276855, "epoch": 1.5973684210526315, "grad_norm": 0.00230777938850224, "learning_rate": 1e-06, "loss": 0.0055, "step": 607 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3454348146915436, "epoch": 1.6, "grad_norm": 0.0028525462839752436, "learning_rate": 1e-06, "loss": -0.0058, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10074.0, "completions/max_terminated_length": 10074.0, "completions/mean_length": 3599.337890625, "completions/mean_terminated_length": 3599.337890625, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "entropy": 0.33644211292266846, "epoch": 1.6026315789473684, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020826924592256546, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 238311086.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0749270915985107, "sampling/importance_sampling_ratio/min": 0.0038686427287757397, "sampling/sampling_logp_difference/max": 5.554851531982422, "sampling/sampling_logp_difference/mean": 0.13448479771614075, "step": 609 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34268292784690857, "epoch": 1.6052631578947367, "grad_norm": 0.0019481638446450233, "learning_rate": 1e-06, "loss": -0.0019, "step": 610 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34445828199386597, "epoch": 1.6078947368421053, "grad_norm": 0.02122001349925995, "learning_rate": 1e-06, "loss": 0.0055, "step": 611 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.345332071185112, "epoch": 1.6105263157894738, "grad_norm": 0.0013578027719631791, "learning_rate": 1e-06, "loss": -0.0021, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11677.0, "completions/max_terminated_length": 11677.0, "completions/mean_length": 3474.3359375, "completions/mean_terminated_length": 3474.3359375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 0.3407287895679474, "epoch": 1.6131578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0032210287172347307, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 240479418.0, "reward": 0.89697265625, "reward_std": 0.008394349366426468, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0745296478271484, "sampling/importance_sampling_ratio/min": 0.0037957418244332075, "sampling/sampling_logp_difference/max": 5.573875427246094, "sampling/sampling_logp_difference/mean": 0.13399165868759155, "step": 613 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3345174044370651, "epoch": 1.6157894736842104, "grad_norm": 0.002166792983189225, "learning_rate": 1e-06, "loss": -0.0039, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3419598489999771, "epoch": 1.618421052631579, "grad_norm": 0.00296321720816195, "learning_rate": 1e-06, "loss": -0.0035, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3420746624469757, "epoch": 1.6210526315789475, "grad_norm": 0.0038187119644135237, "learning_rate": 1e-06, "loss": 0.0203, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8462.0, "completions/max_terminated_length": 8462.0, "completions/mean_length": 3327.880859375, "completions/mean_terminated_length": 3327.880859375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 0.33613699674606323, "epoch": 1.6236842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 242590589.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073636770248413, "sampling/importance_sampling_ratio/min": 0.003098087152466178, "sampling/sampling_logp_difference/max": 5.776970386505127, "sampling/sampling_logp_difference/mean": 0.13247191905975342, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33432984352111816, "epoch": 1.6263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3361922949552536, "epoch": 1.6289473684210525, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3377861827611923, "epoch": 1.631578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12986.0, "completions/max_terminated_length": 12986.0, "completions/mean_length": 3558.421875, "completions/mean_terminated_length": 3558.421875, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.3314055800437927, "epoch": 1.6342105263157896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 244828661.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0731018781661987, "sampling/importance_sampling_ratio/min": 0.003603918943554163, "sampling/sampling_logp_difference/max": 5.625733375549316, "sampling/sampling_logp_difference/mean": 0.13093644380569458, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3309334069490433, "epoch": 1.6368421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3373106122016907, "epoch": 1.6394736842105262, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3339690566062927, "epoch": 1.6421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9544.0, "completions/max_terminated_length": 9544.0, "completions/mean_length": 3277.38671875, "completions/mean_terminated_length": 3277.38671875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "entropy": 0.33971530199050903, "epoch": 1.6447368421052633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 246879131.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0739367008209229, "sampling/importance_sampling_ratio/min": 0.002753001870587468, "sampling/sampling_logp_difference/max": 5.895063400268555, "sampling/sampling_logp_difference/mean": 0.13331833481788635, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3414822071790695, "epoch": 1.6473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3398244380950928, "epoch": 1.65, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33861152827739716, "epoch": 1.6526315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9862.0, "completions/max_terminated_length": 9862.0, "completions/mean_length": 3336.978515625, "completions/mean_terminated_length": 3336.978515625, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.3391132652759552, "epoch": 1.655263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024971726816147566, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 248964784.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0743725299835205, "sampling/importance_sampling_ratio/min": 0.002229302190244198, "sampling/sampling_logp_difference/max": 6.106066703796387, "sampling/sampling_logp_difference/mean": 0.1329614520072937, "step": 629 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3418237268924713, "epoch": 1.6578947368421053, "grad_norm": 0.0021945717744529247, "learning_rate": 1e-06, "loss": -0.0022, "step": 630 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33484601974487305, "epoch": 1.6605263157894736, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0014, "step": 631 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3404732346534729, "epoch": 1.663157894736842, "grad_norm": 0.0013345663901418447, "learning_rate": 1e-06, "loss": -0.0011, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9330.0, "completions/max_terminated_length": 9330.0, "completions/mean_length": 3384.9453125, "completions/mean_terminated_length": 3384.9453125, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "entropy": 0.33236537873744965, "epoch": 1.6657894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 251081332.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0731902122497559, "sampling/importance_sampling_ratio/min": 0.0031901318579912186, "sampling/sampling_logp_difference/max": 5.747693061828613, "sampling/sampling_logp_difference/mean": 0.13135936856269836, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3363723158836365, "epoch": 1.668421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33142542839050293, "epoch": 1.6710526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3376529961824417, "epoch": 1.6736842105263157, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9368.0, "completions/max_terminated_length": 9368.0, "completions/mean_length": 3274.16796875, "completions/mean_terminated_length": 3274.16796875, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.3349597305059433, "epoch": 1.6763157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023131445050239563, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 253150666.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0731512308120728, "sampling/importance_sampling_ratio/min": 0.0032096439972519875, "sampling/sampling_logp_difference/max": 5.741595268249512, "sampling/sampling_logp_difference/mean": 0.13173502683639526, "step": 637 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.33449897170066833, "epoch": 1.6789473684210527, "grad_norm": 0.024591682478785515, "learning_rate": 1e-06, "loss": 0.0085, "step": 638 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3377806395292282, "epoch": 1.681578947368421, "grad_norm": 0.0012858399422839284, "learning_rate": 1e-06, "loss": -0.0022, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33622539043426514, "epoch": 1.6842105263157894, "grad_norm": 0.002245208714157343, "learning_rate": 1e-06, "loss": -0.0021, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9670.0, "completions/max_terminated_length": 9670.0, "completions/mean_length": 3464.44921875, "completions/mean_terminated_length": 3464.44921875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.329209104180336, "epoch": 1.686842105263158, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020087119191884995, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 255322256.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0737781524658203, "sampling/importance_sampling_ratio/min": 0.002425621496513486, "sampling/sampling_logp_difference/max": 6.02166748046875, "sampling/sampling_logp_difference/mean": 0.13241076469421387, "step": 641 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34017790853977203, "epoch": 1.6894736842105265, "grad_norm": 0.0017875040648505092, "learning_rate": 1e-06, "loss": -0.0011, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34326620399951935, "epoch": 1.6921052631578948, "grad_norm": 0.018837234005331993, "learning_rate": 1e-06, "loss": 0.0035, "step": 643 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3369489312171936, "epoch": 1.694736842105263, "grad_norm": 0.0020364064257591963, "learning_rate": 1e-06, "loss": -0.0016, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8938.0, "completions/max_terminated_length": 8938.0, "completions/mean_length": 3311.94921875, "completions/mean_terminated_length": 3311.94921875, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "entropy": 0.33389556407928467, "epoch": 1.6973684210526314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 257429462.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0737931728363037, "sampling/importance_sampling_ratio/min": 0.0022522357758134604, "sampling/sampling_logp_difference/max": 6.095831871032715, "sampling/sampling_logp_difference/mean": 0.1326713114976883, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33604227006435394, "epoch": 1.7, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33722665905952454, "epoch": 1.7026315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33853714168071747, "epoch": 1.7052631578947368, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8644.0, "completions/max_terminated_length": 8644.0, "completions/mean_length": 3344.76171875, "completions/mean_terminated_length": 3344.76171875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.34471651911735535, "epoch": 1.7078947368421051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 259544444.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075309157371521, "sampling/importance_sampling_ratio/min": 0.0023694748524576426, "sampling/sampling_logp_difference/max": 6.045086860656738, "sampling/sampling_logp_difference/mean": 0.1349734365940094, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3480121046304703, "epoch": 1.7105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34284743666648865, "epoch": 1.7131578947368422, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3409724235534668, "epoch": 1.7157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8039.0, "completions/max_terminated_length": 8039.0, "completions/mean_length": 3284.404296875, "completions/mean_terminated_length": 3284.404296875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 0.3481387346982956, "epoch": 1.7184210526315788, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001667381846345961, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 261609675.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0761690139770508, "sampling/importance_sampling_ratio/min": 0.00039532361552119255, "sampling/sampling_logp_difference/max": 7.835805892944336, "sampling/sampling_logp_difference/mean": 0.13541975617408752, "step": 653 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3457929790019989, "epoch": 1.7210526315789474, "grad_norm": 0.002012541750445962, "learning_rate": 1e-06, "loss": -0.0014, "step": 654 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34793421626091003, "epoch": 1.723684210526316, "grad_norm": 0.017752334475517273, "learning_rate": 1e-06, "loss": 0.0016, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3432317227125168, "epoch": 1.7263157894736842, "grad_norm": 0.0024642220232635736, "learning_rate": 1e-06, "loss": -0.0024, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10953.0, "completions/max_terminated_length": 10953.0, "completions/mean_length": 3374.3046875, "completions/mean_terminated_length": 3374.3046875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.3519081324338913, "epoch": 1.7289473684210526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 263725927.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768013000488281, "sampling/importance_sampling_ratio/min": 0.005194542929530144, "sampling/sampling_logp_difference/max": 5.260146617889404, "sampling/sampling_logp_difference/mean": 0.13699805736541748, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35108350217342377, "epoch": 1.731578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3475366085767746, "epoch": 1.7342105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3490852415561676, "epoch": 1.736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9033.0, "completions/max_terminated_length": 9033.0, "completions/mean_length": 3316.60546875, "completions/mean_terminated_length": 3316.60546875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.3423232436180115, "epoch": 1.7394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019378490978851914, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 265812317.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0752824544906616, "sampling/importance_sampling_ratio/min": 0.004177606198936701, "sampling/sampling_logp_difference/max": 5.4780168533325195, "sampling/sampling_logp_difference/mean": 0.13485771417617798, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34627868235111237, "epoch": 1.7421052631578946, "grad_norm": 0.0018145004287362099, "learning_rate": 1e-06, "loss": -0.0017, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3428933322429657, "epoch": 1.7447368421052631, "grad_norm": 0.014095846563577652, "learning_rate": 1e-06, "loss": 0.0015, "step": 663 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34317877888679504, "epoch": 1.7473684210526317, "grad_norm": 0.0023692846298217773, "learning_rate": 1e-06, "loss": -0.0025, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15116.0, "completions/max_terminated_length": 15116.0, "completions/mean_length": 3510.1484375, "completions/mean_terminated_length": 3510.1484375, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "entropy": 0.3386514484882355, "epoch": 1.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 268007145.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0744856595993042, "sampling/importance_sampling_ratio/min": 0.0019565075635910034, "sampling/sampling_logp_difference/max": 6.236594200134277, "sampling/sampling_logp_difference/mean": 0.13337922096252441, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3412347435951233, "epoch": 1.7526315789473683, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34147390723228455, "epoch": 1.7552631578947369, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.338520348072052, "epoch": 1.7578947368421054, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10984.0, "completions/max_terminated_length": 10984.0, "completions/mean_length": 3533.943359375, "completions/mean_terminated_length": 3533.943359375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.3367491662502289, "epoch": 1.7605263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002047416754066944, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 270215180.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0740793943405151, "sampling/importance_sampling_ratio/min": 0.004092976916581392, "sampling/sampling_logp_difference/max": 5.498482704162598, "sampling/sampling_logp_difference/mean": 0.13250817358493805, "step": 669 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33420705795288086, "epoch": 1.763157894736842, "grad_norm": 0.0014927912270650268, "learning_rate": 1e-06, "loss": -0.0035, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3376428931951523, "epoch": 1.7657894736842106, "grad_norm": 0.022721925750374794, "learning_rate": 1e-06, "loss": 0.0063, "step": 671 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33732420206069946, "epoch": 1.768421052631579, "grad_norm": 0.0017170595237985253, "learning_rate": 1e-06, "loss": -0.0039, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9807.0, "completions/max_terminated_length": 9807.0, "completions/mean_length": 3261.955078125, "completions/mean_terminated_length": 3261.955078125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.34403252601623535, "epoch": 1.7710526315789474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 272283093.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0746586322784424, "sampling/importance_sampling_ratio/min": 0.0021890460047870874, "sampling/sampling_logp_difference/max": 6.124289512634277, "sampling/sampling_logp_difference/mean": 0.13363787531852722, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33919790387153625, "epoch": 1.7736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3388398438692093, "epoch": 1.776315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3362094908952713, "epoch": 1.7789473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9577.0, "completions/max_terminated_length": 9577.0, "completions/mean_length": 3547.5234375, "completions/mean_terminated_length": 3547.5234375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "entropy": 0.3383502960205078, "epoch": 1.7815789473684212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 274509569.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0738481283187866, "sampling/importance_sampling_ratio/min": 0.002451168606057763, "sampling/sampling_logp_difference/max": 6.011190414428711, "sampling/sampling_logp_difference/mean": 0.13186538219451904, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33321164548397064, "epoch": 1.7842105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33688168227672577, "epoch": 1.7868421052631578, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33217282593250275, "epoch": 1.7894736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13706.0, "completions/max_terminated_length": 13706.0, "completions/mean_length": 3441.240234375, "completions/mean_terminated_length": 3441.240234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.3406982123851776, "epoch": 1.7921052631578949, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0022487174719572067, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 276671644.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074965000152588, "sampling/importance_sampling_ratio/min": 0.002463805489242077, "sampling/sampling_logp_difference/max": 6.006048202514648, "sampling/sampling_logp_difference/mean": 0.13387629389762878, "step": 681 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34350623190402985, "epoch": 1.7947368421052632, "grad_norm": 0.0021750673186033964, "learning_rate": 1e-06, "loss": -0.0024, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3361895978450775, "epoch": 1.7973684210526315, "grad_norm": 0.0018264730460941792, "learning_rate": 1e-06, "loss": -0.0019, "step": 683 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34400539100170135, "epoch": 1.8, "grad_norm": 0.0015250913565978408, "learning_rate": 1e-06, "loss": -0.0028, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10044.0, "completions/max_terminated_length": 10044.0, "completions/mean_length": 3462.36328125, "completions/mean_terminated_length": 3462.36328125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.33978745341300964, "epoch": 1.8026315789473686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 278855542.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075028419494629, "sampling/importance_sampling_ratio/min": 0.0009050782537087798, "sampling/sampling_logp_difference/max": 7.007489204406738, "sampling/sampling_logp_difference/mean": 0.13410016894340515, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34209468960762024, "epoch": 1.805263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34040695428848267, "epoch": 1.8078947368421052, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3399760276079178, "epoch": 1.8105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9879.0, "completions/max_terminated_length": 9879.0, "completions/mean_length": 3659.15625, "completions/mean_terminated_length": 3659.15625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 0.341571107506752, "epoch": 1.813157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021675708703696728, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 281132678.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0743193626403809, "sampling/importance_sampling_ratio/min": 0.003050927072763443, "sampling/sampling_logp_difference/max": 5.792309761047363, "sampling/sampling_logp_difference/mean": 0.13293233513832092, "step": 689 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3404027372598648, "epoch": 1.8157894736842106, "grad_norm": 0.0012999862665310502, "learning_rate": 1e-06, "loss": 0.0072, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33470624685287476, "epoch": 1.818421052631579, "grad_norm": 0.002411535009741783, "learning_rate": 1e-06, "loss": -0.0027, "step": 691 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33844296634197235, "epoch": 1.8210526315789473, "grad_norm": 0.0015728527214378119, "learning_rate": 1e-06, "loss": -0.0026, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10146.0, "completions/max_terminated_length": 10146.0, "completions/mean_length": 3606.326171875, "completions/mean_terminated_length": 3606.326171875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.34725117683410645, "epoch": 1.8236842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 283381261.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0758707523345947, "sampling/importance_sampling_ratio/min": 0.004136885516345501, "sampling/sampling_logp_difference/max": 5.487812042236328, "sampling/sampling_logp_difference/mean": 0.13532403111457825, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3438762426376343, "epoch": 1.8263157894736843, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34386299550533295, "epoch": 1.8289473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3405921310186386, "epoch": 1.831578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15234.0, "completions/mean_length": 3883.541015625, "completions/mean_terminated_length": 3859.078369140625, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.33742207288742065, "epoch": 1.8342105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004055510275065899, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 285765442.0, "reward": 0.8971680402755737, "reward_std": 0.0077438391745090485, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.016457298770546913, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0754303932189941, "sampling/importance_sampling_ratio/min": 0.004632679279893637, "sampling/sampling_logp_difference/max": 5.374619960784912, "sampling/sampling_logp_difference/mean": 0.1345752775669098, "step": 697 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34411878883838654, "epoch": 1.836842105263158, "grad_norm": 0.004025896079838276, "learning_rate": 1e-06, "loss": -0.007, "step": 698 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34465864300727844, "epoch": 1.8394736842105264, "grad_norm": 0.0026052154134958982, "learning_rate": 1e-06, "loss": 0.0041, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34761345386505127, "epoch": 1.8421052631578947, "grad_norm": 0.003990442492067814, "learning_rate": 1e-06, "loss": -0.006, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9403.0, "completions/max_terminated_length": 9403.0, "completions/mean_length": 3361.40234375, "completions/mean_terminated_length": 3361.40234375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.3364853858947754, "epoch": 1.844736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 287893616.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073472499847412, "sampling/importance_sampling_ratio/min": 0.005268438719213009, "sampling/sampling_logp_difference/max": 5.246021270751953, "sampling/sampling_logp_difference/mean": 0.13186919689178467, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33494099974632263, "epoch": 1.8473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3360227197408676, "epoch": 1.85, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3305090069770813, "epoch": 1.8526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9793.0, "completions/max_terminated_length": 9793.0, "completions/mean_length": 3962.095703125, "completions/mean_terminated_length": 3962.095703125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.34073494374752045, "epoch": 1.8552631578947367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 290336705.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074371576309204, "sampling/importance_sampling_ratio/min": 0.004184260964393616, "sampling/sampling_logp_difference/max": 5.4764251708984375, "sampling/sampling_logp_difference/mean": 0.13311326503753662, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33502396941185, "epoch": 1.8578947368421053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3384702354669571, "epoch": 1.8605263157894738, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34034082293510437, "epoch": 1.8631578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11967.0, "completions/max_terminated_length": 11967.0, "completions/mean_length": 3847.791015625, "completions/mean_terminated_length": 3847.791015625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.32969391345977783, "epoch": 1.8657894736842104, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024385356809943914, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 292718422.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073617696762085, "sampling/importance_sampling_ratio/min": 3.668817271318403e-06, "sampling/sampling_logp_difference/max": 12.515641212463379, "sampling/sampling_logp_difference/mean": 0.13094830513000488, "step": 709 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33163173496723175, "epoch": 1.868421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0036, "step": 710 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.33412016928195953, "epoch": 1.8710526315789475, "grad_norm": 0.026372192427515984, "learning_rate": 1e-06, "loss": 0.0134, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3383806645870209, "epoch": 1.8736842105263158, "grad_norm": 0.0018389305332675576, "learning_rate": 1e-06, "loss": -0.0021, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9807.0, "completions/max_terminated_length": 9807.0, "completions/mean_length": 3515.87109375, "completions/mean_terminated_length": 3515.87109375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.3475053906440735, "epoch": 1.8763157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002811212558299303, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 294902164.0, "reward": 0.89697265625, "reward_std": 0.01210937649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075580358505249, "sampling/importance_sampling_ratio/min": 0.0015152755659073591, "sampling/sampling_logp_difference/max": 6.492157936096191, "sampling/sampling_logp_difference/mean": 0.1357021927833557, "step": 713 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3437517285346985, "epoch": 1.8789473684210525, "grad_norm": 0.002806734060868621, "learning_rate": 1e-06, "loss": -0.0046, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34555883705616, "epoch": 1.881578947368421, "grad_norm": 0.002586781047284603, "learning_rate": 1e-06, "loss": 0.0018, "step": 715 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3483405113220215, "epoch": 1.8842105263157896, "grad_norm": 0.024919819086790085, "learning_rate": 1e-06, "loss": 0.008, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10566.0, "completions/max_terminated_length": 10566.0, "completions/mean_length": 3346.30859375, "completions/mean_terminated_length": 3346.30859375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.34306687116622925, "epoch": 1.8868421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 297019122.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0755906105041504, "sampling/importance_sampling_ratio/min": 0.0039094919338822365, "sampling/sampling_logp_difference/max": 5.544347763061523, "sampling/sampling_logp_difference/mean": 0.13461332023143768, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34099486470222473, "epoch": 1.8894736842105262, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34473465383052826, "epoch": 1.8921052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.344462513923645, "epoch": 1.8947368421052633, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9163.0, "completions/max_terminated_length": 9163.0, "completions/mean_length": 3540.005859375, "completions/mean_terminated_length": 3540.005859375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.34413011372089386, "epoch": 1.8973684210526316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 299241429.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0772572755813599, "sampling/importance_sampling_ratio/min": 0.004300178959965706, "sampling/sampling_logp_difference/max": 5.449098587036133, "sampling/sampling_logp_difference/mean": 0.1373736560344696, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35340389609336853, "epoch": 1.9, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3526596426963806, "epoch": 1.9026315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3482683449983597, "epoch": 1.905263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8450.0, "completions/max_terminated_length": 8450.0, "completions/mean_length": 3471.900390625, "completions/mean_terminated_length": 3471.900390625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.3554147034883499, "epoch": 1.9078947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 301415010.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0789822340011597, "sampling/importance_sampling_ratio/min": 0.0008500578114762902, "sampling/sampling_logp_difference/max": 7.070206165313721, "sampling/sampling_logp_difference/mean": 0.13985846936702728, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3538322001695633, "epoch": 1.9105263157894736, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3619409203529358, "epoch": 1.913157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35994380712509155, "epoch": 1.9157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10013.0, "completions/max_terminated_length": 10013.0, "completions/mean_length": 3769.208984375, "completions/mean_terminated_length": 3769.208984375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 0.3502224385738373, "epoch": 1.918421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019520545611158013, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 303768685.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774219036102295, "sampling/importance_sampling_ratio/min": 0.00268715457059443, "sampling/sampling_logp_difference/max": 5.919272422790527, "sampling/sampling_logp_difference/mean": 0.13766522705554962, "step": 729 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3477538973093033, "epoch": 1.9210526315789473, "grad_norm": 0.0020143105648458004, "learning_rate": 1e-06, "loss": -0.0022, "step": 730 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3494712859392166, "epoch": 1.9236842105263157, "grad_norm": 0.0014784891391173005, "learning_rate": 1e-06, "loss": -0.0018, "step": 731 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35283252596855164, "epoch": 1.9263157894736842, "grad_norm": 0.03069218061864376, "learning_rate": 1e-06, "loss": 0.0177, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10389.0, "completions/max_terminated_length": 10389.0, "completions/mean_length": 3560.8515625, "completions/mean_terminated_length": 3560.8515625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "entropy": 0.3573026657104492, "epoch": 1.9289473684210527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 305981473.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0794272422790527, "sampling/importance_sampling_ratio/min": 0.004254009574651718, "sampling/sampling_logp_difference/max": 5.459893226623535, "sampling/sampling_logp_difference/mean": 0.14001327753067017, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3597280979156494, "epoch": 1.931578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3543603867292404, "epoch": 1.9342105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3588518798351288, "epoch": 1.936842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9486.0, "completions/max_terminated_length": 9486.0, "completions/mean_length": 3560.462890625, "completions/mean_terminated_length": 3560.462890625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.36161401867866516, "epoch": 1.9394736842105265, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023902805987745523, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 308202062.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0801494121551514, "sampling/importance_sampling_ratio/min": 0.0026822001673281193, "sampling/sampling_logp_difference/max": 5.921117782592773, "sampling/sampling_logp_difference/mean": 0.14135870337486267, "step": 737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3572559207677841, "epoch": 1.9421052631578948, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0029, "step": 738 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3620138615369797, "epoch": 1.944736842105263, "grad_norm": 0.0017557261744514108, "learning_rate": 1e-06, "loss": -0.0035, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3596826046705246, "epoch": 1.9473684210526314, "grad_norm": 0.0017345166997984052, "learning_rate": 1e-06, "loss": 0.0113, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11681.0, "completions/max_terminated_length": 11681.0, "completions/mean_length": 3739.15625, "completions/mean_terminated_length": 3739.15625, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.36253704130649567, "epoch": 1.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 310532670.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0792697668075562, "sampling/importance_sampling_ratio/min": 0.0028592911548912525, "sampling/sampling_logp_difference/max": 5.857181549072266, "sampling/sampling_logp_difference/mean": 0.1403844952583313, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3555978536605835, "epoch": 1.9526315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3534139096736908, "epoch": 1.9552631578947368, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35781463980674744, "epoch": 1.9578947368421051, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10375.0, "completions/max_terminated_length": 10375.0, "completions/mean_length": 3780.607421875, "completions/mean_terminated_length": 3780.607421875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.3578491806983948, "epoch": 1.9605263157894737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 312890837.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0795950889587402, "sampling/importance_sampling_ratio/min": 0.004698720760643482, "sampling/sampling_logp_difference/max": 5.360465049743652, "sampling/sampling_logp_difference/mean": 0.14015620946884155, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3536480516195297, "epoch": 1.9631578947368422, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35741159319877625, "epoch": 1.9657894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3618907332420349, "epoch": 1.9684210526315788, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9125.0, "completions/max_terminated_length": 9125.0, "completions/mean_length": 3344.76953125, "completions/mean_terminated_length": 3344.76953125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.3646991401910782, "epoch": 1.9710526315789474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 314960287.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0806875228881836, "sampling/importance_sampling_ratio/min": 0.0010063308291137218, "sampling/sampling_logp_difference/max": 6.901444435119629, "sampling/sampling_logp_difference/mean": 0.14274956285953522, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36066919565200806, "epoch": 1.973684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36283013224601746, "epoch": 1.9763157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35837408900260925, "epoch": 1.9789473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10750.0, "completions/max_terminated_length": 10750.0, "completions/mean_length": 3362.15625, "completions/mean_terminated_length": 3362.15625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.36036379635334015, "epoch": 1.981578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 317101359.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0809118747711182, "sampling/importance_sampling_ratio/min": 0.003745832247659564, "sampling/sampling_logp_difference/max": 5.587111473083496, "sampling/sampling_logp_difference/mean": 0.14238294959068298, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36399853229522705, "epoch": 1.9842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3615352362394333, "epoch": 1.986842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36251747608184814, "epoch": 1.9894736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9536.0, "completions/max_terminated_length": 9536.0, "completions/mean_length": 3496.650390625, "completions/mean_terminated_length": 3496.650390625, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "entropy": 0.36202944815158844, "epoch": 1.9921052631578946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 319296956.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.080052137374878, "sampling/importance_sampling_ratio/min": 0.0019087025430053473, "sampling/sampling_logp_difference/max": 6.261331558227539, "sampling/sampling_logp_difference/mean": 0.14239847660064697, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36309894919395447, "epoch": 1.9947368421052631, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3586999177932739, "epoch": 1.9973684210526317, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35834677517414093, "epoch": 2.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 760 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000244140625, "eval_completions/max_length": 8231.84375, "eval_completions/max_terminated_length": 8136.375, "eval_completions/mean_length": 2280.115234375, "eval_completions/mean_terminated_length": 2276.8439407348633, "eval_completions/min_length": 430.1875, "eval_completions/min_terminated_length": 430.1875, "eval_entropy": 0.35164141561836004, "eval_frac_reward_zero_std": 0.9921875, "eval_loss": 0.0003617434995248914, "eval_num_tokens": 319296956.0, "eval_reward": 0.8997558951377869, "eval_reward_std": 0.0009765626164153218, "eval_rewards/progression_diversity/mean": 0.0, "eval_rewards/progression_diversity/std": 0.0, "eval_rewards/symbolic_reward_accuracy/mean": 0.999755859375, "eval_rewards/symbolic_reward_accuracy/std": 0.0027621358167380095, "eval_rewards/symbolic_reward_partial_score/mean": 0.999755859375, "eval_rewards/symbolic_reward_partial_score/std": 0.0027621358167380095, "eval_rewards/tag_count_reward/mean": -0.000244140625, "eval_rewards/tag_count_reward/std": 0.0027621358167380095, "eval_runtime": 4095.2895, "eval_samples_per_second": 0.061, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.077749066054821, "eval_sampling/importance_sampling_ratio/min": 0.007175851121075993, "eval_sampling/sampling_logp_difference/max": 5.190796986222267, "eval_sampling/sampling_logp_difference/mean": 0.13837473606690764, "eval_steps_per_second": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10761.0, "completions/max_terminated_length": 10761.0, "completions/mean_length": 3595.201171875, "completions/mean_terminated_length": 3595.201171875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.36298730969429016, "epoch": 2.0026315789473683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 321530659.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813624858856201, "sampling/importance_sampling_ratio/min": 0.003946871496737003, "sampling/sampling_logp_difference/max": 5.534832000732422, "sampling/sampling_logp_difference/mean": 0.14352434873580933, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.367371141910553, "epoch": 2.0052631578947366, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3665529638528824, "epoch": 2.0078947368421054, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3691776543855667, "epoch": 2.0105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9808.0, "completions/max_terminated_length": 9808.0, "completions/mean_length": 3425.44140625, "completions/mean_terminated_length": 3425.44140625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.3624395728111267, "epoch": 2.013157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 323691301.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0808817148208618, "sampling/importance_sampling_ratio/min": 0.002431049942970276, "sampling/sampling_logp_difference/max": 6.019432067871094, "sampling/sampling_logp_difference/mean": 0.14238983392715454, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36406005918979645, "epoch": 2.0157894736842104, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3598169535398483, "epoch": 2.018421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3641858845949173, "epoch": 2.0210526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9859.0, "completions/max_terminated_length": 9859.0, "completions/mean_length": 3323.517578125, "completions/mean_terminated_length": 3323.517578125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.35223451256752014, "epoch": 2.0236842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 325784238.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796624422073364, "sampling/importance_sampling_ratio/min": 0.0028084167279303074, "sampling/sampling_logp_difference/max": 5.875134468078613, "sampling/sampling_logp_difference/mean": 0.14061452448368073, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3612624555826187, "epoch": 2.026315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3575820177793503, "epoch": 2.028947368421053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3550865948200226, "epoch": 2.031578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12033.0, "completions/max_terminated_length": 12033.0, "completions/mean_length": 3548.966796875, "completions/mean_terminated_length": 3548.966796875, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.35931631922721863, "epoch": 2.0342105263157895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 328011613.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805346965789795, "sampling/importance_sampling_ratio/min": 0.0019306916510686278, "sampling/sampling_logp_difference/max": 6.249876976013184, "sampling/sampling_logp_difference/mean": 0.1414928287267685, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3585065007209778, "epoch": 2.036842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36112290620803833, "epoch": 2.039473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3583669066429138, "epoch": 2.042105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9312.0, "completions/max_terminated_length": 9312.0, "completions/mean_length": 3505.486328125, "completions/mean_terminated_length": 3505.486328125, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.3650098890066147, "epoch": 2.044736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 330185526.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813181400299072, "sampling/importance_sampling_ratio/min": 0.001822043675929308, "sampling/sampling_logp_difference/max": 6.307796478271484, "sampling/sampling_logp_difference/mean": 0.1434062123298645, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36926889419555664, "epoch": 2.0473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3631732910871506, "epoch": 2.05, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3639003485441208, "epoch": 2.0526315789473686, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11281.0, "completions/max_terminated_length": 11281.0, "completions/mean_length": 3788.052734375, "completions/mean_terminated_length": 3788.052734375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.35236436128616333, "epoch": 2.055263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004361342638731003, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 332550513.0, "reward": 0.8960449695587158, "reward_std": 0.008503163233399391, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99853515625, "rewards/symbolic_reward_partial_score/std": 0.019099153578281403, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078831672668457, "sampling/importance_sampling_ratio/min": 0.001703736837953329, "sampling/sampling_logp_difference/max": 6.374931335449219, "sampling/sampling_logp_difference/mean": 0.13904887437820435, "step": 781 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35258060693740845, "epoch": 2.057894736842105, "grad_norm": 0.0037441530730575323, "learning_rate": 1e-06, "loss": -0.0072, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3512864261865616, "epoch": 2.0605263157894735, "grad_norm": 0.004325737711042166, "learning_rate": 1e-06, "loss": -0.0061, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35623714327812195, "epoch": 2.0631578947368423, "grad_norm": 0.012732669711112976, "learning_rate": 1e-06, "loss": 0.0229, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8636.0, "completions/max_terminated_length": 8636.0, "completions/mean_length": 3515.12109375, "completions/mean_terminated_length": 3515.12109375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 0.3619864732027054, "epoch": 2.0657894736842106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 334777423.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0804877281188965, "sampling/importance_sampling_ratio/min": 0.0022474597208201885, "sampling/sampling_logp_difference/max": 6.097954750061035, "sampling/sampling_logp_difference/mean": 0.14149627089500427, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3623397946357727, "epoch": 2.068421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36073145270347595, "epoch": 2.0710526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3587031066417694, "epoch": 2.0736842105263156, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9839.0, "completions/max_terminated_length": 9839.0, "completions/mean_length": 3346.65625, "completions/mean_terminated_length": 3346.65625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.36421024799346924, "epoch": 2.0763157894736843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 336877695.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0816600322723389, "sampling/importance_sampling_ratio/min": 0.003113460959866643, "sampling/sampling_logp_difference/max": 5.77202033996582, "sampling/sampling_logp_difference/mean": 0.14348509907722473, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36605167388916016, "epoch": 2.0789473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3700164258480072, "epoch": 2.081578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3662918359041214, "epoch": 2.0842105263157893, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14800.0, "completions/max_terminated_length": 14800.0, "completions/mean_length": 3869.033203125, "completions/mean_terminated_length": 3869.033203125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.36017023026943207, "epoch": 2.086842105263158, "frac_reward_zero_std": 0.96875, "grad_norm": 0.018225817009806633, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 339271792.0, "reward": 0.8960937857627869, "reward_std": 0.008404643274843693, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9986978769302368, "rewards/symbolic_reward_partial_score/std": 0.018012749031186104, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0800925493240356, "sampling/importance_sampling_ratio/min": 0.003562139580026269, "sampling/sampling_logp_difference/max": 5.637393951416016, "sampling/sampling_logp_difference/mean": 0.1412864476442337, "step": 793 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.36104610562324524, "epoch": 2.0894736842105264, "grad_norm": 0.021239040419459343, "learning_rate": 1e-06, "loss": 0.0085, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3594662845134735, "epoch": 2.0921052631578947, "grad_norm": 0.005434577818959951, "learning_rate": 1e-06, "loss": -0.0093, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35890424251556396, "epoch": 2.094736842105263, "grad_norm": 0.004396317061036825, "learning_rate": 1e-06, "loss": 0.0052, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10691.0, "completions/max_terminated_length": 10691.0, "completions/mean_length": 3464.56640625, "completions/mean_terminated_length": 3464.56640625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.3645046800374985, "epoch": 2.0973684210526318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 341430930.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0804063081741333, "sampling/importance_sampling_ratio/min": 0.0034081318881362677, "sampling/sampling_logp_difference/max": 5.681591033935547, "sampling/sampling_logp_difference/mean": 0.14232930541038513, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3601907193660736, "epoch": 2.1, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3631383776664734, "epoch": 2.1026315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35969512164592743, "epoch": 2.1052631578947367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10718.0, "completions/max_terminated_length": 10718.0, "completions/mean_length": 3508.095703125, "completions/mean_terminated_length": 3508.095703125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.3545304834842682, "epoch": 2.1078947368421055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 343624867.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079469919204712, "sampling/importance_sampling_ratio/min": 0.0019319017883390188, "sampling/sampling_logp_difference/max": 6.249250411987305, "sampling/sampling_logp_difference/mean": 0.14034950733184814, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3559059500694275, "epoch": 2.110526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3500673472881317, "epoch": 2.113157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3585348278284073, "epoch": 2.1157894736842104, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8261.0, "completions/max_terminated_length": 8261.0, "completions/mean_length": 3677.546875, "completions/mean_terminated_length": 3677.546875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.3571825325489044, "epoch": 2.1184210526315788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 345928603.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079470157623291, "sampling/importance_sampling_ratio/min": 0.0022055478766560555, "sampling/sampling_logp_difference/max": 6.116779327392578, "sampling/sampling_logp_difference/mean": 0.14049874246120453, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35630688071250916, "epoch": 2.1210526315789475, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35744862258434296, "epoch": 2.123684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3563287556171417, "epoch": 2.126315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10644.0, "completions/max_terminated_length": 10644.0, "completions/mean_length": 3947.912109375, "completions/mean_terminated_length": 3947.912109375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.35348281264305115, "epoch": 2.1289473684210525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 348375438.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787529945373535, "sampling/importance_sampling_ratio/min": 0.0020357677713036537, "sampling/sampling_logp_difference/max": 6.196882247924805, "sampling/sampling_logp_difference/mean": 0.1388179510831833, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3535951226949692, "epoch": 2.1315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3539409786462784, "epoch": 2.1342105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35516853630542755, "epoch": 2.136842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10071.0, "completions/max_terminated_length": 10071.0, "completions/mean_length": 3520.287109375, "completions/mean_terminated_length": 3520.287109375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "entropy": 0.34783630073070526, "epoch": 2.139473684210526, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021344521082937717, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 350604833.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078248381614685, "sampling/importance_sampling_ratio/min": 0.0030024840962141752, "sampling/sampling_logp_difference/max": 5.808315277099609, "sampling/sampling_logp_difference/mean": 0.13739916682243347, "step": 813 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35168467462062836, "epoch": 2.1421052631578945, "grad_norm": 0.0012346720322966576, "learning_rate": 1e-06, "loss": -0.0019, "step": 814 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3539455831050873, "epoch": 2.1447368421052633, "grad_norm": 0.0017987735336646438, "learning_rate": 1e-06, "loss": -0.0022, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3499945104122162, "epoch": 2.1473684210526316, "grad_norm": 0.030344804748892784, "learning_rate": 1e-06, "loss": 0.0172, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14795.0, "completions/max_terminated_length": 14795.0, "completions/mean_length": 3808.421875, "completions/mean_terminated_length": 3808.421875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.34687963128089905, "epoch": 2.15, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004520186688750982, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 352974073.0, "reward": 0.8961914777755737, "reward_std": 0.00818823091685772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778783559799194, "sampling/importance_sampling_ratio/min": 0.003796433098614216, "sampling/sampling_logp_difference/max": 5.57369327545166, "sampling/sampling_logp_difference/mean": 0.13715752959251404, "step": 817 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34696531295776367, "epoch": 2.1526315789473682, "grad_norm": 0.0026537070516496897, "learning_rate": 1e-06, "loss": -0.0098, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3488131910562515, "epoch": 2.155263157894737, "grad_norm": 0.017045624554157257, "learning_rate": 1e-06, "loss": 0.007, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34774544835090637, "epoch": 2.1578947368421053, "grad_norm": 0.0037021986208856106, "learning_rate": 1e-06, "loss": 0.0156, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9068.0, "completions/max_terminated_length": 9068.0, "completions/mean_length": 3010.5078125, "completions/mean_terminated_length": 3010.5078125, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.34668752551078796, "epoch": 2.1605263157894736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 354925437.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0776565074920654, "sampling/importance_sampling_ratio/min": 0.0036361808888614178, "sampling/sampling_logp_difference/max": 5.6168212890625, "sampling/sampling_logp_difference/mean": 0.13762426376342773, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34951791167259216, "epoch": 2.163157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35019971430301666, "epoch": 2.1657894736842107, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3485657572746277, "epoch": 2.168421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11177.0, "completions/mean_length": 3425.408203125, "completions/mean_terminated_length": 3400.048828125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.35317307710647583, "epoch": 2.1710526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03592127189040184, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 357089070.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796266794204712, "sampling/importance_sampling_ratio/min": 0.00373989250510931, "sampling/sampling_logp_difference/max": 5.588698387145996, "sampling/sampling_logp_difference/mean": 0.14031460881233215, "step": 825 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35781554877758026, "epoch": 2.1736842105263157, "grad_norm": 0.0011857482604682446, "learning_rate": 1e-06, "loss": -0.0018, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35762716829776764, "epoch": 2.1763157894736844, "grad_norm": 0.0019255193183198571, "learning_rate": 1e-06, "loss": -0.0018, "step": 827 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35916295647621155, "epoch": 2.1789473684210527, "grad_norm": 0.0015134497079998255, "learning_rate": 1e-06, "loss": -0.0016, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9881.0, "completions/max_terminated_length": 9881.0, "completions/mean_length": 3226.078125, "completions/mean_terminated_length": 3226.078125, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.36967386305332184, "epoch": 2.181578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 359121270.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0817246437072754, "sampling/importance_sampling_ratio/min": 0.0019021262414753437, "sampling/sampling_logp_difference/max": 6.264782905578613, "sampling/sampling_logp_difference/mean": 0.14368581771850586, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36373527348041534, "epoch": 2.1842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3680852949619293, "epoch": 2.1868421052631577, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.362392783164978, "epoch": 2.1894736842105265, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8042.0, "completions/max_terminated_length": 8042.0, "completions/mean_length": 2635.0, "completions/mean_terminated_length": 2635.0, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.35058173537254333, "epoch": 2.192105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021791746839880943, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 360865014.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782395601272583, "sampling/importance_sampling_ratio/min": 0.0024804144632071257, "sampling/sampling_logp_difference/max": 5.999329566955566, "sampling/sampling_logp_difference/mean": 0.13835185766220093, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3521229177713394, "epoch": 2.194736842105263, "grad_norm": 0.023094572126865387, "learning_rate": 1e-06, "loss": 0.0074, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3483341783285141, "epoch": 2.1973684210526314, "grad_norm": 0.0019297855906188488, "learning_rate": 1e-06, "loss": -0.0022, "step": 835 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34995439648628235, "epoch": 2.2, "grad_norm": 0.0012441467260941863, "learning_rate": 1e-06, "loss": -0.0021, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8547.0, "completions/max_terminated_length": 8547.0, "completions/mean_length": 3289.556640625, "completions/mean_terminated_length": 3289.556640625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.3641905039548874, "epoch": 2.2026315789473685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 362937555.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813119411468506, "sampling/importance_sampling_ratio/min": 0.0018664266681298614, "sampling/sampling_logp_difference/max": 6.283729553222656, "sampling/sampling_logp_difference/mean": 0.1430273950099945, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3650030046701431, "epoch": 2.205263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3656873404979706, "epoch": 2.207894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3581591993570328, "epoch": 2.2105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9270.0, "completions/max_terminated_length": 9270.0, "completions/mean_length": 2924.392578125, "completions/mean_terminated_length": 2924.392578125, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.3600582480430603, "epoch": 2.213157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00194226810708642, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 364829628.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805137157440186, "sampling/importance_sampling_ratio/min": 0.005263281054794788, "sampling/sampling_logp_difference/max": 5.247000694274902, "sampling/sampling_logp_difference/mean": 0.1419794261455536, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.357988566160202, "epoch": 2.2157894736842105, "grad_norm": 0.0019932533614337444, "learning_rate": 1e-06, "loss": 0.0049, "step": 842 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35795870423316956, "epoch": 2.218421052631579, "grad_norm": 0.0014545193407684565, "learning_rate": 1e-06, "loss": -0.0022, "step": 843 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3594760149717331, "epoch": 2.221052631578947, "grad_norm": 0.0016534478636458516, "learning_rate": 1e-06, "loss": -0.0026, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8782.0, "completions/max_terminated_length": 8782.0, "completions/mean_length": 2783.583984375, "completions/mean_terminated_length": 2783.583984375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.3533519208431244, "epoch": 2.223684210526316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 366637255.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0811340808868408, "sampling/importance_sampling_ratio/min": 0.0004352092510089278, "sampling/sampling_logp_difference/max": 7.739683628082275, "sampling/sampling_logp_difference/mean": 0.1418296992778778, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35684728622436523, "epoch": 2.2263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36099888384342194, "epoch": 2.2289473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3621361255645752, "epoch": 2.231578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8370.0, "completions/max_terminated_length": 8370.0, "completions/mean_length": 3118.376953125, "completions/mean_terminated_length": 3118.376953125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.3650641292333603, "epoch": 2.2342105263157896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 368637352.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.082280158996582, "sampling/importance_sampling_ratio/min": 0.0026092303451150656, "sampling/sampling_logp_difference/max": 5.948699951171875, "sampling/sampling_logp_difference/mean": 0.1442147195339203, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3680501878261566, "epoch": 2.236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3672652691602707, "epoch": 2.2394736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3655097186565399, "epoch": 2.2421052631578946, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9824.0, "completions/max_terminated_length": 9824.0, "completions/mean_length": 2764.900390625, "completions/mean_terminated_length": 2764.900390625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.3595215082168579, "epoch": 2.2447368421052634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 370436437.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805084705352783, "sampling/importance_sampling_ratio/min": 0.0016191472532227635, "sampling/sampling_logp_difference/max": 6.42585563659668, "sampling/sampling_logp_difference/mean": 0.1419532597064972, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35886846482753754, "epoch": 2.2473684210526317, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3609441816806793, "epoch": 2.25, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3563035726547241, "epoch": 2.2526315789473683, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7658.0, "completions/max_terminated_length": 7658.0, "completions/mean_length": 3001.794921875, "completions/mean_terminated_length": 3001.794921875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.3596903085708618, "epoch": 2.2552631578947366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 372386668.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805888175964355, "sampling/importance_sampling_ratio/min": 0.006828156765550375, "sampling/sampling_logp_difference/max": 4.986700534820557, "sampling/sampling_logp_difference/mean": 0.14145098626613617, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35975906252861023, "epoch": 2.2578947368421054, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35930413007736206, "epoch": 2.2605263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35540255904197693, "epoch": 2.263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8183.0, "completions/max_terminated_length": 8183.0, "completions/mean_length": 3158.125, "completions/mean_terminated_length": 3158.125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.36748993396759033, "epoch": 2.2657894736842104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 374413612.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813016891479492, "sampling/importance_sampling_ratio/min": 0.0036390256136655807, "sampling/sampling_logp_difference/max": 5.616039276123047, "sampling/sampling_logp_difference/mean": 0.14284461736679077, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3594106435775757, "epoch": 2.268421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35894837975502014, "epoch": 2.2710526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36357444524765015, "epoch": 2.2736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9334.0, "completions/max_terminated_length": 9334.0, "completions/mean_length": 3025.111328125, "completions/mean_terminated_length": 3025.111328125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.3758433759212494, "epoch": 2.276315789473684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 376323749.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.084080457687378, "sampling/importance_sampling_ratio/min": 0.0035155261866748333, "sampling/sampling_logp_difference/max": 5.650566101074219, "sampling/sampling_logp_difference/mean": 0.1470998227596283, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36874088644981384, "epoch": 2.2789473684210524, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3739761412143707, "epoch": 2.281578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3761322647333145, "epoch": 2.2842105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8993.0, "completions/max_terminated_length": 8993.0, "completions/mean_length": 2841.529296875, "completions/mean_terminated_length": 2841.529296875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.36181609332561493, "epoch": 2.286842105263158, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021512943785637617, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 378174900.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0818989276885986, "sampling/importance_sampling_ratio/min": 0.0024867765605449677, "sampling/sampling_logp_difference/max": 5.996767997741699, "sampling/sampling_logp_difference/mean": 0.14259354770183563, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3607634902000427, "epoch": 2.2894736842105265, "grad_norm": 0.0021994290873408318, "learning_rate": 1e-06, "loss": -0.0024, "step": 870 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.36314135789871216, "epoch": 2.292105263157895, "grad_norm": 0.0012428892077878118, "learning_rate": 1e-06, "loss": -0.0018, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3619815558195114, "epoch": 2.294736842105263, "grad_norm": 0.0015261647058650851, "learning_rate": 1e-06, "loss": 0.0073, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9245.0, "completions/max_terminated_length": 9245.0, "completions/mean_length": 3123.599609375, "completions/mean_terminated_length": 3123.599609375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "entropy": 0.3635769635438919, "epoch": 2.2973684210526315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 380179495.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0816640853881836, "sampling/importance_sampling_ratio/min": 0.0026533696800470352, "sampling/sampling_logp_difference/max": 5.931924819946289, "sampling/sampling_logp_difference/mean": 0.14348940551280975, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3607851415872574, "epoch": 2.3, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3640567660331726, "epoch": 2.3026315789473686, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36307522654533386, "epoch": 2.305263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9730.0, "completions/max_terminated_length": 9730.0, "completions/mean_length": 2986.474609375, "completions/mean_terminated_length": 2986.474609375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.3658098876476288, "epoch": 2.307894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 382115066.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.081493616104126, "sampling/importance_sampling_ratio/min": 0.0017810569843277335, "sampling/sampling_logp_difference/max": 6.330548286437988, "sampling/sampling_logp_difference/mean": 0.14339609444141388, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3651532083749771, "epoch": 2.3105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36263491213321686, "epoch": 2.3131578947368423, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36467084288597107, "epoch": 2.3157894736842106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9163.0, "completions/max_terminated_length": 9163.0, "completions/mean_length": 3212.955078125, "completions/mean_terminated_length": 3212.955078125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.37664617598056793, "epoch": 2.318421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 384140547.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0847699642181396, "sampling/importance_sampling_ratio/min": 0.00490396237000823, "sampling/sampling_logp_difference/max": 5.31771183013916, "sampling/sampling_logp_difference/mean": 0.14786867797374725, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3779408037662506, "epoch": 2.3210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3771331012248993, "epoch": 2.3236842105263156, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37672729790210724, "epoch": 2.3263157894736843, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7021.0, "completions/max_terminated_length": 7021.0, "completions/mean_length": 2579.265625, "completions/mean_terminated_length": 2579.265625, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.37041398882865906, "epoch": 2.3289473684210527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 385860427.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0829646587371826, "sampling/importance_sampling_ratio/min": 0.0038317593280225992, "sampling/sampling_logp_difference/max": 5.564431190490723, "sampling/sampling_logp_difference/mean": 0.14544130861759186, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3670762777328491, "epoch": 2.331578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36805543303489685, "epoch": 2.3342105263157893, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36852532625198364, "epoch": 2.336842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7678.0, "completions/max_terminated_length": 7678.0, "completions/mean_length": 2817.0078125, "completions/mean_terminated_length": 2817.0078125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.36171959340572357, "epoch": 2.3394736842105264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 387692367.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813450813293457, "sampling/importance_sampling_ratio/min": 0.004328156355768442, "sampling/sampling_logp_difference/max": 5.44261360168457, "sampling/sampling_logp_difference/mean": 0.14258888363838196, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36349526047706604, "epoch": 2.3421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35436564683914185, "epoch": 2.344736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35943491756916046, "epoch": 2.3473684210526318, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10302.0, "completions/max_terminated_length": 10302.0, "completions/mean_length": 2966.74609375, "completions/mean_terminated_length": 2966.74609375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.35607656836509705, "epoch": 2.35, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0022790690418332815, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 389585933.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0811139345169067, "sampling/importance_sampling_ratio/min": 0.0026939234230667353, "sampling/sampling_logp_difference/max": 5.916756629943848, "sampling/sampling_logp_difference/mean": 0.1432499885559082, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3614558279514313, "epoch": 2.3526315789473684, "grad_norm": 0.0023664783220738173, "learning_rate": 1e-06, "loss": -0.0031, "step": 894 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35658153891563416, "epoch": 2.3552631578947367, "grad_norm": 0.001570661086589098, "learning_rate": 1e-06, "loss": -0.0032, "step": 895 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3659001290798187, "epoch": 2.3578947368421055, "grad_norm": 0.0017783413641154766, "learning_rate": 1e-06, "loss": 0.0066, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12342.0, "completions/max_terminated_length": 12342.0, "completions/mean_length": 3032.390625, "completions/mean_terminated_length": 3032.390625, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.3593465983867645, "epoch": 2.360526315789474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 391556021.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0812950134277344, "sampling/importance_sampling_ratio/min": 0.0015374697977676988, "sampling/sampling_logp_difference/max": 6.477617263793945, "sampling/sampling_logp_difference/mean": 0.1418515145778656, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36153610050678253, "epoch": 2.363157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35894453525543213, "epoch": 2.3657894736842104, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36085371673107147, "epoch": 2.3684210526315788, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8186.0, "completions/max_terminated_length": 8186.0, "completions/mean_length": 3298.38671875, "completions/mean_terminated_length": 3298.38671875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 0.36277227103710175, "epoch": 2.3710526315789475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 393654779.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0808480978012085, "sampling/importance_sampling_ratio/min": 0.0075180670246481895, "sampling/sampling_logp_difference/max": 4.890446186065674, "sampling/sampling_logp_difference/mean": 0.14134301245212555, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36113254725933075, "epoch": 2.373684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35563766956329346, "epoch": 2.376315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3611921966075897, "epoch": 2.3789473684210525, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8667.0, "completions/max_terminated_length": 8667.0, "completions/mean_length": 3170.212890625, "completions/mean_terminated_length": 3170.212890625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.3660573810338974, "epoch": 2.3815789473684212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 395681736.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0818578004837036, "sampling/importance_sampling_ratio/min": 0.004758965689688921, "sampling/sampling_logp_difference/max": 5.347724914550781, "sampling/sampling_logp_difference/mean": 0.14306548237800598, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35714682936668396, "epoch": 2.3842105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36162807047367096, "epoch": 2.386842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3658362329006195, "epoch": 2.389473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10918.0, "completions/max_terminated_length": 10918.0, "completions/mean_length": 3196.40234375, "completions/mean_terminated_length": 3196.40234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.36716753244400024, "epoch": 2.3921052631578945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 397751638.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0810041427612305, "sampling/importance_sampling_ratio/min": 0.0024786717258393764, "sampling/sampling_logp_difference/max": 6.000032424926758, "sampling/sampling_logp_difference/mean": 0.1421126127243042, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3610660284757614, "epoch": 2.3947368421052633, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3567744791507721, "epoch": 2.3973684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35932107269763947, "epoch": 2.4, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11173.0, "completions/max_terminated_length": 11173.0, "completions/mean_length": 3390.716796875, "completions/mean_terminated_length": 3390.716796875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.3545672595500946, "epoch": 2.4026315789473682, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004027456510812044, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 399911685.0, "reward": 0.8961914777755737, "reward_std": 0.01523437537252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0799760818481445, "sampling/importance_sampling_ratio/min": 0.0014167280169203877, "sampling/sampling_logp_difference/max": 6.559405326843262, "sampling/sampling_logp_difference/mean": 0.14101743698120117, "step": 913 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3631473183631897, "epoch": 2.405263157894737, "grad_norm": 0.0018011536449193954, "learning_rate": 1e-06, "loss": -0.0089, "step": 914 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35653069615364075, "epoch": 2.4078947368421053, "grad_norm": 0.003670409321784973, "learning_rate": 1e-06, "loss": 0.0076, "step": 915 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3573293387889862, "epoch": 2.4105263157894736, "grad_norm": 0.0035192605573683977, "learning_rate": 1e-06, "loss": 0.0024, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11020.0, "completions/max_terminated_length": 11020.0, "completions/mean_length": 3115.115234375, "completions/mean_terminated_length": 3115.115234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.36557266116142273, "epoch": 2.413157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02152114547789097, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 401908928.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0817641019821167, "sampling/importance_sampling_ratio/min": 0.001008402556180954, "sampling/sampling_logp_difference/max": 6.899387836456299, "sampling/sampling_logp_difference/mean": 0.1427956223487854, "step": 917 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.36513763666152954, "epoch": 2.4157894736842107, "grad_norm": 0.002639228478074074, "learning_rate": 1e-06, "loss": -0.0042, "step": 918 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3633700907230377, "epoch": 2.418421052631579, "grad_norm": 0.004339069593697786, "learning_rate": 1e-06, "loss": 0.0085, "step": 919 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3666573613882065, "epoch": 2.4210526315789473, "grad_norm": 0.0027488991618156433, "learning_rate": 1e-06, "loss": -0.0057, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10815.0, "completions/max_terminated_length": 10815.0, "completions/mean_length": 3267.1171875, "completions/mean_terminated_length": 3267.1171875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.3684731721878052, "epoch": 2.4236842105263157, "frac_reward_zero_std": 0.96875, "grad_norm": 0.005090386141091585, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 404000860.0, "reward": 0.8957031965255737, "reward_std": 0.009383677504956722, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9973958134651184, "rewards/symbolic_reward_partial_score/std": 0.0453747883439064, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0833088159561157, "sampling/importance_sampling_ratio/min": 0.003141125664114952, "sampling/sampling_logp_difference/max": 5.763174057006836, "sampling/sampling_logp_difference/mean": 0.1452016979455948, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3731415569782257, "epoch": 2.4263157894736844, "grad_norm": 0.011077141389250755, "learning_rate": 1e-06, "loss": -0.0007, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37360669672489166, "epoch": 2.4289473684210527, "grad_norm": 0.0037241894751787186, "learning_rate": 1e-06, "loss": 0.0026, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37304292619228363, "epoch": 2.431578947368421, "grad_norm": 0.020754782482981682, "learning_rate": 1e-06, "loss": 0.0052, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10228.0, "completions/max_terminated_length": 10228.0, "completions/mean_length": 3098.818359375, "completions/mean_terminated_length": 3098.818359375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.3699934184551239, "epoch": 2.4342105263157894, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0026151048950850964, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 406001087.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0842585563659668, "sampling/importance_sampling_ratio/min": 0.0019444021163508296, "sampling/sampling_logp_difference/max": 6.242800712585449, "sampling/sampling_logp_difference/mean": 0.14681871235370636, "step": 925 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37845851480960846, "epoch": 2.4368421052631577, "grad_norm": 0.002389604691416025, "learning_rate": 1e-06, "loss": -0.0031, "step": 926 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3750576078891754, "epoch": 2.4394736842105265, "grad_norm": 0.0012132461415603757, "learning_rate": 1e-06, "loss": -0.0027, "step": 927 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3756653815507889, "epoch": 2.442105263157895, "grad_norm": 0.001817339682020247, "learning_rate": 1e-06, "loss": -0.0041, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9466.0, "completions/mean_length": 3340.31640625, "completions/mean_terminated_length": 3314.79052734375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.3821450173854828, "epoch": 2.444736842105263, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002393614035099745, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 408113793.0, "reward": 0.8967143297195435, "reward_std": 0.013142729178071022, "rewards/progression_diversity/mean": -0.0014197737909853458, "rewards/progression_diversity/std": 0.032125815749168396, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.045533329248428345, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0851178169250488, "sampling/importance_sampling_ratio/min": 0.003363084513694048, "sampling/sampling_logp_difference/max": 5.694896697998047, "sampling/sampling_logp_difference/mean": 0.147353857755661, "step": 929 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37932613492012024, "epoch": 2.4473684210526314, "grad_norm": 0.0023872023448348045, "learning_rate": 1e-06, "loss": -0.0037, "step": 930 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.372987300157547, "epoch": 2.45, "grad_norm": 0.02375985123217106, "learning_rate": 1e-06, "loss": 0.0358, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3830067366361618, "epoch": 2.4526315789473685, "grad_norm": 0.0027898119296878576, "learning_rate": 1e-06, "loss": -0.0035, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15376.0, "completions/max_terminated_length": 15376.0, "completions/mean_length": 3962.294921875, "completions/mean_terminated_length": 3962.294921875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.37915468215942383, "epoch": 2.455263157894737, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034961416386067867, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 410571320.0, "reward": 0.8974609375, "reward_std": 0.010156250558793545, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0855395793914795, "sampling/importance_sampling_ratio/min": 0.0032595572993159294, "sampling/sampling_logp_difference/max": 5.726163864135742, "sampling/sampling_logp_difference/mean": 0.14830079674720764, "step": 933 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.38199466466903687, "epoch": 2.457894736842105, "grad_norm": 0.033642303198575974, "learning_rate": 1e-06, "loss": 0.0234, "step": 934 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3828924298286438, "epoch": 2.4605263157894735, "grad_norm": 0.002853027544915676, "learning_rate": 1e-06, "loss": -0.0084, "step": 935 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38318072259426117, "epoch": 2.463157894736842, "grad_norm": 0.0032947203144431114, "learning_rate": 1e-06, "loss": -0.0076, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9606.0, "completions/max_terminated_length": 9606.0, "completions/mean_length": 3558.6328125, "completions/mean_terminated_length": 3558.6328125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.39143824577331543, "epoch": 2.4657894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 412799996.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.08713698387146, "sampling/importance_sampling_ratio/min": 0.002326086163520813, "sampling/sampling_logp_difference/max": 6.063568115234375, "sampling/sampling_logp_difference/mean": 0.15151986479759216, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3876110315322876, "epoch": 2.468421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3868740648031235, "epoch": 2.4710526315789476, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3873925507068634, "epoch": 2.473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8007.0, "completions/max_terminated_length": 8007.0, "completions/mean_length": 3393.525390625, "completions/mean_terminated_length": 3393.525390625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.3954293429851532, "epoch": 2.4763157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002155366586521268, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 414930441.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0895755290985107, "sampling/importance_sampling_ratio/min": 0.0019033129792660475, "sampling/sampling_logp_difference/max": 6.264159202575684, "sampling/sampling_logp_difference/mean": 0.1538197100162506, "step": 941 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3938916027545929, "epoch": 2.4789473684210526, "grad_norm": 0.0013757689157500863, "learning_rate": 1e-06, "loss": -0.0025, "step": 942 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40129780769348145, "epoch": 2.481578947368421, "grad_norm": 0.0015640616184100509, "learning_rate": 1e-06, "loss": -0.0024, "step": 943 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39768919348716736, "epoch": 2.4842105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0025, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11198.0, "completions/max_terminated_length": 11198.0, "completions/mean_length": 3469.017578125, "completions/mean_terminated_length": 3469.017578125, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "entropy": 0.3992462307214737, "epoch": 2.486842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 417097874.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0900344848632812, "sampling/importance_sampling_ratio/min": 0.002489559818059206, "sampling/sampling_logp_difference/max": 5.995649337768555, "sampling/sampling_logp_difference/mean": 0.15577396750450134, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4018537849187851, "epoch": 2.4894736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4007411599159241, "epoch": 2.4921052631578946, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3990521878004074, "epoch": 2.4947368421052634, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10753.0, "completions/max_terminated_length": 10753.0, "completions/mean_length": 3416.427734375, "completions/mean_terminated_length": 3416.427734375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.39834538102149963, "epoch": 2.4973684210526317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 419240205.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902082920074463, "sampling/importance_sampling_ratio/min": 0.003112442558631301, "sampling/sampling_logp_difference/max": 5.772347450256348, "sampling/sampling_logp_difference/mean": 0.15505889058113098, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3964284211397171, "epoch": 2.5, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3991597145795822, "epoch": 2.5026315789473683, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4031883478164673, "epoch": 2.5052631578947366, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8473.0, "completions/max_terminated_length": 8473.0, "completions/mean_length": 3089.087890625, "completions/mean_terminated_length": 3089.087890625, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "entropy": 0.39177127182483673, "epoch": 2.5078947368421054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 421213114.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0892750024795532, "sampling/importance_sampling_ratio/min": 0.0030540593434125185, "sampling/sampling_logp_difference/max": 5.79128360748291, "sampling/sampling_logp_difference/mean": 0.15330728888511658, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3939003646373749, "epoch": 2.5105263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3949681669473648, "epoch": 2.513157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39721885323524475, "epoch": 2.515789473684211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8395.0, "completions/max_terminated_length": 8395.0, "completions/mean_length": 3303.951171875, "completions/mean_terminated_length": 3303.951171875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.3971671760082245, "epoch": 2.518421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 423286849.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902173519134521, "sampling/importance_sampling_ratio/min": 0.005321144592016935, "sampling/sampling_logp_difference/max": 5.236066818237305, "sampling/sampling_logp_difference/mean": 0.15536397695541382, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3991956114768982, "epoch": 2.5210526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40103158354759216, "epoch": 2.5236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3964807689189911, "epoch": 2.526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9804.0, "completions/max_terminated_length": 9804.0, "completions/mean_length": 3629.876953125, "completions/mean_terminated_length": 3629.876953125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.39282380044460297, "epoch": 2.5289473684210524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 425561666.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.089064121246338, "sampling/importance_sampling_ratio/min": 0.0022420284803956747, "sampling/sampling_logp_difference/max": 6.100374221801758, "sampling/sampling_logp_difference/mean": 0.15388435125350952, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3928462564945221, "epoch": 2.531578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39881327748298645, "epoch": 2.5342105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39672617614269257, "epoch": 2.536842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13343.0, "completions/max_terminated_length": 13343.0, "completions/mean_length": 3489.171875, "completions/mean_terminated_length": 3489.171875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.40122637152671814, "epoch": 2.5394736842105265, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016905671218410134, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 427745594.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0901939868927002, "sampling/importance_sampling_ratio/min": 0.0036790890153497458, "sampling/sampling_logp_difference/max": 5.605090141296387, "sampling/sampling_logp_difference/mean": 0.15467068552970886, "step": 965 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4000319242477417, "epoch": 2.542105263157895, "grad_norm": 0.0016169939190149307, "learning_rate": 1e-06, "loss": -0.0013, "step": 966 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39701980352401733, "epoch": 2.544736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0246, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39805372059345245, "epoch": 2.5473684210526315, "grad_norm": 0.001755344565026462, "learning_rate": 1e-06, "loss": -0.0015, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9215.0, "completions/max_terminated_length": 9215.0, "completions/mean_length": 3308.947265625, "completions/mean_terminated_length": 3308.947265625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.40029896795749664, "epoch": 2.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 429838911.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.090530514717102, "sampling/importance_sampling_ratio/min": 0.004107793793082237, "sampling/sampling_logp_difference/max": 5.494869232177734, "sampling/sampling_logp_difference/mean": 0.1554190218448639, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4019162058830261, "epoch": 2.5526315789473686, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3964472711086273, "epoch": 2.555263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.402370348572731, "epoch": 2.557894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11182.0, "completions/max_terminated_length": 11182.0, "completions/mean_length": 3462.736328125, "completions/mean_terminated_length": 3462.736328125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.4014033377170563, "epoch": 2.5605263157894735, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017792794853448868, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 432004952.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0909943580627441, "sampling/importance_sampling_ratio/min": 0.004677068907767534, "sampling/sampling_logp_difference/max": 5.365083694458008, "sampling/sampling_logp_difference/mean": 0.15631479024887085, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40374694764614105, "epoch": 2.5631578947368423, "grad_norm": 0.002025263849645853, "learning_rate": 1e-06, "loss": -0.0021, "step": 974 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40185023844242096, "epoch": 2.5657894736842106, "grad_norm": 0.0014211626257747412, "learning_rate": 1e-06, "loss": -0.0017, "step": 975 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40274615585803986, "epoch": 2.568421052631579, "grad_norm": 0.0018871768843382597, "learning_rate": 1e-06, "loss": -0.0018, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8790.0, "completions/max_terminated_length": 8790.0, "completions/mean_length": 3724.693359375, "completions/mean_terminated_length": 3724.693359375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.40261590480804443, "epoch": 2.5710526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002017219550907612, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 434298619.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0903406143188477, "sampling/importance_sampling_ratio/min": 0.0024566189385950565, "sampling/sampling_logp_difference/max": 6.008969306945801, "sampling/sampling_logp_difference/mean": 0.15591517090797424, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3985179364681244, "epoch": 2.5736842105263156, "grad_norm": 0.0021839505061507225, "learning_rate": 1e-06, "loss": -0.0021, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39775681495666504, "epoch": 2.5763157894736843, "grad_norm": 0.0018639329355210066, "learning_rate": 1e-06, "loss": 0.0112, "step": 979 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4019179940223694, "epoch": 2.5789473684210527, "grad_norm": 0.0013285224558785558, "learning_rate": 1e-06, "loss": -0.0027, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8975.0, "completions/max_terminated_length": 8975.0, "completions/mean_length": 3941.796875, "completions/mean_terminated_length": 3941.796875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.395505890250206, "epoch": 2.581578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017378403572365642, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 436718803.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0892829895019531, "sampling/importance_sampling_ratio/min": 0.0028193816542625427, "sampling/sampling_logp_difference/max": 5.871237754821777, "sampling/sampling_logp_difference/mean": 0.1537029892206192, "step": 981 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3938744068145752, "epoch": 2.5842105263157897, "grad_norm": 0.0012539365561679006, "learning_rate": 1e-06, "loss": -0.0018, "step": 982 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39431141316890717, "epoch": 2.586842105263158, "grad_norm": 0.002117270603775978, "learning_rate": 1e-06, "loss": -0.0021, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3977039158344269, "epoch": 2.5894736842105264, "grad_norm": 0.02683359570801258, "learning_rate": 1e-06, "loss": 0.0105, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9633.0, "completions/max_terminated_length": 9633.0, "completions/mean_length": 3537.8203125, "completions/mean_terminated_length": 3537.8203125, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.38867008686065674, "epoch": 2.5921052631578947, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0026917937211692333, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 438947991.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0882879495620728, "sampling/importance_sampling_ratio/min": 0.0049593886360526085, "sampling/sampling_logp_difference/max": 5.3064727783203125, "sampling/sampling_logp_difference/mean": 0.15247535705566406, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3872314691543579, "epoch": 2.594736842105263, "grad_norm": 0.0026183947920799255, "learning_rate": 1e-06, "loss": -0.0033, "step": 986 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3923877775669098, "epoch": 2.5973684210526313, "grad_norm": 0.0018071632366627455, "learning_rate": 1e-06, "loss": -0.0034, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3928879499435425, "epoch": 2.6, "grad_norm": 0.002479348797351122, "learning_rate": 1e-06, "loss": 0.0099, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9992.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 3394.00390625, "completions/mean_terminated_length": 3394.00390625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.40198518335819244, "epoch": 2.6026315789473684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 441055481.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0912548303604126, "sampling/importance_sampling_ratio/min": 0.0014534969814121723, "sampling/sampling_logp_difference/max": 6.533782958984375, "sampling/sampling_logp_difference/mean": 0.15735076367855072, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40786635875701904, "epoch": 2.6052631578947367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4017523229122162, "epoch": 2.6078947368421055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4046560078859329, "epoch": 2.610526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8586.0, "completions/max_terminated_length": 8586.0, "completions/mean_length": 3238.623046875, "completions/mean_terminated_length": 3238.623046875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.39697568118572235, "epoch": 2.613157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 443112632.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0905907154083252, "sampling/importance_sampling_ratio/min": 0.0015394152142107487, "sampling/sampling_logp_difference/max": 6.476352691650391, "sampling/sampling_logp_difference/mean": 0.15642264485359192, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40408557653427124, "epoch": 2.6157894736842104, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40224938094615936, "epoch": 2.6184210526315788, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4040856659412384, "epoch": 2.6210526315789475, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9284.0, "completions/max_terminated_length": 9284.0, "completions/mean_length": 3822.86328125, "completions/mean_terminated_length": 3822.86328125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.40264029800891876, "epoch": 2.623684210526316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 445489266.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.089907169342041, "sampling/importance_sampling_ratio/min": 0.0027840405236929655, "sampling/sampling_logp_difference/max": 5.883852005004883, "sampling/sampling_logp_difference/mean": 0.1549544334411621, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39783909916877747, "epoch": 2.626315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39893200993537903, "epoch": 2.6289473684210525, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3957681357860565, "epoch": 2.6315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9513.0, "completions/max_terminated_length": 9513.0, "completions/mean_length": 3169.8984375, "completions/mean_terminated_length": 3169.8984375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.3932056128978729, "epoch": 2.6342105263157896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 447497374.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0898118019104004, "sampling/importance_sampling_ratio/min": 0.0034137920010834932, "sampling/sampling_logp_difference/max": 5.679931640625, "sampling/sampling_logp_difference/mean": 0.15418946743011475, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39636586606502533, "epoch": 2.636842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3966931700706482, "epoch": 2.639473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39644698798656464, "epoch": 2.6421052631578945, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11205.0, "completions/max_terminated_length": 11205.0, "completions/mean_length": 3744.111328125, "completions/mean_terminated_length": 3744.111328125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.4020548015832901, "epoch": 2.6447368421052633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 449827511.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.090795874595642, "sampling/importance_sampling_ratio/min": 0.00338667631149292, "sampling/sampling_logp_difference/max": 5.687906265258789, "sampling/sampling_logp_difference/mean": 0.15657268464565277, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40538157522678375, "epoch": 2.6473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40432411432266235, "epoch": 2.65, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3984994739294052, "epoch": 2.6526315789473687, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10429.0, "completions/max_terminated_length": 10429.0, "completions/mean_length": 3822.240234375, "completions/mean_terminated_length": 3822.240234375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.4025459736585617, "epoch": 2.655263157894737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 452178802.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0904866456985474, "sampling/importance_sampling_ratio/min": 0.002563636051490903, "sampling/sampling_logp_difference/max": 5.9663286209106445, "sampling/sampling_logp_difference/mean": 0.15611515939235687, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4009954035282135, "epoch": 2.6578947368421053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40344473719596863, "epoch": 2.6605263157894736, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39849069714546204, "epoch": 2.663157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10270.0, "completions/max_terminated_length": 10270.0, "completions/mean_length": 3808.224609375, "completions/mean_terminated_length": 3808.224609375, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.40151160955429077, "epoch": 2.6657894736842103, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 454516741.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0911831855773926, "sampling/importance_sampling_ratio/min": 0.0009332663030363619, "sampling/sampling_logp_difference/max": 6.97681999206543, "sampling/sampling_logp_difference/mean": 0.1565323770046234, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.402164027094841, "epoch": 2.668421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40544237196445465, "epoch": 2.6710526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40339864790439606, "epoch": 2.6736842105263157, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11035.0, "completions/max_terminated_length": 11035.0, "completions/mean_length": 3711.486328125, "completions/mean_terminated_length": 3711.486328125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "entropy": 0.403278112411499, "epoch": 2.6763157894736844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 456787806.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.09116792678833, "sampling/importance_sampling_ratio/min": 0.001347281038761139, "sampling/sampling_logp_difference/max": 6.60966682434082, "sampling/sampling_logp_difference/mean": 0.15670067071914673, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4027330130338669, "epoch": 2.6789473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40617962181568146, "epoch": 2.681578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4022398740053177, "epoch": 2.6842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9171.0, "completions/max_terminated_length": 9171.0, "completions/mean_length": 3794.228515625, "completions/mean_terminated_length": 3794.228515625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.40036024153232574, "epoch": 2.6868421052631577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 459151443.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.08983314037323, "sampling/importance_sampling_ratio/min": 0.003144488437101245, "sampling/sampling_logp_difference/max": 5.762104034423828, "sampling/sampling_logp_difference/mean": 0.1544870287179947, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3993856608867645, "epoch": 2.6894736842105265, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39674873650074005, "epoch": 2.692105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3959619402885437, "epoch": 2.694736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10469.0, "completions/max_terminated_length": 10469.0, "completions/mean_length": 3344.689453125, "completions/mean_terminated_length": 3344.689453125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.39411069452762604, "epoch": 2.6973684210526314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 461247540.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.089442491531372, "sampling/importance_sampling_ratio/min": 0.004360068589448929, "sampling/sampling_logp_difference/max": 5.435267448425293, "sampling/sampling_logp_difference/mean": 0.154719278216362, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3965809643268585, "epoch": 2.7, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39783811569213867, "epoch": 2.7026315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3959786891937256, "epoch": 2.705263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13936.0, "completions/max_terminated_length": 13936.0, "completions/mean_length": 3729.05078125, "completions/mean_terminated_length": 3729.05078125, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.3941676616668701, "epoch": 2.707894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 463560782.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0890140533447266, "sampling/importance_sampling_ratio/min": 0.0010527565609663725, "sampling/sampling_logp_difference/max": 6.8563432693481445, "sampling/sampling_logp_difference/mean": 0.15341177582740784, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39955778419971466, "epoch": 2.7105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39392608404159546, "epoch": 2.713157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39400890469551086, "epoch": 2.7157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8966.0, "completions/max_terminated_length": 8966.0, "completions/mean_length": 3685.279296875, "completions/mean_terminated_length": 3685.279296875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.3934840112924576, "epoch": 2.718421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 465859293.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0886006355285645, "sampling/importance_sampling_ratio/min": 0.0030870584305375814, "sampling/sampling_logp_difference/max": 5.780536651611328, "sampling/sampling_logp_difference/mean": 0.15367726981639862, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39532606303691864, "epoch": 2.7210526315789476, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3922531306743622, "epoch": 2.723684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3962577283382416, "epoch": 2.7263157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9200.0, "completions/max_terminated_length": 9200.0, "completions/mean_length": 3606.40234375, "completions/mean_terminated_length": 3606.40234375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.3948618918657303, "epoch": 2.7289473684210526, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002073246520012617, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 468131275.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0884604454040527, "sampling/importance_sampling_ratio/min": 2.980760882564937e-07, "sampling/sampling_logp_difference/max": 15.025917053222656, "sampling/sampling_logp_difference/mean": 0.15291887521743774, "step": 1037 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.39376483857631683, "epoch": 2.731578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0083, "step": 1038 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3918444663286209, "epoch": 2.734210526315789, "grad_norm": 0.0020972939673811197, "learning_rate": 1e-06, "loss": -0.0023, "step": 1039 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3908933848142624, "epoch": 2.736842105263158, "grad_norm": 0.0016518011689186096, "learning_rate": 1e-06, "loss": -0.0025, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8386.0, "completions/max_terminated_length": 8386.0, "completions/mean_length": 3537.03125, "completions/mean_terminated_length": 3537.03125, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "entropy": 0.40366119146347046, "epoch": 2.7394736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 470340027.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0909438133239746, "sampling/importance_sampling_ratio/min": 0.002487393096089363, "sampling/sampling_logp_difference/max": 5.996520042419434, "sampling/sampling_logp_difference/mean": 0.15648090839385986, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4051594138145447, "epoch": 2.7421052631578946, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4037921130657196, "epoch": 2.7447368421052634, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4029996395111084, "epoch": 2.7473684210526317, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11908.0, "completions/max_terminated_length": 11908.0, "completions/mean_length": 3488.17578125, "completions/mean_terminated_length": 3488.17578125, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.4027787744998932, "epoch": 2.75, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002077309414744377, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 472525109.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0904816389083862, "sampling/importance_sampling_ratio/min": 0.004140568431466818, "sampling/sampling_logp_difference/max": 5.486922264099121, "sampling/sampling_logp_difference/mean": 0.15609616041183472, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4004421830177307, "epoch": 2.7526315789473683, "grad_norm": 0.027714325115084648, "learning_rate": 1e-06, "loss": 0.0134, "step": 1046 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40462976694107056, "epoch": 2.7552631578947366, "grad_norm": 0.002029445255175233, "learning_rate": 1e-06, "loss": -0.0035, "step": 1047 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39682209491729736, "epoch": 2.7578947368421054, "grad_norm": 0.0026957904919981956, "learning_rate": 1e-06, "loss": -0.0035, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12487.0, "completions/max_terminated_length": 12487.0, "completions/mean_length": 3775.93359375, "completions/mean_terminated_length": 3775.93359375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.39076554775238037, "epoch": 2.7605263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02202165685594082, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 474874547.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.010406470857560635, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0883629322052002, "sampling/importance_sampling_ratio/min": 0.002206446137279272, "sampling/sampling_logp_difference/max": 6.116372108459473, "sampling/sampling_logp_difference/mean": 0.152793750166893, "step": 1049 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3945150077342987, "epoch": 2.763157894736842, "grad_norm": 0.004536837339401245, "learning_rate": 1e-06, "loss": -0.0062, "step": 1050 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39002805948257446, "epoch": 2.765789473684211, "grad_norm": 0.003886835416778922, "learning_rate": 1e-06, "loss": -0.0054, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3889981210231781, "epoch": 2.768421052631579, "grad_norm": 0.003946240525692701, "learning_rate": 1e-06, "loss": 0.0105, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15303.0, "completions/max_terminated_length": 15303.0, "completions/mean_length": 3061.92578125, "completions/mean_terminated_length": 3061.92578125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.3929746598005295, "epoch": 2.7710526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0022234539501369, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 476846221.0, "reward": 0.8986328840255737, "reward_std": 0.00546875037252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.014731390401721, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0882174968719482, "sampling/importance_sampling_ratio/min": 0.004161065444350243, "sampling/sampling_logp_difference/max": 5.4819841384887695, "sampling/sampling_logp_difference/mean": 0.15306146442890167, "step": 1053 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39126935601234436, "epoch": 2.7736842105263158, "grad_norm": 0.0012577084125950933, "learning_rate": 1e-06, "loss": -0.0019, "step": 1054 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3867139220237732, "epoch": 2.776315789473684, "grad_norm": 0.0017114996444433928, "learning_rate": 1e-06, "loss": -0.002, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38739894330501556, "epoch": 2.7789473684210524, "grad_norm": 0.0020850293803960085, "learning_rate": 1e-06, "loss": 0.0276, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9750.0, "completions/max_terminated_length": 9750.0, "completions/mean_length": 3504.279296875, "completions/mean_terminated_length": 3504.279296875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.39062733948230743, "epoch": 2.781578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019838823936879635, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 479045724.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0878138542175293, "sampling/importance_sampling_ratio/min": 0.0021842578426003456, "sampling/sampling_logp_difference/max": 6.126479148864746, "sampling/sampling_logp_difference/mean": 0.15188148617744446, "step": 1057 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3848903924226761, "epoch": 2.7842105263157895, "grad_norm": 0.002210680628195405, "learning_rate": 1e-06, "loss": -0.0021, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38857661187648773, "epoch": 2.786842105263158, "grad_norm": 0.0022050533443689346, "learning_rate": 1e-06, "loss": 0.0031, "step": 1059 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38466736674308777, "epoch": 2.7894736842105265, "grad_norm": 0.0024165017530322075, "learning_rate": 1e-06, "loss": -0.0027, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8889.0, "completions/mean_length": 3234.724609375, "completions/mean_terminated_length": 3208.9921875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.3732607364654541, "epoch": 2.792105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0015201744390651584, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 481133743.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0841476917266846, "sampling/importance_sampling_ratio/min": 0.0015334340278059244, "sampling/sampling_logp_difference/max": 6.480245590209961, "sampling/sampling_logp_difference/mean": 0.14670062065124512, "step": 1061 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3752097487449646, "epoch": 2.794736842105263, "grad_norm": 0.0011991435894742608, "learning_rate": 1e-06, "loss": -0.001, "step": 1062 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37626224756240845, "epoch": 2.7973684210526315, "grad_norm": 0.031116753816604614, "learning_rate": 1e-06, "loss": 0.0299, "step": 1063 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.37669388949871063, "epoch": 2.8, "grad_norm": 0.0010285305324941874, "learning_rate": 1e-06, "loss": -0.0013, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10935.0, "completions/max_terminated_length": 10935.0, "completions/mean_length": 3497.708984375, "completions/mean_terminated_length": 3497.708984375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.38186049461364746, "epoch": 2.8026315789473686, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024529274087399244, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 483339226.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0863862037658691, "sampling/importance_sampling_ratio/min": 0.001768267247825861, "sampling/sampling_logp_difference/max": 6.33775520324707, "sampling/sampling_logp_difference/mean": 0.14989566802978516, "step": 1065 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38205549120903015, "epoch": 2.805263157894737, "grad_norm": 0.001572508248500526, "learning_rate": 1e-06, "loss": -0.0027, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3872770071029663, "epoch": 2.807894736842105, "grad_norm": 0.002107608364894986, "learning_rate": 1e-06, "loss": -0.0023, "step": 1067 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.38262926042079926, "epoch": 2.8105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0192, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9636.0, "completions/max_terminated_length": 9636.0, "completions/mean_length": 3024.078125, "completions/mean_terminated_length": 3024.078125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.3894702047109604, "epoch": 2.8131578947368423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 485276162.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.087268352508545, "sampling/importance_sampling_ratio/min": 0.004055540543049574, "sampling/sampling_logp_difference/max": 5.507671356201172, "sampling/sampling_logp_difference/mean": 0.1517965942621231, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3911631107330322, "epoch": 2.8157894736842106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38636232912540436, "epoch": 2.818421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38694897294044495, "epoch": 2.8210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10683.0, "completions/max_terminated_length": 10683.0, "completions/mean_length": 3603.876953125, "completions/mean_terminated_length": 3603.876953125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.3905184119939804, "epoch": 2.8236842105263156, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023312463890761137, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 487526499.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0872595310211182, "sampling/importance_sampling_ratio/min": 0.003434377256780863, "sampling/sampling_logp_difference/max": 5.673919677734375, "sampling/sampling_logp_difference/mean": 0.15198805928230286, "step": 1073 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3862711638212204, "epoch": 2.8263157894736843, "grad_norm": 0.0015193783910945058, "learning_rate": 1e-06, "loss": -0.0022, "step": 1074 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3876970112323761, "epoch": 2.8289473684210527, "grad_norm": 0.0016826426144689322, "learning_rate": 1e-06, "loss": -0.0024, "step": 1075 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.38463014364242554, "epoch": 2.831578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0067, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13098.0, "completions/max_terminated_length": 13098.0, "completions/mean_length": 2989.75390625, "completions/mean_terminated_length": 2989.75390625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.3805331587791443, "epoch": 2.8342105263157897, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003820231882855296, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 489433029.0, "reward": 0.8961914777755737, "reward_std": 0.01201616507023573, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0856754779815674, "sampling/importance_sampling_ratio/min": 0.00398477166891098, "sampling/sampling_logp_difference/max": 5.525275230407715, "sampling/sampling_logp_difference/mean": 0.14967715740203857, "step": 1077 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.38402311503887177, "epoch": 2.836842105263158, "grad_norm": 0.0025738030672073364, "learning_rate": 1e-06, "loss": 0.0145, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38062965869903564, "epoch": 2.8394736842105264, "grad_norm": 0.003516597207635641, "learning_rate": 1e-06, "loss": 0.0122, "step": 1079 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37976762652397156, "epoch": 2.8421052631578947, "grad_norm": 0.0036118386778980494, "learning_rate": 1e-06, "loss": -0.0053, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11601.0, "completions/max_terminated_length": 11601.0, "completions/mean_length": 3507.228515625, "completions/mean_terminated_length": 3507.228515625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.38242435455322266, "epoch": 2.844736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 491653210.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0866085290908813, "sampling/importance_sampling_ratio/min": 0.00325448508374393, "sampling/sampling_logp_difference/max": 5.727721214294434, "sampling/sampling_logp_difference/mean": 0.14992788434028625, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38511645793914795, "epoch": 2.8473684210526313, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38135167956352234, "epoch": 2.85, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38586321473121643, "epoch": 2.8526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8570.0, "completions/max_terminated_length": 8570.0, "completions/mean_length": 3092.357421875, "completions/mean_terminated_length": 3092.357421875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.39128880202770233, "epoch": 2.8552631578947367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 493627953.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0883841514587402, "sampling/importance_sampling_ratio/min": 0.0019409949891269207, "sampling/sampling_logp_difference/max": 6.24455451965332, "sampling/sampling_logp_difference/mean": 0.152787446975708, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39037515223026276, "epoch": 2.8578947368421055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39064259827136993, "epoch": 2.860526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39653976261615753, "epoch": 2.863157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11993.0, "completions/max_terminated_length": 11993.0, "completions/mean_length": 3391.83203125, "completions/mean_terminated_length": 3391.83203125, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.3848806172609329, "epoch": 2.8657894736842104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 495771387.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0867500305175781, "sampling/importance_sampling_ratio/min": 0.002327575348317623, "sampling/sampling_logp_difference/max": 6.062928199768066, "sampling/sampling_logp_difference/mean": 0.1505243182182312, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38164520263671875, "epoch": 2.8684210526315788, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3870142698287964, "epoch": 2.8710526315789475, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38321560621261597, "epoch": 2.873684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9738.0, "completions/max_terminated_length": 9738.0, "completions/mean_length": 3065.369140625, "completions/mean_terminated_length": 3065.369140625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "entropy": 0.3887074589729309, "epoch": 2.876315789473684, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001711332588456571, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 497747832.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.088169813156128, "sampling/importance_sampling_ratio/min": 0.002554316306486726, "sampling/sampling_logp_difference/max": 5.969970703125, "sampling/sampling_logp_difference/mean": 0.15200534462928772, "step": 1093 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.38776396214962006, "epoch": 2.8789473684210525, "grad_norm": 0.0014681449392810464, "learning_rate": 1e-06, "loss": 0.0176, "step": 1094 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3874232918024063, "epoch": 2.8815789473684212, "grad_norm": 0.0010686359601095319, "learning_rate": 1e-06, "loss": -0.002, "step": 1095 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3920523375272751, "epoch": 2.8842105263157896, "grad_norm": 0.0014021456008777022, "learning_rate": 1e-06, "loss": -0.0016, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11043.0, "completions/max_terminated_length": 11043.0, "completions/mean_length": 3295.080078125, "completions/mean_terminated_length": 3295.080078125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.3924073129892349, "epoch": 2.886842105263158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 499818689.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.088860273361206, "sampling/importance_sampling_ratio/min": 0.0029728407971560955, "sampling/sampling_logp_difference/max": 5.8182373046875, "sampling/sampling_logp_difference/mean": 0.15280401706695557, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39159420132637024, "epoch": 2.889473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3905697762966156, "epoch": 2.8921052631578945, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3930809199810028, "epoch": 2.8947368421052633, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11985.0, "completions/max_terminated_length": 11985.0, "completions/mean_length": 3453.9375, "completions/mean_terminated_length": 3453.9375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.38902023434638977, "epoch": 2.8973684210526316, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0030925211030989885, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 501978561.0, "reward": 0.8980469107627869, "reward_std": 0.007812500931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0882786512374878, "sampling/importance_sampling_ratio/min": 6.25175453023985e-05, "sampling/sampling_logp_difference/max": 9.680063247680664, "sampling/sampling_logp_difference/mean": 0.15266753733158112, "step": 1101 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3919554203748703, "epoch": 2.9, "grad_norm": 0.03331305831670761, "learning_rate": 1e-06, "loss": 0.018, "step": 1102 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.39228980243206024, "epoch": 2.9026315789473687, "grad_norm": 0.003138504223898053, "learning_rate": 1e-06, "loss": -0.0061, "step": 1103 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3881448358297348, "epoch": 2.905263157894737, "grad_norm": 0.0016439164755865932, "learning_rate": 1e-06, "loss": -0.0054, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11233.0, "completions/max_terminated_length": 11233.0, "completions/mean_length": 3375.2734375, "completions/mean_terminated_length": 3375.2734375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.38365335762500763, "epoch": 2.9078947368421053, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002679168712347746, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 504121517.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.086879014968872, "sampling/importance_sampling_ratio/min": 0.001467368914745748, "sampling/sampling_logp_difference/max": 6.524284362792969, "sampling/sampling_logp_difference/mean": 0.14978712797164917, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38595905900001526, "epoch": 2.9105263157894736, "grad_norm": 0.002241633366793394, "learning_rate": 1e-06, "loss": -0.0027, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38653893768787384, "epoch": 2.913157894736842, "grad_norm": 0.030706673860549927, "learning_rate": 1e-06, "loss": 0.0169, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3862478733062744, "epoch": 2.9157894736842103, "grad_norm": 0.0024082071613520384, "learning_rate": 1e-06, "loss": -0.0031, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11098.0, "completions/max_terminated_length": 11098.0, "completions/mean_length": 3539.408203125, "completions/mean_terminated_length": 3539.408203125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.3952454775571823, "epoch": 2.918421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 506322142.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0895371437072754, "sampling/importance_sampling_ratio/min": 0.0011138333939015865, "sampling/sampling_logp_difference/max": 6.799947738647461, "sampling/sampling_logp_difference/mean": 0.15442010760307312, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3919040858745575, "epoch": 2.9210526315789473, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39629602432250977, "epoch": 2.9236842105263157, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39989764988422394, "epoch": 2.9263157894736844, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15692.0, "completions/max_terminated_length": 15692.0, "completions/mean_length": 3618.439453125, "completions/mean_terminated_length": 3618.439453125, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.38937245309352875, "epoch": 2.9289473684210527, "frac_reward_zero_std": 0.9375, "grad_norm": 0.009127617813646793, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 508570431.0, "reward": 0.8896484375, "reward_std": 0.016266435384750366, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9967447519302368, "rewards/symbolic_reward_partial_score/std": 0.02931104227900505, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0880897045135498, "sampling/importance_sampling_ratio/min": 0.0024456975515931845, "sampling/sampling_logp_difference/max": 6.013424873352051, "sampling/sampling_logp_difference/mean": 0.15183305740356445, "step": 1113 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3833869844675064, "epoch": 2.931578947368421, "grad_norm": 0.02626909874379635, "learning_rate": 1e-06, "loss": 0.017, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39082400500774384, "epoch": 2.9342105263157894, "grad_norm": 0.010098946280777454, "learning_rate": 1e-06, "loss": 0.0018, "step": 1115 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3872024714946747, "epoch": 2.9368421052631577, "grad_norm": 0.01311742514371872, "learning_rate": 1e-06, "loss": -0.0073, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9458.0, "completions/max_terminated_length": 9458.0, "completions/mean_length": 3803.9140625, "completions/mean_terminated_length": 3803.9140625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.388803631067276, "epoch": 2.9394736842105265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 510937363.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0880706310272217, "sampling/importance_sampling_ratio/min": 0.004354209639132023, "sampling/sampling_logp_difference/max": 5.436612129211426, "sampling/sampling_logp_difference/mean": 0.15157127380371094, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3881782591342926, "epoch": 2.942105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38570642471313477, "epoch": 2.944736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38715243339538574, "epoch": 2.9473684210526314, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11819.0, "completions/max_terminated_length": 11819.0, "completions/mean_length": 3962.5390625, "completions/mean_terminated_length": 3962.5390625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "entropy": 0.3953952044248581, "epoch": 2.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 513382023.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0895607471466064, "sampling/importance_sampling_ratio/min": 0.003097308799624443, "sampling/sampling_logp_difference/max": 5.7772216796875, "sampling/sampling_logp_difference/mean": 0.15326669812202454, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39269375801086426, "epoch": 2.9526315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3949871361255646, "epoch": 2.955263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38794559240341187, "epoch": 2.957894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11825.0, "completions/max_terminated_length": 11825.0, "completions/mean_length": 3837.962890625, "completions/mean_terminated_length": 3837.962890625, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "entropy": 0.38587941229343414, "epoch": 2.9605263157894735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 515758548.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.087846279144287, "sampling/importance_sampling_ratio/min": 0.0011745793744921684, "sampling/sampling_logp_difference/max": 6.746845245361328, "sampling/sampling_logp_difference/mean": 0.151071697473526, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3901519477367401, "epoch": 2.963157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38443879783153534, "epoch": 2.9657894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38520263135433197, "epoch": 2.968421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9943.0, "completions/max_terminated_length": 9943.0, "completions/mean_length": 3948.1875, "completions/mean_terminated_length": 3948.1875, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "entropy": 0.39092546701431274, "epoch": 2.9710526315789476, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002224680967628956, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 518183828.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0894312858581543, "sampling/importance_sampling_ratio/min": 0.0028497930616140366, "sampling/sampling_logp_difference/max": 5.860508918762207, "sampling/sampling_logp_difference/mean": 0.1534252017736435, "step": 1129 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.395298570394516, "epoch": 2.973684210526316, "grad_norm": 0.002213385421782732, "learning_rate": 1e-06, "loss": -0.002, "step": 1130 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39260394871234894, "epoch": 2.9763157894736842, "grad_norm": 0.002191609935835004, "learning_rate": 1e-06, "loss": 0.0071, "step": 1131 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3927992433309555, "epoch": 2.9789473684210526, "grad_norm": 0.0019342908635735512, "learning_rate": 1e-06, "loss": -0.0027, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13554.0, "completions/max_terminated_length": 13554.0, "completions/mean_length": 4097.73046875, "completions/mean_terminated_length": 4097.73046875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.3770846426486969, "epoch": 2.981578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0038568079471588135, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 520719882.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0863037109375, "sampling/importance_sampling_ratio/min": 0.0014548948965966702, "sampling/sampling_logp_difference/max": 6.5328216552734375, "sampling/sampling_logp_difference/mean": 0.14874295890331268, "step": 1133 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.37830521166324615, "epoch": 2.984210526315789, "grad_norm": 0.0019566272385418415, "learning_rate": 1e-06, "loss": -0.0053, "step": 1134 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.37764330208301544, "epoch": 2.986842105263158, "grad_norm": 0.002765793353319168, "learning_rate": 1e-06, "loss": -0.0053, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37828999757766724, "epoch": 2.9894736842105263, "grad_norm": 0.024250797927379608, "learning_rate": 1e-06, "loss": 0.0332, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13171.0, "completions/max_terminated_length": 13171.0, "completions/mean_length": 3725.427734375, "completions/mean_terminated_length": 3725.427734375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.3969728499650955, "epoch": 2.9921052631578946, "frac_reward_zero_std": 0.96875, "grad_norm": 0.034783974289894104, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 523034277.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0898430347442627, "sampling/importance_sampling_ratio/min": 0.0024834393989294767, "sampling/sampling_logp_difference/max": 5.998110771179199, "sampling/sampling_logp_difference/mean": 0.15422195196151733, "step": 1137 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3955516815185547, "epoch": 2.9947368421052634, "grad_norm": 0.0021734628826379776, "learning_rate": 1e-06, "loss": -0.0038, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39606884121894836, "epoch": 2.9973684210526317, "grad_norm": 0.0032223237212747335, "learning_rate": 1e-06, "loss": -0.0048, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39494186639785767, "epoch": 3.0, "grad_norm": 0.0027688953559845686, "learning_rate": 1e-06, "loss": -0.0036, "step": 1140 }, { "epoch": 3.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 8408.53125, "eval_completions/max_terminated_length": 8408.53125, "eval_completions/mean_length": 2759.1025390625, "eval_completions/mean_terminated_length": 2759.1025390625, "eval_completions/min_length": 497.25, "eval_completions/min_terminated_length": 497.25, "eval_entropy": 0.38839732483029366, "eval_frac_reward_zero_std": 0.98828125, "eval_loss": 0.0005080666160210967, "eval_num_tokens": 523034277.0, "eval_reward": 0.8994934447109699, "eval_reward_std": 0.0020263672340661287, "eval_rewards/progression_diversity/mean": 0.0, "eval_rewards/progression_diversity/std": 0.0, "eval_rewards/symbolic_reward_accuracy/mean": 0.999267578125, "eval_rewards/symbolic_reward_accuracy/std": 0.008286407450214028, "eval_rewards/symbolic_reward_partial_score/mean": 0.9997762031853199, "eval_rewards/symbolic_reward_partial_score/std": 0.0025319578999187797, "eval_rewards/tag_count_reward/mean": 0.0, "eval_rewards/tag_count_reward/std": 0.0, "eval_runtime": 4302.9845, "eval_samples_per_second": 0.058, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0872314274311066, "eval_sampling/importance_sampling_ratio/min": 0.005545041766822578, "eval_sampling/sampling_logp_difference/max": 5.535137459635735, "eval_sampling/sampling_logp_difference/mean": 0.1511916839517653, "eval_steps_per_second": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16347.0, "completions/max_terminated_length": 16347.0, "completions/mean_length": 4908.3671875, "completions/mean_terminated_length": 4908.3671875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.3952754735946655, "epoch": 3.0026315789473683, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03161121532320976, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 525951169.0, "reward": 0.8936035633087158, "reward_std": 0.0185397919267416, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9982095956802368, "rewards/symbolic_reward_partial_score/std": 0.018344920128583908, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902360677719116, "sampling/importance_sampling_ratio/min": 0.002814969979226589, "sampling/sampling_logp_difference/max": 5.872803688049316, "sampling/sampling_logp_difference/mean": 0.15388977527618408, "step": 1141 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39411111176013947, "epoch": 3.0052631578947366, "grad_norm": 0.006604715716093779, "learning_rate": 1e-06, "loss": -0.0016, "step": 1142 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.39136138558387756, "epoch": 3.0078947368421054, "grad_norm": 0.016117166727781296, "learning_rate": 1e-06, "loss": 0.0246, "step": 1143 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40034130215644836, "epoch": 3.0105263157894737, "grad_norm": 0.006716945208609104, "learning_rate": 1e-06, "loss": -0.0191, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15182.0, "completions/max_terminated_length": 15182.0, "completions/mean_length": 4210.9609375, "completions/mean_terminated_length": 4210.9609375, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "entropy": 0.3991387039422989, "epoch": 3.013157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 528514157.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0910050868988037, "sampling/importance_sampling_ratio/min": 0.0006721456302329898, "sampling/sampling_logp_difference/max": 7.305035591125488, "sampling/sampling_logp_difference/mean": 0.1553923487663269, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39670225977897644, "epoch": 3.0157894736842104, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39703211188316345, "epoch": 3.018421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39862172305583954, "epoch": 3.0210526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13730.0, "completions/max_terminated_length": 13730.0, "completions/mean_length": 4354.357421875, "completions/mean_terminated_length": 4354.357421875, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "entropy": 0.39791300892829895, "epoch": 3.0236842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.015769112855196, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 531155236.0, "reward": 0.8957031965255737, "reward_std": 0.013645137660205364, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9973958134651184, "rewards/symbolic_reward_partial_score/std": 0.03750407695770264, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0895049571990967, "sampling/importance_sampling_ratio/min": 0.001879677176475525, "sampling/sampling_logp_difference/max": 6.276655197143555, "sampling/sampling_logp_difference/mean": 0.1535317599773407, "step": 1149 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3920377194881439, "epoch": 3.026315789473684, "grad_norm": 0.004405778832733631, "learning_rate": 1e-06, "loss": 0.0057, "step": 1150 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3932999074459076, "epoch": 3.028947368421053, "grad_norm": 0.004050300922244787, "learning_rate": 1e-06, "loss": -0.0094, "step": 1151 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3923886567354202, "epoch": 3.031578947368421, "grad_norm": 0.0026641665026545525, "learning_rate": 1e-06, "loss": 0.0032, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14438.0, "completions/max_terminated_length": 14438.0, "completions/mean_length": 4586.24609375, "completions/mean_terminated_length": 4586.24609375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.3994075357913971, "epoch": 3.0342105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0027698499616235495, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 533899522.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0905743837356567, "sampling/importance_sampling_ratio/min": 0.0013881644699722528, "sampling/sampling_logp_difference/max": 6.57977294921875, "sampling/sampling_logp_difference/mean": 0.15558269619941711, "step": 1153 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39899948239326477, "epoch": 3.036842105263158, "grad_norm": 0.032868605107069016, "learning_rate": 1e-06, "loss": 0.0192, "step": 1154 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3988516181707382, "epoch": 3.039473684210526, "grad_norm": 0.001777838682755828, "learning_rate": 1e-06, "loss": -0.003, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3992135673761368, "epoch": 3.042105263157895, "grad_norm": 0.0028097599279135466, "learning_rate": 1e-06, "loss": -0.0036, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15838.0, "completions/max_terminated_length": 15838.0, "completions/mean_length": 4903.33203125, "completions/mean_terminated_length": 4903.33203125, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "entropy": 0.3982274830341339, "epoch": 3.044736842105263, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0031633598264306784, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 536821356.0, "reward": 0.897216796875, "reward_std": 0.01113281399011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99853515625, "rewards/symbolic_reward_partial_score/std": 0.024685947224497795, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0904779434204102, "sampling/importance_sampling_ratio/min": 0.002137739211320877, "sampling/sampling_logp_difference/max": 6.148006439208984, "sampling/sampling_logp_difference/mean": 0.15505161881446838, "step": 1157 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.39778588712215424, "epoch": 3.0473684210526315, "grad_norm": 0.032787472009658813, "learning_rate": 1e-06, "loss": 0.0103, "step": 1158 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3961934745311737, "epoch": 3.05, "grad_norm": 0.003081651171669364, "learning_rate": 1e-06, "loss": 0.0156, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3992573320865631, "epoch": 3.0526315789473686, "grad_norm": 0.002976259682327509, "learning_rate": 1e-06, "loss": -0.0045, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11893.0, "completions/max_terminated_length": 11893.0, "completions/mean_length": 4495.21484375, "completions/mean_terminated_length": 4495.21484375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.40479637682437897, "epoch": 3.055263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021845384035259485, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 539517690.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0916180610656738, "sampling/importance_sampling_ratio/min": 0.003239621175453067, "sampling/sampling_logp_difference/max": 5.732298851013184, "sampling/sampling_logp_difference/mean": 0.15671885013580322, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40252718329429626, "epoch": 3.057894736842105, "grad_norm": 0.002779013942927122, "learning_rate": 1e-06, "loss": -0.0035, "step": 1162 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40390895307064056, "epoch": 3.0605263157894735, "grad_norm": 0.0017610761569812894, "learning_rate": 1e-06, "loss": -0.0035, "step": 1163 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.405286967754364, "epoch": 3.0631578947368423, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0185, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13544.0, "completions/max_terminated_length": 13544.0, "completions/mean_length": 5190.345703125, "completions/mean_terminated_length": 5190.345703125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "entropy": 0.3964738994836807, "epoch": 3.0657894736842106, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021062421146780252, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 542584811.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0900318622589111, "sampling/importance_sampling_ratio/min": 0.0018985614879056811, "sampling/sampling_logp_difference/max": 6.266658782958984, "sampling/sampling_logp_difference/mean": 0.15438339114189148, "step": 1165 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39546899497509, "epoch": 3.068421052631579, "grad_norm": 0.0021811083424836397, "learning_rate": 1e-06, "loss": -0.0029, "step": 1166 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39964383840560913, "epoch": 3.0710526315789473, "grad_norm": 0.0017049607122316957, "learning_rate": 1e-06, "loss": -0.0031, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39255666732788086, "epoch": 3.0736842105263156, "grad_norm": 0.0026875652838498354, "learning_rate": 1e-06, "loss": 0.0032, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14544.0, "completions/max_terminated_length": 14544.0, "completions/mean_length": 4622.01171875, "completions/mean_terminated_length": 4622.01171875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.40334227681159973, "epoch": 3.0763157894736843, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034430017694830894, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 545327057.0, "reward": 0.896484375, "reward_std": 0.01406250149011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0909156799316406, "sampling/importance_sampling_ratio/min": 0.0021885638125240803, "sampling/sampling_logp_difference/max": 6.124509811401367, "sampling/sampling_logp_difference/mean": 0.15607982873916626, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4035855084657669, "epoch": 3.0789473684210527, "grad_norm": 0.003756597638130188, "learning_rate": 1e-06, "loss": -0.0069, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40055006742477417, "epoch": 3.081578947368421, "grad_norm": 0.003091114806011319, "learning_rate": 1e-06, "loss": -0.0065, "step": 1171 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3967365473508835, "epoch": 3.0842105263157893, "grad_norm": 0.002782898722216487, "learning_rate": 1e-06, "loss": 0.0037, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11884.0, "completions/max_terminated_length": 11884.0, "completions/mean_length": 5286.150390625, "completions/mean_terminated_length": 5286.150390625, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "entropy": 0.4108661264181137, "epoch": 3.086842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0036462058778852224, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 548426846.0, "reward": 0.896484375, "reward_std": 0.01406250149011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0928075313568115, "sampling/importance_sampling_ratio/min": 0.0020160051062703133, "sampling/sampling_logp_difference/max": 6.206637382507324, "sampling/sampling_logp_difference/mean": 0.15911804139614105, "step": 1173 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40885426104068756, "epoch": 3.0894736842105264, "grad_norm": 0.0025966274552047253, "learning_rate": 1e-06, "loss": -0.0047, "step": 1174 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41025181114673615, "epoch": 3.0921052631578947, "grad_norm": 0.0026661918964236975, "learning_rate": 1e-06, "loss": 0.0072, "step": 1175 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40951915085315704, "epoch": 3.094736842105263, "grad_norm": 0.002943785395473242, "learning_rate": 1e-06, "loss": -0.005, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16107.0, "completions/mean_length": 5539.591796875, "completions/mean_terminated_length": 5497.06494140625, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "entropy": 0.3915517330169678, "epoch": 3.0973684210526318, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0050131031312048435, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 551668589.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0885796546936035, "sampling/importance_sampling_ratio/min": 0.0021795593202114105, "sampling/sampling_logp_difference/max": 6.128632545471191, "sampling/sampling_logp_difference/mean": 0.15259742736816406, "step": 1177 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.391658753156662, "epoch": 3.1, "grad_norm": 0.003974818158894777, "learning_rate": 1e-06, "loss": -0.0122, "step": 1178 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.390282541513443, "epoch": 3.1026315789473684, "grad_norm": 0.004405332263559103, "learning_rate": 1e-06, "loss": 0.0001, "step": 1179 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.39275291562080383, "epoch": 3.1052631578947367, "grad_norm": 0.0030568824149668217, "learning_rate": 1e-06, "loss": -0.013, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14841.0, "completions/mean_length": 5326.18359375, "completions/mean_terminated_length": 5304.5439453125, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "entropy": 0.40120118856430054, "epoch": 3.1078947368421055, "frac_reward_zero_std": 0.75, "grad_norm": 0.007063650991767645, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 554800907.0, "reward": 0.8857422471046448, "reward_std": 0.05703125521540642, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.984375, "rewards/symbolic_reward_partial_score/std": 0.12414088100194931, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0900720357894897, "sampling/importance_sampling_ratio/min": 0.0028123431839048862, "sampling/sampling_logp_difference/max": 5.873737335205078, "sampling/sampling_logp_difference/mean": 0.15505918860435486, "step": 1181 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.39862124621868134, "epoch": 3.110526315789474, "grad_norm": 0.043328363448381424, "learning_rate": 1e-06, "loss": -0.0026, "step": 1182 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.4000040143728256, "epoch": 3.113157894736842, "grad_norm": 0.007014925591647625, "learning_rate": 1e-06, "loss": 0.0039, "step": 1183 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.39822492003440857, "epoch": 3.1157894736842104, "grad_norm": 0.03283928334712982, "learning_rate": 1e-06, "loss": 0.0215, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16008.0, "completions/mean_length": 5722.03125, "completions/mean_terminated_length": 5701.16650390625, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.40730342268943787, "epoch": 3.1184210526315788, "frac_reward_zero_std": 0.875, "grad_norm": 0.005404895171523094, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 558137723.0, "reward": 0.8953613042831421, "reward_std": 0.01855468936264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, "rewards/symbolic_reward_partial_score/std": 0.047823768109083176, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.09215247631073, "sampling/importance_sampling_ratio/min": 0.0019525951938703656, "sampling/sampling_logp_difference/max": 6.238595962524414, "sampling/sampling_logp_difference/mean": 0.15795490145683289, "step": 1185 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4078136682510376, "epoch": 3.1210526315789475, "grad_norm": 0.004979324992746115, "learning_rate": 1e-06, "loss": -0.0141, "step": 1186 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41222524642944336, "epoch": 3.123684210526316, "grad_norm": 0.004098959732800722, "learning_rate": 1e-06, "loss": 0.0411, "step": 1187 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.41124488413333893, "epoch": 3.126315789473684, "grad_norm": 0.005247786641120911, "learning_rate": 1e-06, "loss": 0.0176, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15224.0, "completions/max_terminated_length": 15224.0, "completions/mean_length": 4540.048828125, "completions/mean_terminated_length": 4540.048828125, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "entropy": 0.4028102457523346, "epoch": 3.1289473684210525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 560871028.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.091124415397644, "sampling/importance_sampling_ratio/min": 0.0032635980751365423, "sampling/sampling_logp_difference/max": 5.7249250411987305, "sampling/sampling_logp_difference/mean": 0.15638495981693268, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40638749301433563, "epoch": 3.1315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40416190028190613, "epoch": 3.1342105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40329709649086, "epoch": 3.136842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16366.0, "completions/mean_length": 4519.5859375, "completions/mean_terminated_length": 4473.05908203125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.4031980484724045, "epoch": 3.139473684210526, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004261158872395754, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 563597024.0, "reward": 0.8932617902755737, "reward_std": 0.017367906868457794, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9944661855697632, "rewards/symbolic_reward_partial_score/std": 0.06774870306253433, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0918298959732056, "sampling/importance_sampling_ratio/min": 0.003203378291800618, "sampling/sampling_logp_difference/max": 5.743549346923828, "sampling/sampling_logp_difference/mean": 0.15727800130844116, "step": 1193 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4069235473871231, "epoch": 3.1421052631578945, "grad_norm": 0.004299469292163849, "learning_rate": 1e-06, "loss": 0.0058, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4075898230075836, "epoch": 3.1447368421052633, "grad_norm": 0.014656825922429562, "learning_rate": 1e-06, "loss": 0.0243, "step": 1195 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.40768302977085114, "epoch": 3.1473684210526316, "grad_norm": 0.005062122363597155, "learning_rate": 1e-06, "loss": -0.0124, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11981.0, "completions/max_terminated_length": 11981.0, "completions/mean_length": 3935.923828125, "completions/mean_terminated_length": 3935.923828125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.40527983009815216, "epoch": 3.15, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002314778044819832, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 566016025.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0924803018569946, "sampling/importance_sampling_ratio/min": 0.002587622031569481, "sampling/sampling_logp_difference/max": 5.9570159912109375, "sampling/sampling_logp_difference/mean": 0.15855225920677185, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40624023973941803, "epoch": 3.1526315789473682, "grad_norm": 0.0021940208971500397, "learning_rate": 1e-06, "loss": -0.0029, "step": 1198 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40848295390605927, "epoch": 3.155263157894737, "grad_norm": 0.0015026667388156056, "learning_rate": 1e-06, "loss": -0.0032, "step": 1199 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4132244735956192, "epoch": 3.1578947368421053, "grad_norm": 0.0017344997031614184, "learning_rate": 1e-06, "loss": 0.0094, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12943.0, "completions/mean_length": 4448.17578125, "completions/mean_terminated_length": 4377.8271484375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.41170305013656616, "epoch": 3.1605263157894736, "frac_reward_zero_std": 0.9375, "grad_norm": 0.021235667169094086, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 568699283.0, "reward": 0.8946289420127869, "reward_std": 0.017260458320379257, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093801498413086, "sampling/importance_sampling_ratio/min": 0.005366542376577854, "sampling/sampling_logp_difference/max": 5.227571487426758, "sampling/sampling_logp_difference/mean": 0.16037949919700623, "step": 1201 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41230031847953796, "epoch": 3.163157894736842, "grad_norm": 0.0037013026885688305, "learning_rate": 1e-06, "loss": -0.0104, "step": 1202 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4122001975774765, "epoch": 3.1657894736842107, "grad_norm": 0.005195175297558308, "learning_rate": 1e-06, "loss": 0.0215, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41611844301223755, "epoch": 3.168421052631579, "grad_norm": 0.003911558073014021, "learning_rate": 1e-06, "loss": 0.016, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15828.0, "completions/mean_length": 3936.36328125, "completions/mean_terminated_length": 3887.54931640625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.4056967943906784, "epoch": 3.1710526315789473, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005454949103295803, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 571097837.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0913147926330566, "sampling/importance_sampling_ratio/min": 0.0032484508119523525, "sampling/sampling_logp_difference/max": 5.72957706451416, "sampling/sampling_logp_difference/mean": 0.1577095091342926, "step": 1205 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40130531787872314, "epoch": 3.1736842105263157, "grad_norm": 0.03140510246157646, "learning_rate": 1e-06, "loss": 0.0233, "step": 1206 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4021826386451721, "epoch": 3.1763157894736844, "grad_norm": 0.0037210388109087944, "learning_rate": 1e-06, "loss": -0.0092, "step": 1207 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40941959619522095, "epoch": 3.1789473684210527, "grad_norm": 0.003432672703638673, "learning_rate": 1e-06, "loss": 0.0244, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13464.0, "completions/max_terminated_length": 13464.0, "completions/mean_length": 3987.0625, "completions/mean_terminated_length": 3987.0625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.4114469289779663, "epoch": 3.181578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0032412256114184856, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 573522989.0, "reward": 0.8974121809005737, "reward_std": 0.01035156287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9991861581802368, "rewards/symbolic_reward_partial_score/std": 0.013266698457300663, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093617558479309, "sampling/importance_sampling_ratio/min": 0.0019764334429055452, "sampling/sampling_logp_difference/max": 6.226461410522461, "sampling/sampling_logp_difference/mean": 0.1599876433610916, "step": 1209 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.410616010427475, "epoch": 3.1842105263157894, "grad_norm": 0.003006654093042016, "learning_rate": 1e-06, "loss": 0.0181, "step": 1210 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4118806719779968, "epoch": 3.1868421052631577, "grad_norm": 0.0356738306581974, "learning_rate": 1e-06, "loss": 0.02, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.416130855679512, "epoch": 3.1894736842105265, "grad_norm": 0.0039194184355437756, "learning_rate": 1e-06, "loss": -0.0081, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16074.0, "completions/mean_length": 4731.541015625, "completions/mean_terminated_length": 4708.73779296875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.42329737544059753, "epoch": 3.192105263157895, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003957363776862621, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 576351010.0, "reward": 0.8967773914337158, "reward_std": 0.012890626676380634, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0966722965240479, "sampling/importance_sampling_ratio/min": 0.0017456674249842763, "sampling/sampling_logp_difference/max": 6.350618362426758, "sampling/sampling_logp_difference/mean": 0.16366060078144073, "step": 1213 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4240821599960327, "epoch": 3.194736842105263, "grad_norm": 0.002998428652063012, "learning_rate": 1e-06, "loss": 0.0164, "step": 1214 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4237772822380066, "epoch": 3.1973684210526314, "grad_norm": 0.0026968149468302727, "learning_rate": 1e-06, "loss": 0.0234, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4242277592420578, "epoch": 3.2, "grad_norm": 0.004285138566046953, "learning_rate": 1e-06, "loss": -0.0091, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14641.0, "completions/mean_length": 4914.927734375, "completions/mean_terminated_length": 4824.6201171875, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.41367655992507935, "epoch": 3.2026315789473685, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005627737380564213, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 579291613.0, "reward": 0.8921875357627869, "reward_std": 0.021347813308238983, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.08812850713729858, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0939661264419556, "sampling/importance_sampling_ratio/min": 0.0019966266117990017, "sampling/sampling_logp_difference/max": 6.216296195983887, "sampling/sampling_logp_difference/mean": 0.16010570526123047, "step": 1217 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4152397960424423, "epoch": 3.205263157894737, "grad_norm": 0.0028334069065749645, "learning_rate": 1e-06, "loss": -0.0144, "step": 1218 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41124027967453003, "epoch": 3.207894736842105, "grad_norm": 0.035349901765584946, "learning_rate": 1e-06, "loss": 0.0336, "step": 1219 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4131704419851303, "epoch": 3.2105263157894735, "grad_norm": 0.0047950102016329765, "learning_rate": 1e-06, "loss": 0.0076, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12704.0, "completions/max_terminated_length": 12704.0, "completions/mean_length": 4258.9921875, "completions/mean_terminated_length": 4258.9921875, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "entropy": 0.4134512394666672, "epoch": 3.213157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016610193997621536, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 581866201.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0945987701416016, "sampling/importance_sampling_ratio/min": 0.0022829456720501184, "sampling/sampling_logp_difference/max": 6.08228874206543, "sampling/sampling_logp_difference/mean": 0.16104772686958313, "step": 1221 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41424034535884857, "epoch": 3.2157894736842105, "grad_norm": 0.0015104152262210846, "learning_rate": 1e-06, "loss": 0.0051, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41497941315174103, "epoch": 3.218421052631579, "grad_norm": 0.002166332444176078, "learning_rate": 1e-06, "loss": -0.0021, "step": 1223 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4188615083694458, "epoch": 3.221052631578947, "grad_norm": 0.0013862756313756108, "learning_rate": 1e-06, "loss": -0.0024, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14262.0, "completions/mean_length": 4969.189453125, "completions/mean_terminated_length": 4901.91162109375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.4242817461490631, "epoch": 3.223684210526316, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004546622280031443, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 584823418.0, "reward": 0.8929687738418579, "reward_std": 0.02317390963435173, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9934896230697632, "rewards/symbolic_reward_partial_score/std": 0.07775481790304184, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0966877937316895, "sampling/importance_sampling_ratio/min": 0.0036373361945152283, "sampling/sampling_logp_difference/max": 5.616503715515137, "sampling/sampling_logp_difference/mean": 0.16397926211357117, "step": 1225 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.42179690301418304, "epoch": 3.2263157894736842, "grad_norm": 0.003140317741781473, "learning_rate": 1e-06, "loss": 0.022, "step": 1226 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4274747967720032, "epoch": 3.2289473684210526, "grad_norm": 0.002901677042245865, "learning_rate": 1e-06, "loss": 0.0121, "step": 1227 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41976186633110046, "epoch": 3.231578947368421, "grad_norm": 0.0034241792745888233, "learning_rate": 1e-06, "loss": 0.0118, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14647.0, "completions/max_terminated_length": 14647.0, "completions/mean_length": 4903.107421875, "completions/mean_terminated_length": 4903.107421875, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "entropy": 0.42823950946331024, "epoch": 3.2342105263157896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 587746641.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0973057746887207, "sampling/importance_sampling_ratio/min": 0.0032474007457494736, "sampling/sampling_logp_difference/max": 5.729900360107422, "sampling/sampling_logp_difference/mean": 0.16534727811813354, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.427742674946785, "epoch": 3.236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42854541540145874, "epoch": 3.2394736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42492353916168213, "epoch": 3.2421052631578946, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10876.0, "completions/max_terminated_length": 10876.0, "completions/mean_length": 4922.861328125, "completions/mean_terminated_length": 4922.861328125, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.4322494715452194, "epoch": 3.2447368421052634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 590674122.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0987818241119385, "sampling/importance_sampling_ratio/min": 0.0012941915774717927, "sampling/sampling_logp_difference/max": 6.649868965148926, "sampling/sampling_logp_difference/mean": 0.16677245497703552, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.433841273188591, "epoch": 3.2473684210526317, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4353598356246948, "epoch": 3.25, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42783698439598083, "epoch": 3.2526315789473683, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16274.0, "completions/mean_length": 5215.51953125, "completions/mean_terminated_length": 5193.66357421875, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "entropy": 0.42230191826820374, "epoch": 3.2552631578947366, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00270873517729342, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 593770292.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0960923433303833, "sampling/importance_sampling_ratio/min": 0.0013454603031277657, "sampling/sampling_logp_difference/max": 6.611019134521484, "sampling/sampling_logp_difference/mean": 0.16271348297595978, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41985855996608734, "epoch": 3.2578947368421054, "grad_norm": 0.0026089984457939863, "learning_rate": 1e-06, "loss": -0.0055, "step": 1238 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4175923019647598, "epoch": 3.2605263157894737, "grad_norm": 0.0016260799020528793, "learning_rate": 1e-06, "loss": -0.0059, "step": 1239 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4214717596769333, "epoch": 3.263157894736842, "grad_norm": 0.034119024872779846, "learning_rate": 1e-06, "loss": 0.0278, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15592.0, "completions/mean_length": 5398.509765625, "completions/mean_terminated_length": 5268.2470703125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.43359020352363586, "epoch": 3.2657894736842104, "frac_reward_zero_std": 0.96875, "grad_norm": 0.006515873596072197, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 596926105.0, "reward": 0.89404296875, "reward_std": 0.011066482402384281, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0985498428344727, "sampling/importance_sampling_ratio/min": 0.001384856179356575, "sampling/sampling_logp_difference/max": 6.582159042358398, "sampling/sampling_logp_difference/mean": 0.16628772020339966, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42673449218273163, "epoch": 3.268421052631579, "grad_norm": 0.003408891847357154, "learning_rate": 1e-06, "loss": -0.0118, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43328768014907837, "epoch": 3.2710526315789474, "grad_norm": 0.005278347060084343, "learning_rate": 1e-06, "loss": 0.0106, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4302675575017929, "epoch": 3.2736842105263158, "grad_norm": 0.0024715871550142765, "learning_rate": 1e-06, "loss": 0.0178, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13326.0, "completions/max_terminated_length": 13326.0, "completions/mean_length": 4673.396484375, "completions/mean_terminated_length": 4673.396484375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.43210601806640625, "epoch": 3.276315789473684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 599716196.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0981221199035645, "sampling/importance_sampling_ratio/min": 0.002803367329761386, "sampling/sampling_logp_difference/max": 5.876934051513672, "sampling/sampling_logp_difference/mean": 0.1663980931043625, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42830055952072144, "epoch": 3.2789473684210524, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42543044686317444, "epoch": 3.281578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4283517748117447, "epoch": 3.2842105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15735.0, "completions/mean_length": 4905.23828125, "completions/mean_terminated_length": 4860.2236328125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.4273582249879837, "epoch": 3.286842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.00556342676281929, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 602622142.0, "reward": 0.8965332508087158, "reward_std": 0.00991093460470438, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.045533329248428345, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0981862545013428, "sampling/importance_sampling_ratio/min": 0.0024805660359561443, "sampling/sampling_logp_difference/max": 5.999268531799316, "sampling/sampling_logp_difference/mean": 0.16568660736083984, "step": 1249 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4250298887491226, "epoch": 3.2894736842105265, "grad_norm": 0.0021392509806901217, "learning_rate": 1e-06, "loss": 0.0182, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4252989590167999, "epoch": 3.292105263157895, "grad_norm": 0.005049911327660084, "learning_rate": 1e-06, "loss": -0.0143, "step": 1251 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4272451251745224, "epoch": 3.294736842105263, "grad_norm": 0.0032953384798020124, "learning_rate": 1e-06, "loss": 0.0018, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16236.0, "completions/mean_length": 4915.126953125, "completions/mean_terminated_length": 4802.02197265625, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "entropy": 0.43097755312919617, "epoch": 3.2973684210526315, "frac_reward_zero_std": 0.96875, "grad_norm": 0.006670815404504538, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 605542495.0, "reward": 0.8939453363418579, "reward_std": 0.012524389661848545, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.994140625, "rewards/symbolic_reward_partial_score/std": 0.07639661431312561, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.098680019378662, "sampling/importance_sampling_ratio/min": 0.0016943583032116294, "sampling/sampling_logp_difference/max": 6.380451202392578, "sampling/sampling_logp_difference/mean": 0.16671130061149597, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4279296100139618, "epoch": 3.3, "grad_norm": 0.01934565044939518, "learning_rate": 1e-06, "loss": 0.0275, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42890191078186035, "epoch": 3.3026315789473686, "grad_norm": 0.006398866884410381, "learning_rate": 1e-06, "loss": -0.0119, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4297415763139725, "epoch": 3.305263157894737, "grad_norm": 0.006584126967936754, "learning_rate": 1e-06, "loss": 0.0066, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16215.0, "completions/mean_length": 5048.841796875, "completions/mean_terminated_length": 5026.65966796875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.4309433698654175, "epoch": 3.307894736842105, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0031314236111938953, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 608500270.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0995407104492188, "sampling/importance_sampling_ratio/min": 4.134080882067792e-05, "sampling/sampling_logp_difference/max": 10.093660354614258, "sampling/sampling_logp_difference/mean": 0.16768856346607208, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4325014054775238, "epoch": 3.3105263157894735, "grad_norm": 0.0026678242720663548, "learning_rate": 1e-06, "loss": -0.0056, "step": 1258 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4312526732683182, "epoch": 3.3131578947368423, "grad_norm": 0.003155788406729698, "learning_rate": 1e-06, "loss": 0.0266, "step": 1259 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43209536373615265, "epoch": 3.3157894736842106, "grad_norm": 0.003199965925887227, "learning_rate": 1e-06, "loss": -0.0067, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15219.0, "completions/mean_length": 5482.1640625, "completions/mean_terminated_length": 5309.119140625, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "entropy": 0.4102592468261719, "epoch": 3.318421052631579, "frac_reward_zero_std": 0.875, "grad_norm": 0.01575482450425625, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 611735650.0, "reward": 0.8878906965255737, "reward_std": 0.030286725610494614, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.98828125, "rewards/symbolic_reward_partial_score/std": 0.10772226005792618, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0951696634292603, "sampling/importance_sampling_ratio/min": 0.002318034414201975, "sampling/sampling_logp_difference/max": 6.067035675048828, "sampling/sampling_logp_difference/mean": 0.16081209480762482, "step": 1261 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4124682545661926, "epoch": 3.3210526315789473, "grad_norm": 0.038539014756679535, "learning_rate": 1e-06, "loss": 0.0241, "step": 1262 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41170747578144073, "epoch": 3.3236842105263156, "grad_norm": 0.008916563354432583, "learning_rate": 1e-06, "loss": 0.039, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4108743816614151, "epoch": 3.3263157894736843, "grad_norm": 0.007226528134196997, "learning_rate": 1e-06, "loss": -0.0109, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12021.0, "completions/max_terminated_length": 12021.0, "completions/mean_length": 4667.21875, "completions/mean_terminated_length": 4667.21875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.4249001443386078, "epoch": 3.3289473684210527, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019928712863475084, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 614539762.0, "reward": 0.8985351920127869, "reward_std": 0.005859375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.097334861755371, "sampling/importance_sampling_ratio/min": 0.003616355126723647, "sampling/sampling_logp_difference/max": 5.622288703918457, "sampling/sampling_logp_difference/mean": 0.1645958125591278, "step": 1265 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42036640644073486, "epoch": 3.331578947368421, "grad_norm": 0.0198800228536129, "learning_rate": 1e-06, "loss": 0.0064, "step": 1266 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4227233976125717, "epoch": 3.3342105263157893, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0013, "step": 1267 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41960878670215607, "epoch": 3.336842105263158, "grad_norm": 0.0010428999084979296, "learning_rate": 1e-06, "loss": -0.0016, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14731.0, "completions/mean_length": 5023.693359375, "completions/mean_terminated_length": 4956.73681640625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.42835190892219543, "epoch": 3.3394736842105264, "frac_reward_zero_std": 0.9375, "grad_norm": 0.02546633780002594, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 617490837.0, "reward": 0.89453125, "reward_std": 0.01609490066766739, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9954427480697632, "rewards/symbolic_reward_partial_score/std": 0.06411336362361908, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0978296995162964, "sampling/importance_sampling_ratio/min": 0.003153545781970024, "sampling/sampling_logp_difference/max": 5.759227752685547, "sampling/sampling_logp_difference/mean": 0.16565638780593872, "step": 1269 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4279763549566269, "epoch": 3.3421052631578947, "grad_norm": 0.024767417460680008, "learning_rate": 1e-06, "loss": 0.0111, "step": 1270 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4226616472005844, "epoch": 3.344736842105263, "grad_norm": 0.003013022942468524, "learning_rate": 1e-06, "loss": -0.0127, "step": 1271 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42757633328437805, "epoch": 3.3473684210526318, "grad_norm": 0.005012062378227711, "learning_rate": 1e-06, "loss": -0.0133, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11702.0, "completions/max_terminated_length": 11702.0, "completions/mean_length": 4659.611328125, "completions/mean_terminated_length": 4659.611328125, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "entropy": 0.41563454270362854, "epoch": 3.35, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 620295726.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.095591425895691, "sampling/importance_sampling_ratio/min": 0.00218997523188591, "sampling/sampling_logp_difference/max": 6.123865127563477, "sampling/sampling_logp_difference/mean": 0.16260264813899994, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4157206416130066, "epoch": 3.3526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4172124117612839, "epoch": 3.3552631578947367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4159879833459854, "epoch": 3.3578947368421055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11507.0, "completions/max_terminated_length": 11507.0, "completions/mean_length": 4030.861328125, "completions/mean_terminated_length": 4030.861328125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.42698127031326294, "epoch": 3.360526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0027545492630451918, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 622766663.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0978082418441772, "sampling/importance_sampling_ratio/min": 0.0004315856785979122, "sampling/sampling_logp_difference/max": 7.748044490814209, "sampling/sampling_logp_difference/mean": 0.1658649742603302, "step": 1277 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4332115352153778, "epoch": 3.363157894736842, "grad_norm": 0.002210133709013462, "learning_rate": 1e-06, "loss": -0.004, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42397499084472656, "epoch": 3.3657894736842104, "grad_norm": 0.0023693006951361895, "learning_rate": 1e-06, "loss": -0.0042, "step": 1279 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42917726933956146, "epoch": 3.3684210526315788, "grad_norm": 0.0029219924472272396, "learning_rate": 1e-06, "loss": 0.0096, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13158.0, "completions/max_terminated_length": 13158.0, "completions/mean_length": 3919.298828125, "completions/mean_terminated_length": 3919.298828125, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "entropy": 0.4132506251335144, "epoch": 3.3710526315789475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 625181504.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0954046249389648, "sampling/importance_sampling_ratio/min": 0.00219067488797009, "sampling/sampling_logp_difference/max": 6.1235456466674805, "sampling/sampling_logp_difference/mean": 0.16249452531337738, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41926707327365875, "epoch": 3.373684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41958069801330566, "epoch": 3.376315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41834938526153564, "epoch": 3.3789473684210525, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13622.0, "completions/mean_length": 4244.5703125, "completions/mean_terminated_length": 4196.96484375, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.4205971360206604, "epoch": 3.3815789473684212, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0040439157746732235, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 627766212.0, "reward": 0.8965820670127869, "reward_std": 0.009447958320379257, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.095611810684204, "sampling/importance_sampling_ratio/min": 0.0014818337513133883, "sampling/sampling_logp_difference/max": 6.514474868774414, "sampling/sampling_logp_difference/mean": 0.16291332244873047, "step": 1285 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4190484285354614, "epoch": 3.3842105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0134, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41686636209487915, "epoch": 3.386842105263158, "grad_norm": 0.003506827400997281, "learning_rate": 1e-06, "loss": -0.0052, "step": 1287 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42019134759902954, "epoch": 3.389473684210526, "grad_norm": 0.0035482433158904314, "learning_rate": 1e-06, "loss": -0.0052, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15532.0, "completions/mean_length": 4554.291015625, "completions/mean_terminated_length": 4484.56787109375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 0.41340696811676025, "epoch": 3.3921052631578945, "frac_reward_zero_std": 0.9375, "grad_norm": 0.014019426889717579, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 630518841.0, "reward": 0.8946289420127869, "reward_std": 0.01680140197277069, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0938090085983276, "sampling/importance_sampling_ratio/min": 0.0044320086017251015, "sampling/sampling_logp_difference/max": 5.418902397155762, "sampling/sampling_logp_difference/mean": 0.16045835614204407, "step": 1289 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4126584678888321, "epoch": 3.3947368421052633, "grad_norm": 0.004571118392050266, "learning_rate": 1e-06, "loss": -0.0124, "step": 1290 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41205281019210815, "epoch": 3.3973684210526316, "grad_norm": 0.03225545212626457, "learning_rate": 1e-06, "loss": 0.0518, "step": 1291 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41085344552993774, "epoch": 3.4, "grad_norm": 0.004043778404593468, "learning_rate": 1e-06, "loss": -0.009, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14594.0, "completions/mean_length": 3503.52734375, "completions/mean_terminated_length": 3478.32080078125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.42178209125995636, "epoch": 3.4026315789473682, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0378531776368618, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 632674567.0, "reward": 0.8970703482627869, "reward_std": 0.011718750931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9986978769302368, "rewards/symbolic_reward_partial_score/std": 0.023278694599866867, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0965383052825928, "sampling/importance_sampling_ratio/min": 0.0020539017859846354, "sampling/sampling_logp_difference/max": 6.188014030456543, "sampling/sampling_logp_difference/mean": 0.1647762805223465, "step": 1293 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42566388845443726, "epoch": 3.405263157894737, "grad_norm": 0.00374534260481596, "learning_rate": 1e-06, "loss": -0.008, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4272879511117935, "epoch": 3.4078947368421053, "grad_norm": 0.003755070036277175, "learning_rate": 1e-06, "loss": 0.0091, "step": 1295 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4268956482410431, "epoch": 3.4105263157894736, "grad_norm": 0.003591731656342745, "learning_rate": 1e-06, "loss": -0.0073, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15632.0, "completions/mean_length": 3734.2578125, "completions/mean_terminated_length": 3684.651123046875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.4201415479183197, "epoch": 3.413157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0036082318983972073, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 634981291.0, "reward": 0.8978515863418579, "reward_std": 0.008593750186264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0954632759094238, "sampling/importance_sampling_ratio/min": 0.0022026135120540857, "sampling/sampling_logp_difference/max": 6.118110656738281, "sampling/sampling_logp_difference/mean": 0.16258487105369568, "step": 1297 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41966430842876434, "epoch": 3.4157894736842107, "grad_norm": 0.0033430808689445257, "learning_rate": 1e-06, "loss": -0.0067, "step": 1298 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42219574749469757, "epoch": 3.418421052631579, "grad_norm": 0.0035229851491749287, "learning_rate": 1e-06, "loss": 0.0251, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41871756315231323, "epoch": 3.4210526315789473, "grad_norm": 0.03338438645005226, "learning_rate": 1e-06, "loss": 0.0243, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11484.0, "completions/mean_length": 3798.453125, "completions/mean_terminated_length": 3773.82373046875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.41775913536548615, "epoch": 3.4236842105263157, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002836981788277626, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 637325075.0, "reward": 0.8998047113418579, "reward_std": 0.0007812501862645149, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0949296951293945, "sampling/importance_sampling_ratio/min": 0.002238023094832897, "sampling/sampling_logp_difference/max": 6.1021623611450195, "sampling/sampling_logp_difference/mean": 0.1624608337879181, "step": 1301 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4226219058036804, "epoch": 3.4263157894736844, "grad_norm": 0.0007905043894425035, "learning_rate": 1e-06, "loss": -0.0014, "step": 1302 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4173944890499115, "epoch": 3.4289473684210527, "grad_norm": 0.002238411922007799, "learning_rate": 1e-06, "loss": -0.0034, "step": 1303 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40958690643310547, "epoch": 3.431578947368421, "grad_norm": 0.03789857029914856, "learning_rate": 1e-06, "loss": 0.0305, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11103.0, "completions/max_terminated_length": 11103.0, "completions/mean_length": 3814.59375, "completions/mean_terminated_length": 3814.59375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "entropy": 0.4099797010421753, "epoch": 3.4342105263157894, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0034063279163092375, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 639691139.0, "reward": 0.8953125476837158, "reward_std": 0.01875000074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.05403824523091316, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0937280654907227, "sampling/importance_sampling_ratio/min": 0.0014351787976920605, "sampling/sampling_logp_difference/max": 6.546465873718262, "sampling/sampling_logp_difference/mean": 0.1595838963985443, "step": 1305 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4073960930109024, "epoch": 3.4368421052631577, "grad_norm": 0.03798210993409157, "learning_rate": 1e-06, "loss": 0.0372, "step": 1306 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4112928956747055, "epoch": 3.4394736842105265, "grad_norm": 0.0036208827514201403, "learning_rate": 1e-06, "loss": -0.0065, "step": 1307 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41463279724121094, "epoch": 3.442105263157895, "grad_norm": 0.003667658194899559, "learning_rate": 1e-06, "loss": -0.0064, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13245.0, "completions/max_terminated_length": 13245.0, "completions/mean_length": 3489.34765625, "completions/mean_terminated_length": 3489.34765625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 0.4158789962530136, "epoch": 3.444736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 641870645.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0960693359375, "sampling/importance_sampling_ratio/min": 0.002532797632738948, "sampling/sampling_logp_difference/max": 5.97843074798584, "sampling/sampling_logp_difference/mean": 0.1633663773536682, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41905932128429413, "epoch": 3.4473684210526314, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42152129113674164, "epoch": 3.45, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4199579060077667, "epoch": 3.4526315789473685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12564.0, "completions/max_terminated_length": 12564.0, "completions/mean_length": 3941.126953125, "completions/mean_terminated_length": 3941.126953125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.4127250015735626, "epoch": 3.455263157894737, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0035389773547649384, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 644301654.0, "reward": 0.8974121809005737, "reward_std": 0.01035156287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9991861581802368, "rewards/symbolic_reward_partial_score/std": 0.013266698457300663, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0941858291625977, "sampling/importance_sampling_ratio/min": 0.0022893729619681835, "sampling/sampling_logp_difference/max": 6.079477310180664, "sampling/sampling_logp_difference/mean": 0.1602892130613327, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4096657335758209, "epoch": 3.457894736842105, "grad_norm": 0.0037163407541811466, "learning_rate": 1e-06, "loss": -0.0077, "step": 1314 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4050673246383667, "epoch": 3.4605263157894735, "grad_norm": 0.0015734070912003517, "learning_rate": 1e-06, "loss": -0.0067, "step": 1315 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4047611653804779, "epoch": 3.463157894736842, "grad_norm": 0.003082559909671545, "learning_rate": 1e-06, "loss": -0.0069, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9814.0, "completions/max_terminated_length": 9814.0, "completions/mean_length": 2924.294921875, "completions/mean_terminated_length": 2924.294921875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.417520135641098, "epoch": 3.4657894736842105, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002352791838347912, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 646190029.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0951573848724365, "sampling/importance_sampling_ratio/min": 0.00251167849637568, "sampling/sampling_logp_difference/max": 5.986804008483887, "sampling/sampling_logp_difference/mean": 0.16205096244812012, "step": 1317 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41661183536052704, "epoch": 3.468421052631579, "grad_norm": 0.002251103287562728, "learning_rate": 1e-06, "loss": -0.004, "step": 1318 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4123474657535553, "epoch": 3.4710526315789476, "grad_norm": 0.002237804466858506, "learning_rate": 1e-06, "loss": 0.0119, "step": 1319 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4150903522968292, "epoch": 3.473684210526316, "grad_norm": 0.001431317999958992, "learning_rate": 1e-06, "loss": -0.0031, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12570.0, "completions/max_terminated_length": 12570.0, "completions/mean_length": 3181.740234375, "completions/mean_terminated_length": 3181.740234375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.4164803624153137, "epoch": 3.4763157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 648215208.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0958328247070312, "sampling/importance_sampling_ratio/min": 0.004173843190073967, "sampling/sampling_logp_difference/max": 5.478918075561523, "sampling/sampling_logp_difference/mean": 0.1629454493522644, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4138936251401901, "epoch": 3.4789473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41635027527809143, "epoch": 3.481578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4135337918996811, "epoch": 3.4842105263157896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14925.0, "completions/mean_length": 4314.9921875, "completions/mean_terminated_length": 4195.96826171875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.40028227865695953, "epoch": 3.486842105263158, "frac_reward_zero_std": 0.84375, "grad_norm": 0.015114717185497284, "learning_rate": 1e-06, "loss": -0.0207, "num_tokens": 650846980.0, "reward": 0.8815430402755737, "reward_std": 0.039533913135528564, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9847005009651184, "rewards/symbolic_reward_partial_score/std": 0.11766314506530762, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0922293663024902, "sampling/importance_sampling_ratio/min": 0.001477907644584775, "sampling/sampling_logp_difference/max": 6.517127990722656, "sampling/sampling_logp_difference/mean": 0.1571500301361084, "step": 1325 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3992725908756256, "epoch": 3.4894736842105263, "grad_norm": 0.0093569690361619, "learning_rate": 1e-06, "loss": -0.0154, "step": 1326 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4013103097677231, "epoch": 3.4921052631578946, "grad_norm": 0.0117483576759696, "learning_rate": 1e-06, "loss": -0.0128, "step": 1327 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3956882953643799, "epoch": 3.4947368421052634, "grad_norm": 0.04905395209789276, "learning_rate": 1e-06, "loss": 0.0761, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14365.0, "completions/mean_length": 3695.45703125, "completions/mean_terminated_length": 3670.626220703125, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.4043821394443512, "epoch": 3.4973684210526317, "frac_reward_zero_std": 0.875, "grad_norm": 0.027094334363937378, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 653139534.0, "reward": 0.8939453363418579, "reward_std": 0.02421875298023224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, "rewards/symbolic_reward_partial_score/std": 0.06325981765985489, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0937004089355469, "sampling/importance_sampling_ratio/min": 0.002513238461688161, "sampling/sampling_logp_difference/max": 5.986183166503906, "sampling/sampling_logp_difference/mean": 0.15961581468582153, "step": 1329 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4076476991176605, "epoch": 3.5, "grad_norm": 0.034689415246248245, "learning_rate": 1e-06, "loss": 0.0174, "step": 1330 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4067174196243286, "epoch": 3.5026315789473683, "grad_norm": 0.030187811702489853, "learning_rate": 1e-06, "loss": -0.0002, "step": 1331 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4084063917398453, "epoch": 3.5052631578947366, "grad_norm": 0.004705375991761684, "learning_rate": 1e-06, "loss": 0.0031, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13060.0, "completions/mean_length": 2954.896484375, "completions/mean_terminated_length": 2928.616455078125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.40375232696533203, "epoch": 3.5078947368421054, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0032409271225333214, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 655046745.0, "reward": 0.8966797590255737, "reward_std": 0.01328125037252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, "rewards/symbolic_reward_partial_score/std": 0.04655739292502403, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0927824974060059, "sampling/importance_sampling_ratio/min": 0.005014748778194189, "sampling/sampling_logp_difference/max": 5.295372009277344, "sampling/sampling_logp_difference/mean": 0.15880022943019867, "step": 1333 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4036596566438675, "epoch": 3.5105263157894737, "grad_norm": 0.0024556033313274384, "learning_rate": 1e-06, "loss": -0.0054, "step": 1334 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4068935066461563, "epoch": 3.513157894736842, "grad_norm": 0.0033040994312614202, "learning_rate": 1e-06, "loss": -0.0077, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4026488959789276, "epoch": 3.515789473684211, "grad_norm": 0.0027986783534288406, "learning_rate": 1e-06, "loss": 0.0397, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13649.0, "completions/max_terminated_length": 13649.0, "completions/mean_length": 3377.55078125, "completions/mean_terminated_length": 3377.55078125, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "entropy": 0.4026798754930496, "epoch": 3.518421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 657180179.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0938270092010498, "sampling/importance_sampling_ratio/min": 0.0026030070148408413, "sampling/sampling_logp_difference/max": 5.951087951660156, "sampling/sampling_logp_difference/mean": 0.15991026163101196, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4056524187326431, "epoch": 3.5210526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40314194560050964, "epoch": 3.5236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41141124069690704, "epoch": 3.526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14933.0, "completions/mean_length": 3331.615234375, "completions/mean_terminated_length": 3306.072265625, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "entropy": 0.40859343111515045, "epoch": 3.5289473684210524, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004199342802166939, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 659253902.0, "reward": 0.8953125476837158, "reward_std": 0.015305986627936363, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9967447519302368, "rewards/symbolic_reward_partial_score/std": 0.04989916458725929, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0948388576507568, "sampling/importance_sampling_ratio/min": 0.004439584910869598, "sampling/sampling_logp_difference/max": 5.417194366455078, "sampling/sampling_logp_difference/mean": 0.161345437169075, "step": 1341 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41215769946575165, "epoch": 3.531578947368421, "grad_norm": 0.0033273992594331503, "learning_rate": 1e-06, "loss": 0.0038, "step": 1342 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40942439436912537, "epoch": 3.5342105263157895, "grad_norm": 0.003310244530439377, "learning_rate": 1e-06, "loss": 0.0382, "step": 1343 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41266122460365295, "epoch": 3.536842105263158, "grad_norm": 0.004392263013869524, "learning_rate": 1e-06, "loss": -0.0087, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11280.0, "completions/max_terminated_length": 11280.0, "completions/mean_length": 3390.9296875, "completions/mean_terminated_length": 3390.9296875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "entropy": 0.4126154035329819, "epoch": 3.5394736842105265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 661388010.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0949976444244385, "sampling/importance_sampling_ratio/min": 0.001377489184960723, "sampling/sampling_logp_difference/max": 6.587492942810059, "sampling/sampling_logp_difference/mean": 0.16147500276565552, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4073363095521927, "epoch": 3.542105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4091443121433258, "epoch": 3.544736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41578423976898193, "epoch": 3.5473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11655.0, "completions/max_terminated_length": 11655.0, "completions/mean_length": 3495.994140625, "completions/mean_terminated_length": 3495.994140625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.4095095098018646, "epoch": 3.55, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 663572423.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0945348739624023, "sampling/importance_sampling_ratio/min": 0.0057539683766663074, "sampling/sampling_logp_difference/max": 5.157865524291992, "sampling/sampling_logp_difference/mean": 0.1603575348854065, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4120600074529648, "epoch": 3.5526315789473686, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4052221179008484, "epoch": 3.555263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4080249220132828, "epoch": 3.557894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13585.0, "completions/max_terminated_length": 13585.0, "completions/mean_length": 3372.05078125, "completions/mean_terminated_length": 3372.05078125, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.4013156294822693, "epoch": 3.5605263157894735, "frac_reward_zero_std": 0.96875, "grad_norm": 0.029925869777798653, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 665707233.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0925092697143555, "sampling/importance_sampling_ratio/min": 0.002474510343745351, "sampling/sampling_logp_difference/max": 6.001712799072266, "sampling/sampling_logp_difference/mean": 0.1575009524822235, "step": 1353 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39519596099853516, "epoch": 3.5631578947368423, "grad_norm": 0.0022106533870100975, "learning_rate": 1e-06, "loss": -0.0033, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39832234382629395, "epoch": 3.5657894736842106, "grad_norm": 0.0025575796607881784, "learning_rate": 1e-06, "loss": -0.0042, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40068840980529785, "epoch": 3.568421052631579, "grad_norm": 0.0026897259522229433, "learning_rate": 1e-06, "loss": -0.0038, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12470.0, "completions/max_terminated_length": 12470.0, "completions/mean_length": 3052.69140625, "completions/mean_terminated_length": 3052.69140625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.3993008881807327, "epoch": 3.5710526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0019752681255340576, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 667672675.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0928423404693604, "sampling/importance_sampling_ratio/min": 0.002603180706501007, "sampling/sampling_logp_difference/max": 5.951021194458008, "sampling/sampling_logp_difference/mean": 0.15809664130210876, "step": 1357 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39561323821544647, "epoch": 3.5736842105263156, "grad_norm": 0.001663347939029336, "learning_rate": 1e-06, "loss": 0.0098, "step": 1358 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4037141501903534, "epoch": 3.5763157894736843, "grad_norm": 0.0014364116359502077, "learning_rate": 1e-06, "loss": -0.0031, "step": 1359 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4012430012226105, "epoch": 3.5789473684210527, "grad_norm": 0.0019044099608436227, "learning_rate": 1e-06, "loss": -0.0038, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11312.0, "completions/max_terminated_length": 11312.0, "completions/mean_length": 3387.263671875, "completions/mean_terminated_length": 3387.263671875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.4019183814525604, "epoch": 3.581578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 669802762.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.092702865600586, "sampling/importance_sampling_ratio/min": 0.0017990414053201675, "sampling/sampling_logp_difference/max": 6.320501327514648, "sampling/sampling_logp_difference/mean": 0.15685243904590607, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3959041237831116, "epoch": 3.5842105263157897, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3938000202178955, "epoch": 3.586842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39817318320274353, "epoch": 3.5894736842105264, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9908.0, "completions/max_terminated_length": 9908.0, "completions/mean_length": 3127.095703125, "completions/mean_terminated_length": 3127.095703125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.39987435936927795, "epoch": 3.5921052631578947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 671816987.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0911076068878174, "sampling/importance_sampling_ratio/min": 0.00319243548437953, "sampling/sampling_logp_difference/max": 5.746971130371094, "sampling/sampling_logp_difference/mean": 0.1556779146194458, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39023223519325256, "epoch": 3.594736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39398255944252014, "epoch": 3.5973684210526313, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3929821848869324, "epoch": 3.6, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10732.0, "completions/max_terminated_length": 10732.0, "completions/mean_length": 3169.263671875, "completions/mean_terminated_length": 3169.263671875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.40132491290569305, "epoch": 3.6026315789473684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 673822786.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093056559562683, "sampling/importance_sampling_ratio/min": 0.0037359073758125305, "sampling/sampling_logp_difference/max": 5.589764595031738, "sampling/sampling_logp_difference/mean": 0.15860742330551147, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4025930166244507, "epoch": 3.6052631578947367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39976200461387634, "epoch": 3.6078947368421055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3997676372528076, "epoch": 3.610526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8537.0, "completions/max_terminated_length": 8537.0, "completions/mean_length": 2823.861328125, "completions/mean_terminated_length": 2823.861328125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 0.39905427396297455, "epoch": 3.613157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017854244215413928, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 675652059.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0928120613098145, "sampling/importance_sampling_ratio/min": 0.0020412288140505552, "sampling/sampling_logp_difference/max": 6.1942033767700195, "sampling/sampling_logp_difference/mean": 0.1579289734363556, "step": 1373 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39978672564029694, "epoch": 3.6157894736842104, "grad_norm": 0.0011991802603006363, "learning_rate": 1e-06, "loss": -0.0013, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3967844247817993, "epoch": 3.6184210526315788, "grad_norm": 0.0010204967111349106, "learning_rate": 1e-06, "loss": -0.0001, "step": 1375 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40440069139003754, "epoch": 3.6210526315789475, "grad_norm": 0.0014097831444814801, "learning_rate": 1e-06, "loss": -0.0019, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11598.0, "completions/max_terminated_length": 11598.0, "completions/mean_length": 3173.1875, "completions/mean_terminated_length": 3173.1875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.4004215896129608, "epoch": 3.623684210526316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 677674363.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093893051147461, "sampling/importance_sampling_ratio/min": 0.004086603876203299, "sampling/sampling_logp_difference/max": 5.5000410079956055, "sampling/sampling_logp_difference/mean": 0.15911394357681274, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4010716676712036, "epoch": 3.626315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40348049998283386, "epoch": 3.6289473684210525, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4039851129055023, "epoch": 3.6315789473684212, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15965.0, "completions/max_terminated_length": 15965.0, "completions/mean_length": 3340.8828125, "completions/mean_terminated_length": 3340.8828125, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.3950095623731613, "epoch": 3.6342105263157896, "frac_reward_zero_std": 0.96875, "grad_norm": 0.020174384117126465, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 679790207.0, "reward": 0.8974609375, "reward_std": 0.006938039790838957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0919876098632812, "sampling/importance_sampling_ratio/min": 0.0024679680354893208, "sampling/sampling_logp_difference/max": 6.004360198974609, "sampling/sampling_logp_difference/mean": 0.15635082125663757, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39107972383499146, "epoch": 3.636842105263158, "grad_norm": 0.0030597951263189316, "learning_rate": 1e-06, "loss": -0.0051, "step": 1382 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3926554024219513, "epoch": 3.639473684210526, "grad_norm": 0.0034251385368406773, "learning_rate": 1e-06, "loss": -0.0055, "step": 1383 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39353494346141815, "epoch": 3.6421052631578945, "grad_norm": 0.002028356771916151, "learning_rate": 1e-06, "loss": 0.0138, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11442.0, "completions/max_terminated_length": 11442.0, "completions/mean_length": 3600.646484375, "completions/mean_terminated_length": 3600.646484375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "entropy": 0.38536764681339264, "epoch": 3.6447368421052633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 682060586.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902211666107178, "sampling/importance_sampling_ratio/min": 0.002668058965355158, "sampling/sampling_logp_difference/max": 5.926403999328613, "sampling/sampling_logp_difference/mean": 0.153443843126297, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3879353255033493, "epoch": 3.6473684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39046235382556915, "epoch": 3.65, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3896489590406418, "epoch": 3.6526315789473687, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11423.0, "completions/max_terminated_length": 11423.0, "completions/mean_length": 3338.302734375, "completions/mean_terminated_length": 3338.302734375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.38561201095581055, "epoch": 3.655263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0018490205984562635, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 684191973.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902330875396729, "sampling/importance_sampling_ratio/min": 0.0030723377130925655, "sampling/sampling_logp_difference/max": 5.785316467285156, "sampling/sampling_logp_difference/mean": 0.1536712497472763, "step": 1389 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39230525493621826, "epoch": 3.6578947368421053, "grad_norm": 0.0017965837614610791, "learning_rate": 1e-06, "loss": 0.0034, "step": 1390 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38700607419013977, "epoch": 3.6605263157894736, "grad_norm": 0.0017074166098609567, "learning_rate": 1e-06, "loss": -0.0018, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3893258571624756, "epoch": 3.663157894736842, "grad_norm": 0.00184584257658571, "learning_rate": 1e-06, "loss": -0.0019, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15280.0, "completions/max_terminated_length": 15280.0, "completions/mean_length": 3533.90234375, "completions/mean_terminated_length": 3533.90234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.40172871947288513, "epoch": 3.6657894736842103, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023419312201440334, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 686391443.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0933713912963867, "sampling/importance_sampling_ratio/min": 0.0017077759839594364, "sampling/sampling_logp_difference/max": 6.372563362121582, "sampling/sampling_logp_difference/mean": 0.15777018666267395, "step": 1393 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3977302759885788, "epoch": 3.668421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0034, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3989953547716141, "epoch": 3.6710526315789473, "grad_norm": 0.0022163260728120804, "learning_rate": 1e-06, "loss": 0.0153, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40052279829978943, "epoch": 3.6736842105263157, "grad_norm": 0.0025884083006531, "learning_rate": 1e-06, "loss": -0.0045, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13327.0, "completions/max_terminated_length": 13327.0, "completions/mean_length": 3259.73828125, "completions/mean_terminated_length": 3259.73828125, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.3998134732246399, "epoch": 3.6763157894736844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 688460909.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0925114154815674, "sampling/importance_sampling_ratio/min": 0.00391404889523983, "sampling/sampling_logp_difference/max": 5.543182849884033, "sampling/sampling_logp_difference/mean": 0.15690429508686066, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3954601436853409, "epoch": 3.6789473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3984253853559494, "epoch": 3.681578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39593395590782166, "epoch": 3.6842105263157894, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9928.0, "completions/mean_length": 3060.197265625, "completions/mean_terminated_length": 3034.123291015625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.4014148712158203, "epoch": 3.6868421052631577, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020971365738660097, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 690418002.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0928709506988525, "sampling/importance_sampling_ratio/min": 0.001358574372716248, "sampling/sampling_logp_difference/max": 6.601319313049316, "sampling/sampling_logp_difference/mean": 0.1577681303024292, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3952745348215103, "epoch": 3.6894736842105265, "grad_norm": 0.0021380619145929813, "learning_rate": 1e-06, "loss": -0.0031, "step": 1402 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39697694778442383, "epoch": 3.692105263157895, "grad_norm": 0.0017192736268043518, "learning_rate": 1e-06, "loss": 0.0277, "step": 1403 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40231893956661224, "epoch": 3.694736842105263, "grad_norm": 0.001479703583754599, "learning_rate": 1e-06, "loss": -0.0022, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13133.0, "completions/max_terminated_length": 13133.0, "completions/mean_length": 3382.88671875, "completions/mean_terminated_length": 3382.88671875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.3874139189720154, "epoch": 3.6973684210526314, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0026652305386960506, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 692557336.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0917917490005493, "sampling/importance_sampling_ratio/min": 0.0020509676542133093, "sampling/sampling_logp_difference/max": 6.189443588256836, "sampling/sampling_logp_difference/mean": 0.1555863618850708, "step": 1405 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3897632509469986, "epoch": 3.7, "grad_norm": 0.0028849304653704166, "learning_rate": 1e-06, "loss": -0.0042, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39466267824172974, "epoch": 3.7026315789473685, "grad_norm": 0.0021011019125580788, "learning_rate": 1e-06, "loss": -0.0041, "step": 1407 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39202168583869934, "epoch": 3.705263157894737, "grad_norm": 0.0016496055759489536, "learning_rate": 1e-06, "loss": 0.0055, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10350.0, "completions/max_terminated_length": 10350.0, "completions/mean_length": 3223.619140625, "completions/mean_terminated_length": 3223.619140625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.40050970017910004, "epoch": 3.707894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 694595957.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0935096740722656, "sampling/importance_sampling_ratio/min": 0.0024177185259759426, "sampling/sampling_logp_difference/max": 6.024930953979492, "sampling/sampling_logp_difference/mean": 0.15873806178569794, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40371544659137726, "epoch": 3.7105263157894735, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40224555134773254, "epoch": 3.713157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40073154866695404, "epoch": 3.7157894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12493.0, "completions/max_terminated_length": 12493.0, "completions/mean_length": 3898.419921875, "completions/mean_terminated_length": 3898.419921875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.39467544853687286, "epoch": 3.718421052631579, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004786165431141853, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 696999084.0, "reward": 0.8955078125, "reward_std": 0.014253725297749043, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9967447519302368, "rewards/symbolic_reward_partial_score/std": 0.04989916458725929, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.092606782913208, "sampling/importance_sampling_ratio/min": 0.004562050104141235, "sampling/sampling_logp_difference/max": 5.389983177185059, "sampling/sampling_logp_difference/mean": 0.15658065676689148, "step": 1413 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39634251594543457, "epoch": 3.7210526315789476, "grad_norm": 0.0038975742645561695, "learning_rate": 1e-06, "loss": -0.0094, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39172324538230896, "epoch": 3.723684210526316, "grad_norm": 0.004343866836279631, "learning_rate": 1e-06, "loss": -0.0102, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3948686420917511, "epoch": 3.7263157894736842, "grad_norm": 0.023683957755565643, "learning_rate": 1e-06, "loss": 0.0209, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14559.0, "completions/max_terminated_length": 14559.0, "completions/mean_length": 3128.171875, "completions/mean_terminated_length": 3128.171875, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "entropy": 0.4044443964958191, "epoch": 3.7289473684210526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 698998020.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0930471420288086, "sampling/importance_sampling_ratio/min": 0.001986793242394924, "sampling/sampling_logp_difference/max": 6.221233367919922, "sampling/sampling_logp_difference/mean": 0.1583324372768402, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4009324014186859, "epoch": 3.731578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3986443877220154, "epoch": 3.734210526315789, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40290258824825287, "epoch": 3.736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10382.0, "completions/max_terminated_length": 10382.0, "completions/mean_length": 3306.9296875, "completions/mean_terminated_length": 3306.9296875, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "entropy": 0.40319564938545227, "epoch": 3.7394736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020903500262647867, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 701087136.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0921461582183838, "sampling/importance_sampling_ratio/min": 0.0011779491323977709, "sampling/sampling_logp_difference/max": 6.743980407714844, "sampling/sampling_logp_difference/mean": 0.15718862414360046, "step": 1421 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39845849573612213, "epoch": 3.7421052631578946, "grad_norm": 0.0015312627656385303, "learning_rate": 1e-06, "loss": -0.0034, "step": 1422 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39401476085186005, "epoch": 3.7447368421052634, "grad_norm": 0.0011932970955967903, "learning_rate": 1e-06, "loss": 0.0083, "step": 1423 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.396557480096817, "epoch": 3.7473684210526317, "grad_norm": 0.00152764399535954, "learning_rate": 1e-06, "loss": -0.003, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10209.0, "completions/max_terminated_length": 10209.0, "completions/mean_length": 3418.779296875, "completions/mean_terminated_length": 3418.779296875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "entropy": 0.3978844881057739, "epoch": 3.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 703235343.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.093510627746582, "sampling/importance_sampling_ratio/min": 0.0033774874173104763, "sampling/sampling_logp_difference/max": 5.6906232833862305, "sampling/sampling_logp_difference/mean": 0.15863358974456787, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4042161554098129, "epoch": 3.7526315789473683, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40453460812568665, "epoch": 3.7552631578947366, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4057096391916275, "epoch": 3.7578947368421054, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10720.0, "completions/max_terminated_length": 10720.0, "completions/mean_length": 3174.57421875, "completions/mean_terminated_length": 3174.57421875, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.3985760658979416, "epoch": 3.7605263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00251170271076262, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 705235477.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0938581228256226, "sampling/importance_sampling_ratio/min": 0.003616937668994069, "sampling/sampling_logp_difference/max": 5.622127532958984, "sampling/sampling_logp_difference/mean": 0.15890240669250488, "step": 1429 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4028221666812897, "epoch": 3.763157894736842, "grad_norm": 0.0015786701114848256, "learning_rate": 1e-06, "loss": -0.0024, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.409660205245018, "epoch": 3.765789473684211, "grad_norm": 0.0023150264751166105, "learning_rate": 1e-06, "loss": -0.0037, "step": 1431 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40204913914203644, "epoch": 3.768421052631579, "grad_norm": 0.0015482159797102213, "learning_rate": 1e-06, "loss": 0.0186, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13045.0, "completions/max_terminated_length": 13045.0, "completions/mean_length": 4052.107421875, "completions/mean_terminated_length": 4052.107421875, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "entropy": 0.3927154541015625, "epoch": 3.7710526315789474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 707715148.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0915820598602295, "sampling/importance_sampling_ratio/min": 0.0026977232191711664, "sampling/sampling_logp_difference/max": 5.915347099304199, "sampling/sampling_logp_difference/mean": 0.15520447492599487, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38977791368961334, "epoch": 3.7736842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39580821990966797, "epoch": 3.776315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39414040744304657, "epoch": 3.7789473684210524, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13228.0, "completions/max_terminated_length": 13228.0, "completions/mean_length": 3834.833984375, "completions/mean_terminated_length": 3834.833984375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.3926548361778259, "epoch": 3.781578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0031334292143583298, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 710106071.0, "reward": 0.89697265625, "reward_std": 0.01210937649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0905840396881104, "sampling/importance_sampling_ratio/min": 0.00128124188631773, "sampling/sampling_logp_difference/max": 6.65992546081543, "sampling/sampling_logp_difference/mean": 0.153834730386734, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3901326656341553, "epoch": 3.7842105263157895, "grad_norm": 0.003779578721150756, "learning_rate": 1e-06, "loss": -0.0071, "step": 1438 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.38869695365428925, "epoch": 3.786842105263158, "grad_norm": 0.0028702968265861273, "learning_rate": 1e-06, "loss": -0.0066, "step": 1439 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.387768492102623, "epoch": 3.7894736842105265, "grad_norm": 0.026358773931860924, "learning_rate": 1e-06, "loss": 0.0303, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13652.0, "completions/max_terminated_length": 13652.0, "completions/mean_length": 3755.267578125, "completions/mean_terminated_length": 3755.267578125, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "entropy": 0.3892553150653839, "epoch": 3.792105263157895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 712435904.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.092039942741394, "sampling/importance_sampling_ratio/min": 0.0021835248917341232, "sampling/sampling_logp_difference/max": 6.126814842224121, "sampling/sampling_logp_difference/mean": 0.15593919157981873, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3990880101919174, "epoch": 3.794736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39939743280410767, "epoch": 3.7973684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.400002658367157, "epoch": 3.8, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12545.0, "completions/max_terminated_length": 12545.0, "completions/mean_length": 3728.94921875, "completions/mean_terminated_length": 3728.94921875, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.40515466034412384, "epoch": 3.8026315789473686, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020378362387418747, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 714728742.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.094640851020813, "sampling/importance_sampling_ratio/min": 0.0031920094043016434, "sampling/sampling_logp_difference/max": 5.747104644775391, "sampling/sampling_logp_difference/mean": 0.1599438488483429, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4050808846950531, "epoch": 3.805263157894737, "grad_norm": 0.0021285039838403463, "learning_rate": 1e-06, "loss": -0.0029, "step": 1446 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4077269583940506, "epoch": 3.807894736842105, "grad_norm": 0.020180271938443184, "learning_rate": 1e-06, "loss": 0.0064, "step": 1447 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41125862300395966, "epoch": 3.8105263157894735, "grad_norm": 0.0018239085329696536, "learning_rate": 1e-06, "loss": -0.003, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12424.0, "completions/mean_length": 3610.353515625, "completions/mean_terminated_length": 3585.356201171875, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "entropy": 0.400941401720047, "epoch": 3.8131578947368423, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017912550829350948, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 716961339.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0927866697311401, "sampling/importance_sampling_ratio/min": 0.002997717587277293, "sampling/sampling_logp_difference/max": 5.809904098510742, "sampling/sampling_logp_difference/mean": 0.1572321653366089, "step": 1449 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39634352922439575, "epoch": 3.8157894736842106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0025, "step": 1450 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39722955226898193, "epoch": 3.818421052631579, "grad_norm": 0.03367019444704056, "learning_rate": 1e-06, "loss": 0.0293, "step": 1451 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40396061539649963, "epoch": 3.8210526315789473, "grad_norm": 0.0013267152244225144, "learning_rate": 1e-06, "loss": -0.0028, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14840.0, "completions/max_terminated_length": 14840.0, "completions/mean_length": 3933.537109375, "completions/mean_terminated_length": 3933.537109375, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.3897482752799988, "epoch": 3.8236842105263156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 719386638.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.091388463973999, "sampling/importance_sampling_ratio/min": 0.0024434339720755816, "sampling/sampling_logp_difference/max": 6.014350891113281, "sampling/sampling_logp_difference/mean": 0.15466183423995972, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39654238522052765, "epoch": 3.8263157894736843, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39224767684936523, "epoch": 3.8289473684210527, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39149652421474457, "epoch": 3.831578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14113.0, "completions/max_terminated_length": 14113.0, "completions/mean_length": 4065.244140625, "completions/mean_terminated_length": 4065.244140625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "entropy": 0.40415142476558685, "epoch": 3.8342105263157897, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 721865515.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0938994884490967, "sampling/importance_sampling_ratio/min": 0.00262816553004086, "sampling/sampling_logp_difference/max": 5.941469192504883, "sampling/sampling_logp_difference/mean": 0.15814323723316193, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4050493389368057, "epoch": 3.836842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4033322036266327, "epoch": 3.8394736842105264, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4038220942020416, "epoch": 3.8421052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16061.0, "completions/max_terminated_length": 16061.0, "completions/mean_length": 4237.50390625, "completions/mean_terminated_length": 4237.50390625, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.4071665406227112, "epoch": 3.844736842105263, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023033262696117163, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 724435917.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0930531024932861, "sampling/importance_sampling_ratio/min": 0.0019550323486328125, "sampling/sampling_logp_difference/max": 6.237348556518555, "sampling/sampling_logp_difference/mean": 0.1577797532081604, "step": 1461 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40273521840572357, "epoch": 3.8473684210526313, "grad_norm": 0.0023665132466703653, "learning_rate": 1e-06, "loss": 0.0277, "step": 1462 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40068747103214264, "epoch": 3.85, "grad_norm": 0.0016836661379784346, "learning_rate": 1e-06, "loss": -0.0041, "step": 1463 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4021112322807312, "epoch": 3.8526315789473684, "grad_norm": 0.0018241856014356017, "learning_rate": 1e-06, "loss": -0.0032, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14994.0, "completions/mean_length": 4416.72265625, "completions/mean_terminated_length": 4393.30322265625, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 0.40466147661209106, "epoch": 3.8552631578947367, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020933107007294893, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 727102431.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0951014757156372, "sampling/importance_sampling_ratio/min": 6.185632173583144e-06, "sampling/sampling_logp_difference/max": 11.993281364440918, "sampling/sampling_logp_difference/mean": 0.15952223539352417, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.409665122628212, "epoch": 3.8578947368421055, "grad_norm": 0.002439366653561592, "learning_rate": 1e-06, "loss": -0.0038, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40874864161014557, "epoch": 3.860526315789474, "grad_norm": 0.001965287374332547, "learning_rate": 1e-06, "loss": -0.0026, "step": 1467 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4086417257785797, "epoch": 3.863157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0038, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14026.0, "completions/mean_length": 4438.623046875, "completions/mean_terminated_length": 4415.24658203125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.4004950076341629, "epoch": 3.8657894736842104, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002180153038352728, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 729772798.0, "reward": 0.8998047113418579, "reward_std": 0.0007812501862645149, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.092930793762207, "sampling/importance_sampling_ratio/min": 0.0036180950701236725, "sampling/sampling_logp_difference/max": 5.62180757522583, "sampling/sampling_logp_difference/mean": 0.15623509883880615, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39847925305366516, "epoch": 3.8684210526315788, "grad_norm": 0.0022407532669603825, "learning_rate": 1e-06, "loss": -0.0037, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3970453441143036, "epoch": 3.8710526315789475, "grad_norm": 0.034548867493867874, "learning_rate": 1e-06, "loss": 0.028, "step": 1471 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3975095897912979, "epoch": 3.873684210526316, "grad_norm": 0.0015942624304443598, "learning_rate": 1e-06, "loss": -0.0042, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13766.0, "completions/mean_length": 4670.01953125, "completions/mean_terminated_length": 4647.095703125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.3799055218696594, "epoch": 3.876315789473684, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002713815774768591, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 732597032.0, "reward": 0.8998047113418579, "reward_std": 0.0007812501862645149, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0902076959609985, "sampling/importance_sampling_ratio/min": 0.0022239615209400654, "sampling/sampling_logp_difference/max": 6.108465194702148, "sampling/sampling_logp_difference/mean": 0.15248744189739227, "step": 1473 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3894472122192383, "epoch": 3.8789473684210525, "grad_norm": 0.001705386326648295, "learning_rate": 1e-06, "loss": -0.0049, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3877953886985779, "epoch": 3.8815789473684212, "grad_norm": 0.0022450347896665335, "learning_rate": 1e-06, "loss": -0.0038, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38294896483421326, "epoch": 3.8842105263157896, "grad_norm": 0.0025115814059972763, "learning_rate": 1e-06, "loss": -0.0044, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15636.0, "completions/mean_length": 5138.291015625, "completions/mean_terminated_length": 5116.28369140625, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "entropy": 0.39051946997642517, "epoch": 3.886842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0037060140166431665, "learning_rate": 1e-06, "loss": -0.0085, "num_tokens": 735647165.0, "reward": 0.8965820670127869, "reward_std": 0.013671875931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0919941663742065, "sampling/importance_sampling_ratio/min": 0.001707986113615334, "sampling/sampling_logp_difference/max": 6.372440338134766, "sampling/sampling_logp_difference/mean": 0.15500596165657043, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3918895572423935, "epoch": 3.889473684210526, "grad_norm": 0.0037045739591121674, "learning_rate": 1e-06, "loss": -0.0095, "step": 1478 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3911411166191101, "epoch": 3.8921052631578945, "grad_norm": 0.031501058489084244, "learning_rate": 1e-06, "loss": 0.0437, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3948730081319809, "epoch": 3.8947368421052633, "grad_norm": 0.003884932491928339, "learning_rate": 1e-06, "loss": -0.0087, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15566.0, "completions/mean_length": 5308.91796875, "completions/mean_terminated_length": 5221.71240234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.39051443338394165, "epoch": 3.8973684210526316, "frac_reward_zero_std": 0.875, "grad_norm": 0.04549328237771988, "learning_rate": 1e-06, "loss": 0.0504, "num_tokens": 738783155.0, "reward": 0.890869140625, "reward_std": 0.02705981954932213, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.99169921875, "rewards/symbolic_reward_partial_score/std": 0.08877533674240112, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0921385288238525, "sampling/importance_sampling_ratio/min": 0.001943130511790514, "sampling/sampling_logp_difference/max": 6.243454933166504, "sampling/sampling_logp_difference/mean": 0.15515056252479553, "step": 1481 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.39302758872509, "epoch": 3.9, "grad_norm": 0.00662320526316762, "learning_rate": 1e-06, "loss": -0.0108, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39847151935100555, "epoch": 3.9026315789473687, "grad_norm": 0.006563673727214336, "learning_rate": 1e-06, "loss": -0.0209, "step": 1483 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3883136212825775, "epoch": 3.905263157894737, "grad_norm": 0.030046168714761734, "learning_rate": 1e-06, "loss": 0.0336, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12801.0, "completions/mean_length": 4988.72265625, "completions/mean_terminated_length": 4944.03564453125, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.401815801858902, "epoch": 3.9078947368421053, "frac_reward_zero_std": 0.9375, "grad_norm": 0.029819732531905174, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 741742533.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0947823524475098, "sampling/importance_sampling_ratio/min": 0.002653134521096945, "sampling/sampling_logp_difference/max": 5.932013511657715, "sampling/sampling_logp_difference/mean": 0.15863843262195587, "step": 1485 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40521858632564545, "epoch": 3.9105263157894736, "grad_norm": 0.0026575105730444193, "learning_rate": 1e-06, "loss": -0.0074, "step": 1486 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4038505554199219, "epoch": 3.913157894736842, "grad_norm": 0.0032303433399647474, "learning_rate": 1e-06, "loss": 0.0237, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4077226221561432, "epoch": 3.9157894736842103, "grad_norm": 0.004619062412530184, "learning_rate": 1e-06, "loss": -0.0093, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15314.0, "completions/mean_length": 4639.970703125, "completions/mean_terminated_length": 4477.18212890625, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "entropy": 0.4002656936645508, "epoch": 3.918421052631579, "frac_reward_zero_std": 0.84375, "grad_norm": 0.017964759841561317, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 744495158.0, "reward": 0.8863281607627869, "reward_std": 0.04478531330823898, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.986328125, "rewards/symbolic_reward_partial_score/std": 0.1162383034825325, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0951684713363647, "sampling/importance_sampling_ratio/min": 0.002952937036752701, "sampling/sampling_logp_difference/max": 5.824954986572266, "sampling/sampling_logp_difference/mean": 0.1594112515449524, "step": 1489 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41311562061309814, "epoch": 3.9210526315789473, "grad_norm": 0.025135215371847153, "learning_rate": 1e-06, "loss": 0.0401, "step": 1490 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40488433837890625, "epoch": 3.9236842105263157, "grad_norm": 0.007263537496328354, "learning_rate": 1e-06, "loss": 0.0034, "step": 1491 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40645793080329895, "epoch": 3.9263157894736844, "grad_norm": 0.02124885283410549, "learning_rate": 1e-06, "loss": 0.0202, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15372.0, "completions/mean_length": 4455.48828125, "completions/mean_terminated_length": 4408.7099609375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "entropy": 0.4151764214038849, "epoch": 3.9289473684210527, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003507391083985567, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 747164336.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0968103408813477, "sampling/importance_sampling_ratio/min": 0.0019236616790294647, "sampling/sampling_logp_difference/max": 6.2535247802734375, "sampling/sampling_logp_difference/mean": 0.16167742013931274, "step": 1493 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41255517303943634, "epoch": 3.931578947368421, "grad_norm": 0.0030434769578278065, "learning_rate": 1e-06, "loss": -0.0072, "step": 1494 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4119809418916702, "epoch": 3.9342105263157894, "grad_norm": 0.0031008217483758926, "learning_rate": 1e-06, "loss": -0.0064, "step": 1495 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41209113597869873, "epoch": 3.9368421052631577, "grad_norm": 0.02380312979221344, "learning_rate": 1e-06, "loss": 0.0237, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15992.0, "completions/mean_length": 4989.240234375, "completions/mean_terminated_length": 4944.55517578125, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.404226616024971, "epoch": 3.9394736842105265, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004120892845094204, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 750117963.0, "reward": 0.8965820670127869, "reward_std": 0.013671875, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.094872236251831, "sampling/importance_sampling_ratio/min": 0.0023986201267689466, "sampling/sampling_logp_difference/max": 6.032861709594727, "sampling/sampling_logp_difference/mean": 0.15871724486351013, "step": 1497 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40456075966358185, "epoch": 3.942105263157895, "grad_norm": 0.003923047799617052, "learning_rate": 1e-06, "loss": 0.0548, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39779654145240784, "epoch": 3.944736842105263, "grad_norm": 0.00380745530128479, "learning_rate": 1e-06, "loss": -0.01, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40239349007606506, "epoch": 3.9473684210526314, "grad_norm": 0.004106962587684393, "learning_rate": 1e-06, "loss": -0.0109, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16011.0, "completions/mean_length": 5036.435546875, "completions/mean_terminated_length": 4991.935546875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "entropy": 0.4066380709409714, "epoch": 3.95, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0036225696094334126, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 753112938.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0963261127471924, "sampling/importance_sampling_ratio/min": 0.0016075860476121306, "sampling/sampling_logp_difference/max": 6.433021545410156, "sampling/sampling_logp_difference/mean": 0.1609252244234085, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4123341590166092, "epoch": 3.9526315789473685, "grad_norm": 0.003561399644240737, "learning_rate": 1e-06, "loss": -0.0078, "step": 1502 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40965262055397034, "epoch": 3.955263157894737, "grad_norm": 0.0029743367340415716, "learning_rate": 1e-06, "loss": -0.0062, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40872377157211304, "epoch": 3.957894736842105, "grad_norm": 0.030794724822044373, "learning_rate": 1e-06, "loss": 0.056, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16193.0, "completions/mean_length": 5512.244140625, "completions/mean_terminated_length": 5426.6396484375, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "entropy": 0.4127257764339447, "epoch": 3.9605263157894735, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004801150877028704, "learning_rate": 1e-06, "loss": -0.0129, "num_tokens": 756352007.0, "reward": 0.8939453363418579, "reward_std": 0.017909539863467216, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.994140625, "rewards/symbolic_reward_partial_score/std": 0.07639661431312561, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.096022367477417, "sampling/importance_sampling_ratio/min": 0.0014268403174355626, "sampling/sampling_logp_difference/max": 6.552292823791504, "sampling/sampling_logp_difference/mean": 0.16092725098133087, "step": 1505 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4092298448085785, "epoch": 3.963157894736842, "grad_norm": 0.03272217884659767, "learning_rate": 1e-06, "loss": 0.0305, "step": 1506 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4117880314588547, "epoch": 3.9657894736842105, "grad_norm": 0.004830363672226667, "learning_rate": 1e-06, "loss": 0.0108, "step": 1507 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.40936070680618286, "epoch": 3.968421052631579, "grad_norm": 0.0038726413622498512, "learning_rate": 1e-06, "loss": -0.0101, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16161.0, "completions/mean_length": 5972.404296875, "completions/mean_terminated_length": 5931.57470703125, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "entropy": 0.40254396200180054, "epoch": 3.9710526315789476, "frac_reward_zero_std": 0.90625, "grad_norm": 0.031498510390520096, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 759840054.0, "reward": 0.8934570550918579, "reward_std": 0.022123489528894424, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.06451204419136047, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0945191383361816, "sampling/importance_sampling_ratio/min": 0.002183837117627263, "sampling/sampling_logp_difference/max": 6.12667179107666, "sampling/sampling_logp_difference/mean": 0.1585274487733841, "step": 1509 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4039093255996704, "epoch": 3.973684210526316, "grad_norm": 0.0052402750588953495, "learning_rate": 1e-06, "loss": -0.0155, "step": 1510 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4034292995929718, "epoch": 3.9763157894736842, "grad_norm": 0.004276621155440807, "learning_rate": 1e-06, "loss": 0.0212, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40216925740242004, "epoch": 3.9789473684210526, "grad_norm": 0.006280162371695042, "learning_rate": 1e-06, "loss": -0.0163, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16295.0, "completions/mean_length": 5662.978515625, "completions/mean_terminated_length": 5599.7900390625, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "entropy": 0.40237975120544434, "epoch": 3.981578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005399126559495926, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 763149643.0, "reward": 0.8941406607627869, "reward_std": 0.01848640665411949, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.994140625, "rewards/symbolic_reward_partial_score/std": 0.07639661431312561, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0950000286102295, "sampling/importance_sampling_ratio/min": 0.0020277712028473616, "sampling/sampling_logp_difference/max": 6.200818061828613, "sampling/sampling_logp_difference/mean": 0.159552201628685, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4041586071252823, "epoch": 3.984210526315789, "grad_norm": 0.004899794235825539, "learning_rate": 1e-06, "loss": -0.0147, "step": 1514 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4064927399158478, "epoch": 3.986842105263158, "grad_norm": 0.005058801267296076, "learning_rate": 1e-06, "loss": 0.0418, "step": 1515 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40395575761795044, "epoch": 3.9894736842105263, "grad_norm": 0.004211281426250935, "learning_rate": 1e-06, "loss": -0.0123, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15643.0, "completions/max_terminated_length": 15643.0, "completions/mean_length": 6078.6015625, "completions/mean_terminated_length": 6078.6015625, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.40998007357120514, "epoch": 3.9921052631578946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 766671711.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0965890884399414, "sampling/importance_sampling_ratio/min": 0.0032699191942811012, "sampling/sampling_logp_difference/max": 5.722990036010742, "sampling/sampling_logp_difference/mean": 0.1614089161157608, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40980038046836853, "epoch": 3.9947368421052634, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40817686915397644, "epoch": 3.9973684210526317, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41471439599990845, "epoch": 4.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1520 }, { "epoch": 4.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.002685546875, "eval_completions/max_length": 12391.40625, "eval_completions/max_terminated_length": 11695.40625, "eval_completions/mean_length": 4060.523193359375, "eval_completions/mean_terminated_length": 4028.744041442871, "eval_completions/min_length": 549.125, "eval_completions/min_terminated_length": 549.125, "eval_entropy": 0.41615553945302963, "eval_frac_reward_zero_std": 0.96484375, "eval_loss": 0.001882668468169868, "eval_num_tokens": 766671711.0, "eval_reward": 0.8970764521509409, "eval_reward_std": 0.009239628736395389, "eval_rewards/progression_diversity/mean": 0.0, "eval_rewards/progression_diversity/std": 0.0, "eval_rewards/symbolic_reward_accuracy/mean": 0.996826171875, "eval_rewards/symbolic_reward_accuracy/std": 0.027467404725030065, "eval_rewards/symbolic_reward_partial_score/mean": 0.9973347969353199, "eval_rewards/symbolic_reward_partial_score/std": 0.02281399435014464, "eval_rewards/tag_count_reward/mean": -0.002197265625, "eval_rewards/tag_count_reward/std": 0.019685723586007953, "eval_runtime": 6788.6295, "eval_samples_per_second": 0.037, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0959402099251747, "eval_sampling/importance_sampling_ratio/min": 0.004207178114710872, "eval_sampling/sampling_logp_difference/max": 6.028235450387001, "eval_sampling/sampling_logp_difference/mean": 0.16201621294021606, "eval_steps_per_second": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15460.0, "completions/mean_length": 6646.126953125, "completions/mean_terminated_length": 6588.73291015625, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "entropy": 0.4054238498210907, "epoch": 4.002631578947368, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004806513898074627, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 770491008.0, "reward": 0.8946289420127869, "reward_std": 0.01682901941239834, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0953829288482666, "sampling/importance_sampling_ratio/min": 0.0013598940568044782, "sampling/sampling_logp_difference/max": 6.600348472595215, "sampling/sampling_logp_difference/mean": 0.15978649258613586, "step": 1521 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4089938700199127, "epoch": 4.005263157894737, "grad_norm": 0.005528689827769995, "learning_rate": 1e-06, "loss": -0.0126, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39951440691947937, "epoch": 4.007894736842105, "grad_norm": 0.005525888409465551, "learning_rate": 1e-06, "loss": 0.0142, "step": 1523 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4093039333820343, "epoch": 4.010526315789473, "grad_norm": 0.017072586342692375, "learning_rate": 1e-06, "loss": 0.0332, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16176.0, "completions/mean_length": 6063.529296875, "completions/mean_terminated_length": 6002.70166015625, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.4164528548717499, "epoch": 4.0131578947368425, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0074941301718354225, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 773977647.0, "reward": 0.8899414539337158, "reward_std": 0.021436430513858795, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9918619394302368, "rewards/symbolic_reward_partial_score/std": 0.08334289491176605, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0974147319793701, "sampling/importance_sampling_ratio/min": 0.003898089053109288, "sampling/sampling_logp_difference/max": 5.547268867492676, "sampling/sampling_logp_difference/mean": 0.16334551572799683, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4130219519138336, "epoch": 4.015789473684211, "grad_norm": 0.007226323243230581, "learning_rate": 1e-06, "loss": 0.0012, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.415419265627861, "epoch": 4.018421052631579, "grad_norm": 0.019122513011097908, "learning_rate": 1e-06, "loss": 0.0173, "step": 1527 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41308580338954926, "epoch": 4.021052631578947, "grad_norm": 0.012223465368151665, "learning_rate": 1e-06, "loss": -0.0085, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14760.0, "completions/max_terminated_length": 14760.0, "completions/mean_length": 5919.271484375, "completions/mean_terminated_length": 5919.271484375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.40429234504699707, "epoch": 4.023684210526316, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004486248828470707, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 777407130.0, "reward": 0.8956055045127869, "reward_std": 0.017578125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.046829111874103546, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0959361791610718, "sampling/importance_sampling_ratio/min": 0.002474784152582288, "sampling/sampling_logp_difference/max": 6.0016021728515625, "sampling/sampling_logp_difference/mean": 0.16091609001159668, "step": 1529 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40773653984069824, "epoch": 4.026315789473684, "grad_norm": 0.029880456626415253, "learning_rate": 1e-06, "loss": 0.0154, "step": 1530 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4094361662864685, "epoch": 4.028947368421052, "grad_norm": 0.003929602447897196, "learning_rate": 1e-06, "loss": -0.0109, "step": 1531 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40926802158355713, "epoch": 4.031578947368421, "grad_norm": 0.026355089619755745, "learning_rate": 1e-06, "loss": 0.0023, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16256.0, "completions/mean_length": 6261.3828125, "completions/mean_terminated_length": 6201.72119140625, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "entropy": 0.4165153503417969, "epoch": 4.03421052631579, "frac_reward_zero_std": 0.90625, "grad_norm": 0.027090586721897125, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 781001086.0, "reward": 0.8941406607627869, "reward_std": 0.0234375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.994140625, "rewards/symbolic_reward_partial_score/std": 0.07639661431312561, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.099013328552246, "sampling/importance_sampling_ratio/min": 0.003709872020408511, "sampling/sampling_logp_difference/max": 5.596757888793945, "sampling/sampling_logp_difference/mean": 0.16502097249031067, "step": 1533 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42090509831905365, "epoch": 4.036842105263158, "grad_norm": 0.005345202051103115, "learning_rate": 1e-06, "loss": -0.0137, "step": 1534 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4249906986951828, "epoch": 4.0394736842105265, "grad_norm": 0.004722771234810352, "learning_rate": 1e-06, "loss": 0.0193, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42054611444473267, "epoch": 4.042105263157895, "grad_norm": 0.03426428511738777, "learning_rate": 1e-06, "loss": 0.0182, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16173.0, "completions/mean_length": 7508.78515625, "completions/mean_terminated_length": 7367.9091796875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.4168701469898224, "epoch": 4.044736842105263, "frac_reward_zero_std": 0.8125, "grad_norm": 0.04696556553244591, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 785229360.0, "reward": 0.8848145008087158, "reward_std": 0.05104056000709534, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.98583984375, "rewards/symbolic_reward_partial_score/std": 0.11670491099357605, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.09952974319458, "sampling/importance_sampling_ratio/min": 0.0002965867461171001, "sampling/sampling_logp_difference/max": 8.123170852661133, "sampling/sampling_logp_difference/mean": 0.16504105925559998, "step": 1537 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42273393273353577, "epoch": 4.0473684210526315, "grad_norm": 0.029424453154206276, "learning_rate": 1e-06, "loss": -0.0045, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41807372868061066, "epoch": 4.05, "grad_norm": 0.04837526008486748, "learning_rate": 1e-06, "loss": 0.03, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42169442772865295, "epoch": 4.052631578947368, "grad_norm": 0.012585725635290146, "learning_rate": 1e-06, "loss": 0.0217, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16019.0, "completions/mean_length": 6282.087890625, "completions/mean_terminated_length": 6222.54833984375, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 0.4269144833087921, "epoch": 4.0552631578947365, "frac_reward_zero_std": 0.875, "grad_norm": 0.043204013258218765, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 788829245.0, "reward": 0.8893066644668579, "reward_std": 0.03330982103943825, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.98974609375, "rewards/symbolic_reward_partial_score/std": 0.09900352358818054, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1021716594696045, "sampling/importance_sampling_ratio/min": 0.0014063130365684628, "sampling/sampling_logp_difference/max": 6.566783905029297, "sampling/sampling_logp_difference/mean": 0.1688862293958664, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43069496750831604, "epoch": 4.057894736842106, "grad_norm": 0.007464423310011625, "learning_rate": 1e-06, "loss": -0.0208, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42454932630062103, "epoch": 4.060526315789474, "grad_norm": 0.004672233480960131, "learning_rate": 1e-06, "loss": 0.0331, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43463513255119324, "epoch": 4.063157894736842, "grad_norm": 0.007852221839129925, "learning_rate": 1e-06, "loss": -0.0197, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15856.0, "completions/mean_length": 6347.244140625, "completions/mean_terminated_length": 6208.12109375, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "entropy": 0.4267430305480957, "epoch": 4.065789473684211, "frac_reward_zero_std": 0.875, "grad_norm": 0.03230677545070648, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 792470490.0, "reward": 0.8748535513877869, "reward_std": 0.038360513746738434, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.966796875, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.9871419072151184, "rewards/symbolic_reward_partial_score/std": 0.09331966191530228, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1025676727294922, "sampling/importance_sampling_ratio/min": 0.0020366902463138103, "sampling/sampling_logp_difference/max": 6.196429252624512, "sampling/sampling_logp_difference/mean": 0.16911377012729645, "step": 1545 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42655228078365326, "epoch": 4.068421052631579, "grad_norm": 0.008896084502339363, "learning_rate": 1e-06, "loss": 0.0109, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4267456531524658, "epoch": 4.071052631578947, "grad_norm": 0.02747557871043682, "learning_rate": 1e-06, "loss": 0.0301, "step": 1547 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43166837096214294, "epoch": 4.073684210526316, "grad_norm": 0.01020568236708641, "learning_rate": 1e-06, "loss": -0.0091, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16217.0, "completions/mean_length": 6304.05859375, "completions/mean_terminated_length": 6184.53369140625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.4209160953760147, "epoch": 4.076315789473684, "frac_reward_zero_std": 0.84375, "grad_norm": 0.010705246590077877, "learning_rate": 1e-06, "loss": -0.0235, "num_tokens": 796122328.0, "reward": 0.8836842775344849, "reward_std": 0.04094620794057846, "rewards/progression_diversity/mean": -0.0007134145707823336, "rewards/progression_diversity/std": 0.011619010008871555, "rewards/symbolic_reward_accuracy/mean": 0.98046875, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.9886067509651184, "rewards/symbolic_reward_partial_score/std": 0.09963169693946838, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1002486944198608, "sampling/importance_sampling_ratio/min": 6.994994583564562e-10, "sampling/sampling_logp_difference/max": 21.080656051635742, "sampling/sampling_logp_difference/mean": 0.16513851284980774, "step": 1549 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4146343171596527, "epoch": 4.078947368421052, "grad_norm": 0.036181289702653885, "learning_rate": 1e-06, "loss": 0.0306, "step": 1550 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.42250366508960724, "epoch": 4.081578947368421, "grad_norm": 0.009834694676101208, "learning_rate": 1e-06, "loss": -0.015, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4073076993227005, "epoch": 4.08421052631579, "grad_norm": 0.020931914448738098, "learning_rate": 1e-06, "loss": 0.069, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15545.0, "completions/mean_length": 6187.69140625, "completions/mean_terminated_length": 6127.595703125, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "entropy": 0.4314107894897461, "epoch": 4.086842105263158, "frac_reward_zero_std": 0.90625, "grad_norm": 0.026285408064723015, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 799691226.0, "reward": 0.8889627456665039, "reward_std": 0.02758970484137535, "rewards/progression_diversity/mean": -0.000214396117371507, "rewards/progression_diversity/std": 0.004851229954510927, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9925130009651184, "rewards/symbolic_reward_partial_score/std": 0.07870955020189285, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1033068895339966, "sampling/importance_sampling_ratio/min": 0.0012025631731376052, "sampling/sampling_logp_difference/max": 6.723299980163574, "sampling/sampling_logp_difference/mean": 0.16940413415431976, "step": 1553 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4321361780166626, "epoch": 4.089473684210526, "grad_norm": 0.005617157090455294, "learning_rate": 1e-06, "loss": 0.0042, "step": 1554 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4312334358692169, "epoch": 4.092105263157895, "grad_norm": 0.0064244051463902, "learning_rate": 1e-06, "loss": -0.0063, "step": 1555 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4245026558637619, "epoch": 4.094736842105263, "grad_norm": 0.009505083784461021, "learning_rate": 1e-06, "loss": 0.026, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16373.0, "completions/mean_length": 5854.384765625, "completions/mean_terminated_length": 5750.54248046875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "entropy": 0.4277113676071167, "epoch": 4.097368421052631, "frac_reward_zero_std": 0.90625, "grad_norm": 0.00943253468722105, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 803081951.0, "reward": 0.8873047232627869, "reward_std": 0.028727315366268158, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.0788503959774971, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.102722406387329, "sampling/importance_sampling_ratio/min": 0.0038275739643722773, "sampling/sampling_logp_difference/max": 5.565524101257324, "sampling/sampling_logp_difference/mean": 0.16877815127372742, "step": 1557 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.43056604266166687, "epoch": 4.1, "grad_norm": 0.007483618333935738, "learning_rate": 1e-06, "loss": -0.0265, "step": 1558 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4222365468740463, "epoch": 4.102631578947369, "grad_norm": 0.006830757483839989, "learning_rate": 1e-06, "loss": 0.0584, "step": 1559 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42999759316444397, "epoch": 4.105263157894737, "grad_norm": 0.00895257294178009, "learning_rate": 1e-06, "loss": 0.0088, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16203.0, "completions/mean_length": 5741.98046875, "completions/mean_terminated_length": 5658.18505859375, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "entropy": 0.43271204829216003, "epoch": 4.1078947368421055, "frac_reward_zero_std": 0.90625, "grad_norm": 0.017555126920342445, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 806403957.0, "reward": 0.8857421875, "reward_std": 0.029332052916288376, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.990234375, "rewards/symbolic_reward_partial_score/std": 0.09037207812070847, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.103397011756897, "sampling/importance_sampling_ratio/min": 0.0017045103013515472, "sampling/sampling_logp_difference/max": 6.374477386474609, "sampling/sampling_logp_difference/mean": 0.1705881506204605, "step": 1561 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43101418018341064, "epoch": 4.110526315789474, "grad_norm": 0.03278311342000961, "learning_rate": 1e-06, "loss": 0.0296, "step": 1562 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43013016879558563, "epoch": 4.113157894736842, "grad_norm": 0.02056693658232689, "learning_rate": 1e-06, "loss": 0.0316, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4330628216266632, "epoch": 4.11578947368421, "grad_norm": 0.010471044108271599, "learning_rate": 1e-06, "loss": -0.0116, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16296.0, "completions/mean_length": 6214.228515625, "completions/mean_terminated_length": 6134.1513671875, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "entropy": 0.42355310916900635, "epoch": 4.118421052631579, "frac_reward_zero_std": 0.90625, "grad_norm": 0.02259978838264942, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 810003466.0, "reward": 0.8831055164337158, "reward_std": 0.02655228227376938, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9931640625, "rewards/symbolic_reward_partial_score/std": 0.05717618763446808, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1020596027374268, "sampling/importance_sampling_ratio/min": 0.001632677624002099, "sampling/sampling_logp_difference/max": 6.417533874511719, "sampling/sampling_logp_difference/mean": 0.16794228553771973, "step": 1565 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4257608652114868, "epoch": 4.121052631578947, "grad_norm": 0.01597507856786251, "learning_rate": 1e-06, "loss": -0.0108, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.422183558344841, "epoch": 4.123684210526315, "grad_norm": 0.0146177401766181, "learning_rate": 1e-06, "loss": 0.022, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4265965670347214, "epoch": 4.126315789473685, "grad_norm": 0.014027196913957596, "learning_rate": 1e-06, "loss": 0.0178, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13301.0, "completions/max_terminated_length": 13301.0, "completions/mean_length": 5375.0234375, "completions/mean_terminated_length": 5375.0234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "entropy": 0.45230963826179504, "epoch": 4.128947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 813124598.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.106469750404358, "sampling/importance_sampling_ratio/min": 0.0017879101214930415, "sampling/sampling_logp_difference/max": 6.32670783996582, "sampling/sampling_logp_difference/mean": 0.17562320828437805, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4437611997127533, "epoch": 4.131578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44860294461250305, "epoch": 4.13421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4509068727493286, "epoch": 4.136842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15169.0, "completions/mean_length": 6123.830078125, "completions/mean_terminated_length": 6002.16845703125, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.4409501701593399, "epoch": 4.139473684210526, "frac_reward_zero_std": 0.9375, "grad_norm": 0.012426611967384815, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 816643455.0, "reward": 0.8839844465255737, "reward_std": 0.022723259404301643, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9856770634651184, "rewards/symbolic_reward_partial_score/std": 0.11662676930427551, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1045918464660645, "sampling/importance_sampling_ratio/min": 0.0016437529120594263, "sampling/sampling_logp_difference/max": 6.410773277282715, "sampling/sampling_logp_difference/mean": 0.17237232625484467, "step": 1573 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43849921226501465, "epoch": 4.1421052631578945, "grad_norm": 0.009212223812937737, "learning_rate": 1e-06, "loss": -0.0113, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43927159905433655, "epoch": 4.144736842105263, "grad_norm": 0.030542606487870216, "learning_rate": 1e-06, "loss": 0.0374, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4394027143716812, "epoch": 4.147368421052631, "grad_norm": 0.0145005127415061, "learning_rate": 1e-06, "loss": 0.0166, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16314.0, "completions/mean_length": 6081.587890625, "completions/mean_terminated_length": 5979.986328125, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "entropy": 0.4372425526380539, "epoch": 4.15, "frac_reward_zero_std": 0.96875, "grad_norm": 0.01591624692082405, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 820172044.0, "reward": 0.8863281607627869, "reward_std": 0.014596348628401756, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9889322519302368, "rewards/symbolic_reward_partial_score/std": 0.09994078427553177, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1035807132720947, "sampling/importance_sampling_ratio/min": 0.0019331310177221894, "sampling/sampling_logp_difference/max": 6.248614311218262, "sampling/sampling_logp_difference/mean": 0.17094659805297852, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4351629614830017, "epoch": 4.152631578947369, "grad_norm": 0.011278823018074036, "learning_rate": 1e-06, "loss": 0.003, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4317953735589981, "epoch": 4.155263157894737, "grad_norm": 0.008484586142003536, "learning_rate": 1e-06, "loss": 0.0044, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43848468363285065, "epoch": 4.157894736842105, "grad_norm": 0.012261978350579739, "learning_rate": 1e-06, "loss": 0.0166, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16146.0, "completions/mean_length": 6845.169921875, "completions/mean_terminated_length": 6788.94921875, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "entropy": 0.4121706932783127, "epoch": 4.160526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.01333274319767952, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 824111459.0, "reward": 0.8948242664337158, "reward_std": 0.010836118832230568, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.099442958831787, "sampling/importance_sampling_ratio/min": 0.0014132558135315776, "sampling/sampling_logp_difference/max": 6.561859130859375, "sampling/sampling_logp_difference/mean": 0.1648731529712677, "step": 1581 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41776856780052185, "epoch": 4.163157894736842, "grad_norm": 0.01982933096587658, "learning_rate": 1e-06, "loss": 0.0318, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41750428080558777, "epoch": 4.16578947368421, "grad_norm": 0.0046440716832876205, "learning_rate": 1e-06, "loss": -0.0121, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41665056347846985, "epoch": 4.168421052631579, "grad_norm": 0.0064771235920488834, "learning_rate": 1e-06, "loss": -0.0122, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16176.0, "completions/mean_length": 6556.806640625, "completions/mean_terminated_length": 6440.27880859375, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "entropy": 0.430302232503891, "epoch": 4.171052631578948, "frac_reward_zero_std": 0.9375, "grad_norm": 0.02541787549853325, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 827871008.0, "reward": 0.8846679925918579, "reward_std": 0.024891898036003113, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9873046875, "rewards/symbolic_reward_partial_score/std": 0.10836639255285263, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1027357578277588, "sampling/importance_sampling_ratio/min": 0.002878451021388173, "sampling/sampling_logp_difference/max": 5.850502967834473, "sampling/sampling_logp_difference/mean": 0.16930317878723145, "step": 1585 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43265989422798157, "epoch": 4.173684210526316, "grad_norm": 0.025225359946489334, "learning_rate": 1e-06, "loss": 0.0248, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43102091550827026, "epoch": 4.176315789473684, "grad_norm": 0.011354782618582249, "learning_rate": 1e-06, "loss": -0.0323, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43173354864120483, "epoch": 4.178947368421053, "grad_norm": 0.015644771978259087, "learning_rate": 1e-06, "loss": 0.0061, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15771.0, "completions/mean_length": 6050.693359375, "completions/mean_terminated_length": 5844.8505859375, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "entropy": 0.4312874674797058, "epoch": 4.181578947368421, "frac_reward_zero_std": 0.84375, "grad_norm": 0.010100515559315681, "learning_rate": 1e-06, "loss": -0.0378, "num_tokens": 831355427.0, "reward": 0.8802632093429565, "reward_std": 0.048109594732522964, "rewards/progression_diversity/mean": -0.0010256515815854073, "rewards/progression_diversity/std": 0.017782118171453476, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9830729365348816, "rewards/symbolic_reward_partial_score/std": 0.1252782791852951, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1018785238265991, "sampling/importance_sampling_ratio/min": 0.002515235682949424, "sampling/sampling_logp_difference/max": 5.98538875579834, "sampling/sampling_logp_difference/mean": 0.16809441149234772, "step": 1589 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4306061118841171, "epoch": 4.184210526315789, "grad_norm": 0.012110080569982529, "learning_rate": 1e-06, "loss": 0.0058, "step": 1590 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4216834008693695, "epoch": 4.186842105263158, "grad_norm": 0.046260882169008255, "learning_rate": 1e-06, "loss": 0.09, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4289931207895279, "epoch": 4.189473684210526, "grad_norm": 0.02653714455664158, "learning_rate": 1e-06, "loss": 0.0196, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16271.0, "completions/mean_length": 5844.58984375, "completions/mean_terminated_length": 5823.96484375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "entropy": 0.43429282307624817, "epoch": 4.192105263157894, "frac_reward_zero_std": 0.96875, "grad_norm": 0.005181339103728533, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 834756177.0, "reward": 0.8955078125, "reward_std": 0.009929356165230274, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9973958134651184, "rewards/symbolic_reward_partial_score/std": 0.0453747883439064, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1035181283950806, "sampling/importance_sampling_ratio/min": 0.003554050112143159, "sampling/sampling_logp_difference/max": 5.639667510986328, "sampling/sampling_logp_difference/mean": 0.17058652639389038, "step": 1593 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4330582320690155, "epoch": 4.1947368421052635, "grad_norm": 0.004463242832571268, "learning_rate": 1e-06, "loss": 0.013, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43694183230400085, "epoch": 4.197368421052632, "grad_norm": 0.006396851968020201, "learning_rate": 1e-06, "loss": -0.0119, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43564656376838684, "epoch": 4.2, "grad_norm": 0.0059556555934250355, "learning_rate": 1e-06, "loss": 0.0029, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15562.0, "completions/mean_length": 5982.173828125, "completions/mean_terminated_length": 5941.3828125, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.43185955286026, "epoch": 4.2026315789473685, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0054462142288684845, "learning_rate": 1e-06, "loss": 0.0447, "num_tokens": 838243210.0, "reward": 0.892578125, "reward_std": 0.02500000223517418, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.08812850713729858, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1030385494232178, "sampling/importance_sampling_ratio/min": 0.0011941633420065045, "sampling/sampling_logp_difference/max": 6.73030948638916, "sampling/sampling_logp_difference/mean": 0.16941289603710175, "step": 1597 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4259839355945587, "epoch": 4.205263157894737, "grad_norm": 0.0255422480404377, "learning_rate": 1e-06, "loss": 0.0276, "step": 1598 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4351424425840378, "epoch": 4.207894736842105, "grad_norm": 0.0039727105759084225, "learning_rate": 1e-06, "loss": -0.0171, "step": 1599 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4342169910669327, "epoch": 4.2105263157894735, "grad_norm": 0.006120329722762108, "learning_rate": 1e-06, "loss": -0.018, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16173.0, "completions/max_terminated_length": 16173.0, "completions/mean_length": 5221.826171875, "completions/mean_terminated_length": 5221.826171875, "completions/min_length": 1132.0, "completions/min_terminated_length": 1132.0, "entropy": 0.4448789060115814, "epoch": 4.213157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.006611510179936886, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 841323601.0, "reward": 0.896191418170929, "reward_std": 0.01201616507023573, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.012732770293951035, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1065080165863037, "sampling/importance_sampling_ratio/min": 0.003243516432121396, "sampling/sampling_logp_difference/max": 5.731097221374512, "sampling/sampling_logp_difference/mean": 0.17428599298000336, "step": 1601 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4407072216272354, "epoch": 4.215789473684211, "grad_norm": 0.0024684646632522345, "learning_rate": 1e-06, "loss": -0.0126, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4452795833349228, "epoch": 4.218421052631579, "grad_norm": 0.01824215240776539, "learning_rate": 1e-06, "loss": -0.0013, "step": 1603 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.444363608956337, "epoch": 4.221052631578948, "grad_norm": 0.03171297162771225, "learning_rate": 1e-06, "loss": 0.0212, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15709.0, "completions/mean_length": 5482.19921875, "completions/mean_terminated_length": 5439.447265625, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.43427589535713196, "epoch": 4.223684210526316, "frac_reward_zero_std": 0.9375, "grad_norm": 0.02045847475528717, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 844540311.0, "reward": 0.8917969465255737, "reward_std": 0.01876388117671013, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9928385019302368, "rewards/symbolic_reward_partial_score/std": 0.07976873219013214, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.104947805404663, "sampling/importance_sampling_ratio/min": 0.0028438337612897158, "sampling/sampling_logp_difference/max": 5.862602233886719, "sampling/sampling_logp_difference/mean": 0.17266666889190674, "step": 1605 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.43635350465774536, "epoch": 4.226315789473684, "grad_norm": 0.026922380551695824, "learning_rate": 1e-06, "loss": -0.0031, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.44073161482810974, "epoch": 4.228947368421053, "grad_norm": 0.0053346725180745125, "learning_rate": 1e-06, "loss": 0.0104, "step": 1607 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.43568021059036255, "epoch": 4.231578947368421, "grad_norm": 0.006379115395247936, "learning_rate": 1e-06, "loss": -0.0014, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15342.0, "completions/mean_length": 5839.40625, "completions/mean_terminated_length": 5735.416015625, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "entropy": 0.4302578419446945, "epoch": 4.234210526315789, "frac_reward_zero_std": 0.84375, "grad_norm": 0.007278991863131523, "learning_rate": 1e-06, "loss": 0.061, "num_tokens": 847929543.0, "reward": 0.8886668682098389, "reward_std": 0.03985070437192917, "rewards/progression_diversity/mean": -0.0005020540556870401, "rewards/progression_diversity/std": 0.011360187083482742, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.98828125, "rewards/symbolic_reward_partial_score/std": 0.10772226005792618, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.10491144657135, "sampling/importance_sampling_ratio/min": 0.0031260817777365446, "sampling/sampling_logp_difference/max": 5.767974853515625, "sampling/sampling_logp_difference/mean": 0.17166000604629517, "step": 1609 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4342924505472183, "epoch": 4.2368421052631575, "grad_norm": 0.007170964498072863, "learning_rate": 1e-06, "loss": -0.0124, "step": 1610 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.4330960214138031, "epoch": 4.239473684210527, "grad_norm": 0.004011278972029686, "learning_rate": 1e-06, "loss": 0.0008, "step": 1611 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4380767345428467, "epoch": 4.242105263157895, "grad_norm": 0.03263184055685997, "learning_rate": 1e-06, "loss": -0.0032, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16354.0, "completions/mean_length": 6162.76171875, "completions/mean_terminated_length": 6122.6787109375, "completions/min_length": 1696.0, "completions/min_terminated_length": 1696.0, "entropy": 0.4307510554790497, "epoch": 4.244736842105263, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0063966186717152596, "learning_rate": 1e-06, "loss": -0.0178, "num_tokens": 851494701.0, "reward": 0.8922815322875977, "reward_std": 0.02307000756263733, "rewards/progression_diversity/mean": -0.0003613182343542576, "rewards/progression_diversity/std": 0.008175698108971119, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.06366384774446487, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1048433780670166, "sampling/importance_sampling_ratio/min": 0.0027767803985625505, "sampling/sampling_logp_difference/max": 5.886463165283203, "sampling/sampling_logp_difference/mean": 0.17114081978797913, "step": 1613 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4304428696632385, "epoch": 4.247368421052632, "grad_norm": 0.01738375425338745, "learning_rate": 1e-06, "loss": 0.0299, "step": 1614 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4284074455499649, "epoch": 4.25, "grad_norm": 0.03195277974009514, "learning_rate": 1e-06, "loss": 0.0284, "step": 1615 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4310337007045746, "epoch": 4.252631578947368, "grad_norm": 0.005885908380150795, "learning_rate": 1e-06, "loss": 0.0026, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15560.0, "completions/mean_length": 5092.703125, "completions/mean_terminated_length": 5070.6064453125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "entropy": 0.43830157816410065, "epoch": 4.255263157894737, "frac_reward_zero_std": 0.96875, "grad_norm": 0.014152695424854755, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 854488949.0, "reward": 0.8940896987915039, "reward_std": 0.010815596207976341, "rewards/progression_diversity/mean": -0.00021749193547293544, "rewards/progression_diversity/std": 0.004921280778944492, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99658203125, "rewards/symbolic_reward_partial_score/std": 0.04808502271771431, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1073800325393677, "sampling/importance_sampling_ratio/min": 0.003663417650386691, "sampling/sampling_logp_difference/max": 5.609358787536621, "sampling/sampling_logp_difference/mean": 0.17454031109809875, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43388527631759644, "epoch": 4.257894736842105, "grad_norm": 0.013832170516252518, "learning_rate": 1e-06, "loss": 0.0348, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4379284530878067, "epoch": 4.260526315789473, "grad_norm": 0.0061399731785058975, "learning_rate": 1e-06, "loss": -0.0103, "step": 1619 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.44142892956733704, "epoch": 4.2631578947368425, "grad_norm": 0.004335321485996246, "learning_rate": 1e-06, "loss": -0.01, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10867.0, "completions/max_terminated_length": 10867.0, "completions/mean_length": 4823.498046875, "completions/mean_terminated_length": 4823.498046875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "entropy": 0.44278237223625183, "epoch": 4.265789473684211, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023667393252253532, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 857325332.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1084308624267578, "sampling/importance_sampling_ratio/min": 0.0014824430691078305, "sampling/sampling_logp_difference/max": 6.514063835144043, "sampling/sampling_logp_difference/mean": 0.17682185769081116, "step": 1621 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4408983588218689, "epoch": 4.268421052631579, "grad_norm": 0.001729672192595899, "learning_rate": 1e-06, "loss": -0.003, "step": 1622 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.44144123792648315, "epoch": 4.271052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0029, "step": 1623 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4441017508506775, "epoch": 4.273684210526316, "grad_norm": 0.001630808925256133, "learning_rate": 1e-06, "loss": -0.0026, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16212.0, "completions/mean_length": 6106.8125, "completions/mean_terminated_length": 6005.45947265625, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "entropy": 0.4158805012702942, "epoch": 4.276315789473684, "frac_reward_zero_std": 0.78125, "grad_norm": 0.03734549134969711, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 860881716.0, "reward": 0.8811489343643188, "reward_std": 0.05394713953137398, "rewards/progression_diversity/mean": -0.0003459767031017691, "rewards/progression_diversity/std": 0.007828558795154095, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9873046875, "rewards/symbolic_reward_partial_score/std": 0.10136845707893372, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1023050546646118, "sampling/importance_sampling_ratio/min": 0.0010799751617014408, "sampling/sampling_logp_difference/max": 6.830817222595215, "sampling/sampling_logp_difference/mean": 0.16777199506759644, "step": 1625 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.41941526532173157, "epoch": 4.278947368421052, "grad_norm": 0.0061073992401361465, "learning_rate": 1e-06, "loss": -0.031, "step": 1626 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.4139310121536255, "epoch": 4.281578947368421, "grad_norm": 0.03639233484864235, "learning_rate": 1e-06, "loss": 0.0638, "step": 1627 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.413208544254303, "epoch": 4.284210526315789, "grad_norm": 0.03637981042265892, "learning_rate": 1e-06, "loss": -0.0054, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15321.0, "completions/mean_length": 5594.134765625, "completions/mean_terminated_length": 5487.72607421875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "entropy": 0.42244474589824677, "epoch": 4.286842105263158, "frac_reward_zero_std": 0.84375, "grad_norm": 0.021670928224921227, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 864137369.0, "reward": 0.8787109851837158, "reward_std": 0.035904426127672195, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9869791269302368, "rewards/symbolic_reward_partial_score/std": 0.09637792408466339, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1029551029205322, "sampling/importance_sampling_ratio/min": 0.0018105923663824797, "sampling/sampling_logp_difference/max": 6.314101219177246, "sampling/sampling_logp_difference/mean": 0.16864517331123352, "step": 1629 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4186270236968994, "epoch": 4.2894736842105265, "grad_norm": 0.03033420816063881, "learning_rate": 1e-06, "loss": 0.043, "step": 1630 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.4187678396701813, "epoch": 4.292105263157895, "grad_norm": 0.009419787675142288, "learning_rate": 1e-06, "loss": 0.0291, "step": 1631 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4157967120409012, "epoch": 4.294736842105263, "grad_norm": 0.015215841121971607, "learning_rate": 1e-06, "loss": -0.0297, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14047.0, "completions/mean_length": 5093.275390625, "completions/mean_terminated_length": 5071.18017578125, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.42294587194919586, "epoch": 4.2973684210526315, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0063886987045407295, "learning_rate": 1e-06, "loss": -0.0193, "num_tokens": 867147430.0, "reward": 0.8928711414337158, "reward_std": 0.02124902978539467, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, "rewards/symbolic_reward_partial_score/std": 0.04821429029107094, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.102891206741333, "sampling/importance_sampling_ratio/min": 0.0013397879665717483, "sampling/sampling_logp_difference/max": 6.615243911743164, "sampling/sampling_logp_difference/mean": 0.16851578652858734, "step": 1633 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.41886942088603973, "epoch": 4.3, "grad_norm": 0.005276647862046957, "learning_rate": 1e-06, "loss": 0.0022, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41886237263679504, "epoch": 4.302631578947368, "grad_norm": 0.024588752537965775, "learning_rate": 1e-06, "loss": 0.0189, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41815611720085144, "epoch": 4.3052631578947365, "grad_norm": 0.006106048356741667, "learning_rate": 1e-06, "loss": 0.0137, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16140.0, "completions/mean_length": 5248.478515625, "completions/mean_terminated_length": 5116.43701171875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.4239061772823334, "epoch": 4.307894736842106, "frac_reward_zero_std": 0.875, "grad_norm": 0.012889496050775051, "learning_rate": 1e-06, "loss": -0.0404, "num_tokens": 870187227.0, "reward": 0.8755371570587158, "reward_std": 0.03623192757368088, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.98486328125, "rewards/symbolic_reward_partial_score/std": 0.11010749638080597, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1034862995147705, "sampling/importance_sampling_ratio/min": 0.0015555914724245667, "sampling/sampling_logp_difference/max": 6.465899467468262, "sampling/sampling_logp_difference/mean": 0.1695464551448822, "step": 1637 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.42418769001960754, "epoch": 4.310526315789474, "grad_norm": 0.023644376546144485, "learning_rate": 1e-06, "loss": 0.025, "step": 1638 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4248455762863159, "epoch": 4.313157894736842, "grad_norm": 0.012486502528190613, "learning_rate": 1e-06, "loss": 0.0465, "step": 1639 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4236167073249817, "epoch": 4.315789473684211, "grad_norm": 0.022313551977276802, "learning_rate": 1e-06, "loss": 0.0121, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15936.0, "completions/mean_length": 5908.37890625, "completions/mean_terminated_length": 5867.29833984375, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "entropy": 0.40725649893283844, "epoch": 4.318421052631579, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02419469691812992, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 873633469.0, "reward": 0.8699198961257935, "reward_std": 0.05075995996594429, "rewards/progression_diversity/mean": -0.0001967435673577711, "rewards/progression_diversity/std": 0.00445179920643568, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9830729365348816, "rewards/symbolic_reward_partial_score/std": 0.10651702433824539, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0998890399932861, "sampling/importance_sampling_ratio/min": 7.278633518126298e-08, "sampling/sampling_logp_difference/max": 16.43573760986328, "sampling/sampling_logp_difference/mean": 0.1641865074634552, "step": 1641 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4058327078819275, "epoch": 4.321052631578947, "grad_norm": 0.020586229860782623, "learning_rate": 1e-06, "loss": 0.0495, "step": 1642 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.4100194275379181, "epoch": 4.323684210526316, "grad_norm": 0.007098539266735315, "learning_rate": 1e-06, "loss": -0.0031, "step": 1643 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4071105271577835, "epoch": 4.326315789473684, "grad_norm": 0.013148884288966656, "learning_rate": 1e-06, "loss": -0.0018, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15779.0, "completions/mean_length": 6241.224609375, "completions/mean_terminated_length": 6181.4443359375, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "entropy": 0.40303365886211395, "epoch": 4.328947368421053, "frac_reward_zero_std": 0.8125, "grad_norm": 0.043847598135471344, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 877233968.0, "reward": 0.87939453125, "reward_std": 0.04617775231599808, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9879557490348816, "rewards/symbolic_reward_partial_score/std": 0.08857493847608566, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.097895622253418, "sampling/importance_sampling_ratio/min": 0.0022770878858864307, "sampling/sampling_logp_difference/max": 6.084857940673828, "sampling/sampling_logp_difference/mean": 0.1614503711462021, "step": 1645 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3984634280204773, "epoch": 4.331578947368421, "grad_norm": 0.03127056732773781, "learning_rate": 1e-06, "loss": 0.0258, "step": 1646 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4001065492630005, "epoch": 4.33421052631579, "grad_norm": 0.03108837455511093, "learning_rate": 1e-06, "loss": 0.0189, "step": 1647 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4035009890794754, "epoch": 4.336842105263158, "grad_norm": 0.011295434087514877, "learning_rate": 1e-06, "loss": -0.0076, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14151.0, "completions/mean_length": 6002.21484375, "completions/mean_terminated_length": 5961.50244140625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "entropy": 0.40457485616207123, "epoch": 4.339473684210526, "frac_reward_zero_std": 0.8125, "grad_norm": 0.006858786102384329, "learning_rate": 1e-06, "loss": -0.0273, "num_tokens": 880709246.0, "reward": 0.8883789777755737, "reward_std": 0.03927573934197426, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9938151240348816, "rewards/symbolic_reward_partial_score/std": 0.057958170771598816, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0986275672912598, "sampling/importance_sampling_ratio/min": 0.00021476548863574862, "sampling/sampling_logp_difference/max": 8.445963859558105, "sampling/sampling_logp_difference/mean": 0.1622392237186432, "step": 1649 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.40529656410217285, "epoch": 4.342105263157895, "grad_norm": 0.00645934185013175, "learning_rate": 1e-06, "loss": 0.0389, "step": 1650 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4104660302400589, "epoch": 4.344736842105263, "grad_norm": 0.005867961794137955, "learning_rate": 1e-06, "loss": 0.038, "step": 1651 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4050469398498535, "epoch": 4.347368421052631, "grad_norm": 0.02269315905869007, "learning_rate": 1e-06, "loss": 0.0023, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16222.0, "completions/mean_length": 7054.04296875, "completions/mean_terminated_length": 6810.97802734375, "completions/min_length": 1325.0, "completions/min_terminated_length": 1325.0, "entropy": 0.3939700275659561, "epoch": 4.35, "frac_reward_zero_std": 0.71875, "grad_norm": 0.04180002585053444, "learning_rate": 1e-06, "loss": 0.0591, "num_tokens": 884724564.0, "reward": 0.8453124761581421, "reward_std": 0.08576792478561401, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.962890625, "rewards/symbolic_reward_partial_score/std": 0.1722983866930008, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0959928035736084, "sampling/importance_sampling_ratio/min": 0.0013346988707780838, "sampling/sampling_logp_difference/max": 6.619049549102783, "sampling/sampling_logp_difference/mean": 0.15839730203151703, "step": 1653 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3988970220088959, "epoch": 4.352631578947369, "grad_norm": 0.017516355961561203, "learning_rate": 1e-06, "loss": 0.0198, "step": 1654 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.39493827521800995, "epoch": 4.355263157894737, "grad_norm": 0.026788026094436646, "learning_rate": 1e-06, "loss": -0.0212, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3981803357601166, "epoch": 4.3578947368421055, "grad_norm": 0.015244071371853352, "learning_rate": 1e-06, "loss": -0.0179, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16082.0, "completions/mean_length": 6806.8125, "completions/mean_terminated_length": 6769.25537109375, "completions/min_length": 1648.0, "completions/min_terminated_length": 1648.0, "entropy": 0.40387649834156036, "epoch": 4.360526315789474, "frac_reward_zero_std": 0.84375, "grad_norm": 0.01023479737341404, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 888629300.0, "reward": 0.8820313215255737, "reward_std": 0.04264799878001213, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.98828125, "rewards/symbolic_reward_partial_score/std": 0.09484104067087173, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0977758169174194, "sampling/importance_sampling_ratio/min": 0.0022452680859714746, "sampling/sampling_logp_difference/max": 6.098930358886719, "sampling/sampling_logp_difference/mean": 0.16125650703907013, "step": 1657 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4043918401002884, "epoch": 4.363157894736842, "grad_norm": 0.010048780590295792, "learning_rate": 1e-06, "loss": -0.0192, "step": 1658 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4045111835002899, "epoch": 4.36578947368421, "grad_norm": 0.02937551587820053, "learning_rate": 1e-06, "loss": 0.052, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40271003544330597, "epoch": 4.368421052631579, "grad_norm": 0.024729048833251, "learning_rate": 1e-06, "loss": -0.0043, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15976.0, "completions/mean_length": 6973.9609375, "completions/mean_terminated_length": 6824.595703125, "completions/min_length": 1583.0, "completions/min_terminated_length": 1583.0, "entropy": 0.39718057215213776, "epoch": 4.371052631578947, "frac_reward_zero_std": 0.8125, "grad_norm": 0.044504936784505844, "learning_rate": 1e-06, "loss": 0.0623, "num_tokens": 892613280.0, "reward": 0.850830078125, "reward_std": 0.052683740854263306, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.97412109375, "rewards/symbolic_reward_partial_score/std": 0.13553021848201752, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0964837074279785, "sampling/importance_sampling_ratio/min": 0.002200217917561531, "sampling/sampling_logp_difference/max": 6.119198799133301, "sampling/sampling_logp_difference/mean": 0.1591547429561615, "step": 1661 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.39757077395915985, "epoch": 4.373684210526315, "grad_norm": 0.026913262903690338, "learning_rate": 1e-06, "loss": -0.0025, "step": 1662 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4004693180322647, "epoch": 4.376315789473685, "grad_norm": 0.015976862981915474, "learning_rate": 1e-06, "loss": 0.003, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39960332214832306, "epoch": 4.378947368421053, "grad_norm": 0.01917731761932373, "learning_rate": 1e-06, "loss": -0.0142, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15629.0, "completions/mean_length": 6234.939453125, "completions/mean_terminated_length": 6195.1396484375, "completions/min_length": 2091.0, "completions/min_terminated_length": 2091.0, "entropy": 0.4015529453754425, "epoch": 4.381578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.012103889137506485, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 896229569.0, "reward": 0.8820313215255737, "reward_std": 0.02522941492497921, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.06792477518320084, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0962879657745361, "sampling/importance_sampling_ratio/min": 0.002557792467996478, "sampling/sampling_logp_difference/max": 5.968610763549805, "sampling/sampling_logp_difference/mean": 0.1597835123538971, "step": 1665 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.401179775595665, "epoch": 4.38421052631579, "grad_norm": 0.014084015041589737, "learning_rate": 1e-06, "loss": -0.0059, "step": 1666 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4006345123052597, "epoch": 4.386842105263158, "grad_norm": 0.0064776502549648285, "learning_rate": 1e-06, "loss": 0.0336, "step": 1667 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4056100845336914, "epoch": 4.389473684210526, "grad_norm": 0.011185785755515099, "learning_rate": 1e-06, "loss": -0.0299, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15514.0, "completions/mean_length": 6391.01953125, "completions/mean_terminated_length": 6351.83154296875, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "entropy": 0.40403537452220917, "epoch": 4.3921052631578945, "frac_reward_zero_std": 0.9375, "grad_norm": 0.013687696307897568, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 899916107.0, "reward": 0.8818359375, "reward_std": 0.015566971153020859, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9908853769302368, "rewards/symbolic_reward_partial_score/std": 0.07147102802991867, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0975208282470703, "sampling/importance_sampling_ratio/min": 0.0017064865678548813, "sampling/sampling_logp_difference/max": 6.373318672180176, "sampling/sampling_logp_difference/mean": 0.16066737473011017, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40307632088661194, "epoch": 4.394736842105263, "grad_norm": 0.012635898776352406, "learning_rate": 1e-06, "loss": -0.0047, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4056246876716614, "epoch": 4.397368421052631, "grad_norm": 0.015188086777925491, "learning_rate": 1e-06, "loss": -0.0156, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4075080454349518, "epoch": 4.4, "grad_norm": 0.004119619727134705, "learning_rate": 1e-06, "loss": 0.0048, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16097.0, "completions/mean_length": 6159.755859375, "completions/mean_terminated_length": 6079.25, "completions/min_length": 1476.0, "completions/min_terminated_length": 1476.0, "entropy": 0.4071505665779114, "epoch": 4.402631578947369, "frac_reward_zero_std": 0.90625, "grad_norm": 0.012636536732316017, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 903464846.0, "reward": 0.8818359375, "reward_std": 0.029663637280464172, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9889322519302368, "rewards/symbolic_reward_partial_score/std": 0.09201135486364365, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0972017049789429, "sampling/importance_sampling_ratio/min": 0.002207136480137706, "sampling/sampling_logp_difference/max": 6.116059303283691, "sampling/sampling_logp_difference/mean": 0.1610202193260193, "step": 1673 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4056698679924011, "epoch": 4.405263157894737, "grad_norm": 0.018183821812272072, "learning_rate": 1e-06, "loss": 0.007, "step": 1674 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40626025199890137, "epoch": 4.407894736842105, "grad_norm": 0.013775753788650036, "learning_rate": 1e-06, "loss": -0.0024, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40729711949825287, "epoch": 4.410526315789474, "grad_norm": 0.014505565166473389, "learning_rate": 1e-06, "loss": 0.0007, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15137.0, "completions/mean_length": 5970.013671875, "completions/mean_terminated_length": 5949.6337890625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "entropy": 0.4138667732477188, "epoch": 4.413157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002891423413529992, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 906897109.0, "reward": 0.8967773914337158, "reward_std": 0.01289062574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0988906621932983, "sampling/importance_sampling_ratio/min": 0.0019935385789722204, "sampling/sampling_logp_difference/max": 6.217844009399414, "sampling/sampling_logp_difference/mean": 0.16292931139469147, "step": 1677 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40900835394859314, "epoch": 4.41578947368421, "grad_norm": 0.019590480253100395, "learning_rate": 1e-06, "loss": 0.0232, "step": 1678 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41462790966033936, "epoch": 4.418421052631579, "grad_norm": 0.0016400973545387387, "learning_rate": 1e-06, "loss": 0.0225, "step": 1679 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4147539883852005, "epoch": 4.421052631578947, "grad_norm": 0.003316552611067891, "learning_rate": 1e-06, "loss": -0.007, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16318.0, "completions/mean_length": 7046.146484375, "completions/mean_terminated_length": 6879.0673828125, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "entropy": 0.39725610613822937, "epoch": 4.423684210526316, "frac_reward_zero_std": 0.875, "grad_norm": 0.032636187970638275, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 910929056.0, "reward": 0.8839844465255737, "reward_std": 0.03659204766154289, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9869791269302368, "rewards/symbolic_reward_partial_score/std": 0.10907779633998871, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.095511555671692, "sampling/importance_sampling_ratio/min": 0.002931704046204686, "sampling/sampling_logp_difference/max": 5.832171440124512, "sampling/sampling_logp_difference/mean": 0.1576957106590271, "step": 1681 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3959013670682907, "epoch": 4.426315789473684, "grad_norm": 0.009154154919087887, "learning_rate": 1e-06, "loss": 0.0155, "step": 1682 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39808160066604614, "epoch": 4.428947368421053, "grad_norm": 0.027669908478856087, "learning_rate": 1e-06, "loss": 0.0255, "step": 1683 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.39798080921173096, "epoch": 4.431578947368421, "grad_norm": 0.009289699606597424, "learning_rate": 1e-06, "loss": -0.0018, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15428.0, "completions/mean_length": 6361.517578125, "completions/mean_terminated_length": 6282.6005859375, "completions/min_length": 1777.0, "completions/min_terminated_length": 1777.0, "entropy": 0.3980341851711273, "epoch": 4.434210526315789, "frac_reward_zero_std": 0.8125, "grad_norm": 0.024433737620711327, "learning_rate": 1e-06, "loss": 0.072, "num_tokens": 914605161.0, "reward": 0.8907226920127869, "reward_std": 0.037109375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9918619394302368, "rewards/symbolic_reward_partial_score/std": 0.08840696513652802, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.096335768699646, "sampling/importance_sampling_ratio/min": 0.003198143094778061, "sampling/sampling_logp_difference/max": 5.745184898376465, "sampling/sampling_logp_difference/mean": 0.15931525826454163, "step": 1685 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40825584530830383, "epoch": 4.436842105263158, "grad_norm": 0.006376485340297222, "learning_rate": 1e-06, "loss": -0.0123, "step": 1686 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40282125771045685, "epoch": 4.439473684210526, "grad_norm": 0.005751782562583685, "learning_rate": 1e-06, "loss": 0.0096, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40184593200683594, "epoch": 4.442105263157894, "grad_norm": 0.006629121955484152, "learning_rate": 1e-06, "loss": 0.0068, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14156.0, "completions/mean_length": 5919.556640625, "completions/mean_terminated_length": 5816.35693359375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "entropy": 0.4139447808265686, "epoch": 4.4447368421052635, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03216357156634331, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 918028934.0, "reward": 0.8896484971046448, "reward_std": 0.02695416286587715, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.0808921679854393, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0991489887237549, "sampling/importance_sampling_ratio/min": 0.00282970629632473, "sampling/sampling_logp_difference/max": 5.867582321166992, "sampling/sampling_logp_difference/mean": 0.1632101684808731, "step": 1689 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4094916731119156, "epoch": 4.447368421052632, "grad_norm": 0.004263041075319052, "learning_rate": 1e-06, "loss": 0.0319, "step": 1690 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41255608201026917, "epoch": 4.45, "grad_norm": 0.006899460684508085, "learning_rate": 1e-06, "loss": -0.0196, "step": 1691 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4127243608236313, "epoch": 4.4526315789473685, "grad_norm": 0.007229155860841274, "learning_rate": 1e-06, "loss": -0.0112, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15864.0, "completions/mean_length": 6220.880859375, "completions/mean_terminated_length": 6059.56201171875, "completions/min_length": 1386.0, "completions/min_terminated_length": 1386.0, "entropy": 0.40877464413642883, "epoch": 4.455263157894737, "frac_reward_zero_std": 0.8125, "grad_norm": 0.027125459164381027, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 921603817.0, "reward": 0.8788086175918579, "reward_std": 0.04460802674293518, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9853515625, "rewards/symbolic_reward_partial_score/std": 0.11048159003257751, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0970017910003662, "sampling/importance_sampling_ratio/min": 0.002184893237426877, "sampling/sampling_logp_difference/max": 6.126188278198242, "sampling/sampling_logp_difference/mean": 0.16050924360752106, "step": 1693 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4050184488296509, "epoch": 4.457894736842105, "grad_norm": 0.010832409374415874, "learning_rate": 1e-06, "loss": -0.0152, "step": 1694 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4044746458530426, "epoch": 4.4605263157894735, "grad_norm": 0.012561858631670475, "learning_rate": 1e-06, "loss": 0.0342, "step": 1695 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.40345558524131775, "epoch": 4.463157894736842, "grad_norm": 0.012913716956973076, "learning_rate": 1e-06, "loss": 0.0585, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16375.0, "completions/mean_length": 6532.689453125, "completions/mean_terminated_length": 6296.25830078125, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "entropy": 0.39597228169441223, "epoch": 4.465789473684211, "frac_reward_zero_std": 0.75, "grad_norm": 0.014212416484951973, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 925361546.0, "reward": 0.870800793170929, "reward_std": 0.06000380590558052, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9794921875, "rewards/symbolic_reward_partial_score/std": 0.13296754658222198, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0944582223892212, "sampling/importance_sampling_ratio/min": 0.000805265037342906, "sampling/sampling_logp_difference/max": 7.1243391036987305, "sampling/sampling_logp_difference/mean": 0.1563316285610199, "step": 1697 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.389819473028183, "epoch": 4.468421052631579, "grad_norm": 0.02167220413684845, "learning_rate": 1e-06, "loss": 0.0721, "step": 1698 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3912952095270157, "epoch": 4.471052631578948, "grad_norm": 0.029220927506685257, "learning_rate": 1e-06, "loss": 0.0269, "step": 1699 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.39638207852840424, "epoch": 4.473684210526316, "grad_norm": 0.02709389291703701, "learning_rate": 1e-06, "loss": -0.0039, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16259.0, "completions/mean_length": 6240.470703125, "completions/mean_terminated_length": 6160.6005859375, "completions/min_length": 1452.0, "completions/min_terminated_length": 1452.0, "entropy": 0.4060589671134949, "epoch": 4.476315789473684, "frac_reward_zero_std": 0.9375, "grad_norm": 0.009188372641801834, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 928957467.0, "reward": 0.888134777545929, "reward_std": 0.019519919529557228, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9903971552848816, "rewards/symbolic_reward_partial_score/std": 0.09106360375881195, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0976431369781494, "sampling/importance_sampling_ratio/min": 0.0019957490731030703, "sampling/sampling_logp_difference/max": 6.21673583984375, "sampling/sampling_logp_difference/mean": 0.16073334217071533, "step": 1701 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4065135717391968, "epoch": 4.478947368421053, "grad_norm": 0.011536335572600365, "learning_rate": 1e-06, "loss": -0.0056, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40237389504909515, "epoch": 4.481578947368421, "grad_norm": 0.006363328546285629, "learning_rate": 1e-06, "loss": 0.0077, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4049024283885956, "epoch": 4.484210526315789, "grad_norm": 0.013384182006120682, "learning_rate": 1e-06, "loss": 0.0206, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14719.0, "completions/mean_length": 6288.73046875, "completions/mean_terminated_length": 6229.22998046875, "completions/min_length": 1690.0, "completions/min_terminated_length": 1690.0, "entropy": 0.40029455721378326, "epoch": 4.4868421052631575, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03570149093866348, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 932590449.0, "reward": 0.8893555402755737, "reward_std": 0.024492472410202026, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9938150644302368, "rewards/symbolic_reward_partial_score/std": 0.06769225746393204, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0966813564300537, "sampling/importance_sampling_ratio/min": 0.0012064871843904257, "sampling/sampling_logp_difference/max": 6.7200422286987305, "sampling/sampling_logp_difference/mean": 0.15994581580162048, "step": 1705 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.39987750351428986, "epoch": 4.489473684210527, "grad_norm": 0.013917638920247555, "learning_rate": 1e-06, "loss": 0.0092, "step": 1706 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4031446576118469, "epoch": 4.492105263157895, "grad_norm": 0.008411859162151814, "learning_rate": 1e-06, "loss": -0.0248, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4044816493988037, "epoch": 4.494736842105263, "grad_norm": 0.00608462281525135, "learning_rate": 1e-06, "loss": 0.0196, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16222.0, "completions/mean_length": 6287.87890625, "completions/mean_terminated_length": 6188.3115234375, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "entropy": 0.41594061255455017, "epoch": 4.497368421052632, "frac_reward_zero_std": 0.875, "grad_norm": 0.01294434629380703, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 936196627.0, "reward": 0.8863769769668579, "reward_std": 0.029826264828443527, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9923502206802368, "rewards/symbolic_reward_partial_score/std": 0.07192692160606384, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0986714363098145, "sampling/importance_sampling_ratio/min": 0.002038198057562113, "sampling/sampling_logp_difference/max": 6.1956892013549805, "sampling/sampling_logp_difference/mean": 0.16297107934951782, "step": 1709 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4143824130296707, "epoch": 4.5, "grad_norm": 0.009162618778645992, "learning_rate": 1e-06, "loss": -0.0101, "step": 1710 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4101819694042206, "epoch": 4.502631578947368, "grad_norm": 0.02733253687620163, "learning_rate": 1e-06, "loss": 0.0319, "step": 1711 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41094449162483215, "epoch": 4.505263157894737, "grad_norm": 0.03555028513073921, "learning_rate": 1e-06, "loss": 0.0131, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16286.0, "completions/mean_length": 6608.314453125, "completions/mean_terminated_length": 6589.18408203125, "completions/min_length": 1933.0, "completions/min_terminated_length": 1933.0, "entropy": 0.40871562063694, "epoch": 4.507894736842105, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004176863934844732, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 939982868.0, "reward": 0.895263671875, "reward_std": 0.01894531399011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99658203125, "rewards/symbolic_reward_partial_score/std": 0.050564687699079514, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0982666015625, "sampling/importance_sampling_ratio/min": 0.0019903965294361115, "sampling/sampling_logp_difference/max": 6.21942138671875, "sampling/sampling_logp_difference/mean": 0.16244372725486755, "step": 1713 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4116509258747101, "epoch": 4.510526315789473, "grad_norm": 0.0035293849650770426, "learning_rate": 1e-06, "loss": -0.0121, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4102381467819214, "epoch": 4.5131578947368425, "grad_norm": 0.004798616282641888, "learning_rate": 1e-06, "loss": 0.0301, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4101520925760269, "epoch": 4.515789473684211, "grad_norm": 0.00496835308149457, "learning_rate": 1e-06, "loss": -0.0128, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15657.0, "completions/mean_length": 6260.26953125, "completions/mean_terminated_length": 6240.4580078125, "completions/min_length": 2198.0, "completions/min_terminated_length": 2198.0, "entropy": 0.424741730093956, "epoch": 4.518421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.038017649203538895, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 943579742.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1007094383239746, "sampling/importance_sampling_ratio/min": 0.0006929899100214243, "sampling/sampling_logp_difference/max": 7.2744951248168945, "sampling/sampling_logp_difference/mean": 0.1662769317626953, "step": 1717 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4188734292984009, "epoch": 4.521052631578947, "grad_norm": 0.0028155501931905746, "learning_rate": 1e-06, "loss": -0.0046, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4212244898080826, "epoch": 4.523684210526316, "grad_norm": 0.0027970641385763884, "learning_rate": 1e-06, "loss": -0.0037, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4196837991476059, "epoch": 4.526315789473684, "grad_norm": 0.002883852692320943, "learning_rate": 1e-06, "loss": -0.0049, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14963.0, "completions/max_terminated_length": 14963.0, "completions/mean_length": 6637.16796875, "completions/mean_terminated_length": 6637.16796875, "completions/min_length": 2307.0, "completions/min_terminated_length": 2307.0, "entropy": 0.41772331297397614, "epoch": 4.528947368421052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 947403316.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.101027011871338, "sampling/importance_sampling_ratio/min": 0.002484119264408946, "sampling/sampling_logp_difference/max": 5.997837066650391, "sampling/sampling_logp_difference/mean": 0.166985422372818, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4215196669101715, "epoch": 4.531578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42238539457321167, "epoch": 4.534210526315789, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4196287840604782, "epoch": 4.536842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14916.0, "completions/max_terminated_length": 14916.0, "completions/mean_length": 6023.966796875, "completions/mean_terminated_length": 6023.966796875, "completions/min_length": 1805.0, "completions/min_terminated_length": 1805.0, "entropy": 0.4303021878004074, "epoch": 4.5394736842105265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 950891555.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1032167673110962, "sampling/importance_sampling_ratio/min": 3.6364458537718747e-06, "sampling/sampling_logp_difference/max": 12.524503707885742, "sampling/sampling_logp_difference/mean": 0.1701851785182953, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4268851727247238, "epoch": 4.542105263157895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43249501287937164, "epoch": 4.544736842105263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4292835295200348, "epoch": 4.5473684210526315, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15663.0, "completions/max_terminated_length": 15663.0, "completions/mean_length": 6459.330078125, "completions/mean_terminated_length": 6459.330078125, "completions/min_length": 2409.0, "completions/min_terminated_length": 2409.0, "entropy": 0.42863769829273224, "epoch": 4.55, "frac_reward_zero_std": 0.96875, "grad_norm": 0.029625413939356804, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 954605708.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1031821966171265, "sampling/importance_sampling_ratio/min": 0.0011710627004504204, "sampling/sampling_logp_difference/max": 6.749843597412109, "sampling/sampling_logp_difference/mean": 0.170491024851799, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.427320271730423, "epoch": 4.552631578947368, "grad_norm": 0.002444726647809148, "learning_rate": 1e-06, "loss": -0.0034, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43284234404563904, "epoch": 4.5552631578947365, "grad_norm": 0.0026107802987098694, "learning_rate": 1e-06, "loss": -0.0036, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4296949803829193, "epoch": 4.557894736842105, "grad_norm": 0.0027166479267179966, "learning_rate": 1e-06, "loss": -0.0034, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15035.0, "completions/max_terminated_length": 15035.0, "completions/mean_length": 6189.65625, "completions/mean_terminated_length": 6189.65625, "completions/min_length": 1689.0, "completions/min_terminated_length": 1689.0, "entropy": 0.4322141706943512, "epoch": 4.560526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03257491812109947, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 958171100.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1039059162139893, "sampling/importance_sampling_ratio/min": 0.0016840739408507943, "sampling/sampling_logp_difference/max": 6.386539459228516, "sampling/sampling_logp_difference/mean": 0.17148764431476593, "step": 1733 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43205249309539795, "epoch": 4.563157894736842, "grad_norm": 0.002109309658408165, "learning_rate": 1e-06, "loss": -0.0054, "step": 1734 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4358983635902405, "epoch": 4.565789473684211, "grad_norm": 0.0018140700412914157, "learning_rate": 1e-06, "loss": -0.0037, "step": 1735 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43200112879276276, "epoch": 4.568421052631579, "grad_norm": 0.0017628814093768597, "learning_rate": 1e-06, "loss": -0.0042, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14472.0, "completions/max_terminated_length": 14472.0, "completions/mean_length": 6301.013671875, "completions/mean_terminated_length": 6301.013671875, "completions/min_length": 2132.0, "completions/min_terminated_length": 2132.0, "entropy": 0.4375248998403549, "epoch": 4.571052631578947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004156010691076517, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 961799683.0, "reward": 0.896484375, "reward_std": 0.01406250149011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.104670763015747, "sampling/importance_sampling_ratio/min": 0.0017597615951672196, "sampling/sampling_logp_difference/max": 6.34257698059082, "sampling/sampling_logp_difference/mean": 0.17250433564186096, "step": 1737 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43696147203445435, "epoch": 4.573684210526316, "grad_norm": 0.003323608310893178, "learning_rate": 1e-06, "loss": -0.0102, "step": 1738 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43567726016044617, "epoch": 4.576315789473684, "grad_norm": 0.0033042333088815212, "learning_rate": 1e-06, "loss": 0.0047, "step": 1739 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4338361322879791, "epoch": 4.578947368421053, "grad_norm": 0.002856391714885831, "learning_rate": 1e-06, "loss": -0.0103, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14688.0, "completions/max_terminated_length": 14688.0, "completions/mean_length": 5929.265625, "completions/mean_terminated_length": 5929.265625, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "entropy": 0.43696045875549316, "epoch": 4.581578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003539962926879525, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 965245771.0, "reward": 0.8968750238418579, "reward_std": 0.01250000111758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, "rewards/symbolic_reward_partial_score/std": 0.046557389199733734, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.104569911956787, "sampling/importance_sampling_ratio/min": 0.0022125020623207092, "sampling/sampling_logp_difference/max": 6.113631248474121, "sampling/sampling_logp_difference/mean": 0.17194896936416626, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43360671401023865, "epoch": 4.58421052631579, "grad_norm": 0.03177979961037636, "learning_rate": 1e-06, "loss": 0.0145, "step": 1742 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43533071875572205, "epoch": 4.586842105263158, "grad_norm": 0.0031480384059250355, "learning_rate": 1e-06, "loss": -0.0065, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4325557351112366, "epoch": 4.589473684210526, "grad_norm": 0.004079607781022787, "learning_rate": 1e-06, "loss": 0.0219, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15738.0, "completions/mean_length": 6571.03515625, "completions/mean_terminated_length": 6532.55322265625, "completions/min_length": 2417.0, "completions/min_terminated_length": 2417.0, "entropy": 0.43312308192253113, "epoch": 4.592105263157895, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004563876893371344, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 969049821.0, "reward": 0.8970703482627869, "reward_std": 0.01171875186264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9993489384651184, "rewards/symbolic_reward_partial_score/std": 0.01040646992623806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1048052310943604, "sampling/importance_sampling_ratio/min": 0.002029762137681246, "sampling/sampling_logp_difference/max": 6.199836730957031, "sampling/sampling_logp_difference/mean": 0.17204180359840393, "step": 1745 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4366276413202286, "epoch": 4.594736842105263, "grad_norm": 0.005072879139333963, "learning_rate": 1e-06, "loss": -0.0128, "step": 1746 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.43628183007240295, "epoch": 4.597368421052631, "grad_norm": 0.00417398102581501, "learning_rate": 1e-06, "loss": 0.0362, "step": 1747 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4349798709154129, "epoch": 4.6, "grad_norm": 0.041563209146261215, "learning_rate": 1e-06, "loss": 0.0197, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15068.0, "completions/max_terminated_length": 15068.0, "completions/mean_length": 6294.46484375, "completions/mean_terminated_length": 6294.46484375, "completions/min_length": 1729.0, "completions/min_terminated_length": 1729.0, "entropy": 0.4351597726345062, "epoch": 4.602631578947369, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001996587496250868, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 972707595.0, "reward": 0.8986328840255737, "reward_std": 0.00546875037252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.014731390401721, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1045300960540771, "sampling/importance_sampling_ratio/min": 0.0028725771699100733, "sampling/sampling_logp_difference/max": 5.852545738220215, "sampling/sampling_logp_difference/mean": 0.17221154272556305, "step": 1749 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4338027983903885, "epoch": 4.605263157894737, "grad_norm": 0.026945630088448524, "learning_rate": 1e-06, "loss": 0.013, "step": 1750 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4344194829463959, "epoch": 4.6078947368421055, "grad_norm": 0.001975148683413863, "learning_rate": 1e-06, "loss": -0.002, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4337017834186554, "epoch": 4.610526315789474, "grad_norm": 0.0024492305237799883, "learning_rate": 1e-06, "loss": -0.0027, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13329.0, "completions/max_terminated_length": 13329.0, "completions/mean_length": 5964.625, "completions/mean_terminated_length": 5964.625, "completions/min_length": 1780.0, "completions/min_terminated_length": 1780.0, "entropy": 0.43178844451904297, "epoch": 4.613157894736842, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024755916092544794, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 976166475.0, "reward": 0.8985351920127869, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9990234375, "rewards/symbolic_reward_partial_score/std": 0.022097086533904076, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1038627624511719, "sampling/importance_sampling_ratio/min": 0.0017053185729309916, "sampling/sampling_logp_difference/max": 6.3740034103393555, "sampling/sampling_logp_difference/mean": 0.17091308534145355, "step": 1753 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.432775616645813, "epoch": 4.61578947368421, "grad_norm": 0.002126955660060048, "learning_rate": 1e-06, "loss": -0.0048, "step": 1754 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4326540231704712, "epoch": 4.618421052631579, "grad_norm": 0.0018514416879042983, "learning_rate": 1e-06, "loss": -0.0046, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4313662648200989, "epoch": 4.621052631578947, "grad_norm": 0.0026604789309203625, "learning_rate": 1e-06, "loss": -0.0039, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13110.0, "completions/max_terminated_length": 13110.0, "completions/mean_length": 5739.724609375, "completions/mean_terminated_length": 5739.724609375, "completions/min_length": 1311.0, "completions/min_terminated_length": 1311.0, "entropy": 0.43435807526111603, "epoch": 4.623684210526315, "frac_reward_zero_std": 0.875, "grad_norm": 0.03117295168340206, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 979496830.0, "reward": 0.8929687738418579, "reward_std": 0.02812500298023224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1037782430648804, "sampling/importance_sampling_ratio/min": 0.0016620134701952338, "sampling/sampling_logp_difference/max": 6.399725437164307, "sampling/sampling_logp_difference/mean": 0.17162902653217316, "step": 1757 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.43242721259593964, "epoch": 4.626315789473685, "grad_norm": 0.004598487634211779, "learning_rate": 1e-06, "loss": 0.0275, "step": 1758 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.43589073419570923, "epoch": 4.628947368421053, "grad_norm": 0.004339495208114386, "learning_rate": 1e-06, "loss": -0.0145, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43442071974277496, "epoch": 4.631578947368421, "grad_norm": 0.005245934706181288, "learning_rate": 1e-06, "loss": 0.0105, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14166.0, "completions/max_terminated_length": 14166.0, "completions/mean_length": 6004.47265625, "completions/mean_terminated_length": 6004.47265625, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "entropy": 0.43318332731723785, "epoch": 4.63421052631579, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0035136344376951456, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 982970096.0, "reward": 0.8972656726837158, "reward_std": 0.01093750074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9986978769302368, "rewards/symbolic_reward_partial_score/std": 0.023278694599866867, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.102708101272583, "sampling/importance_sampling_ratio/min": 0.0023684019688516855, "sampling/sampling_logp_difference/max": 6.045539855957031, "sampling/sampling_logp_difference/mean": 0.16960932314395905, "step": 1761 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.42665621638298035, "epoch": 4.636842105263158, "grad_norm": 0.002514537191018462, "learning_rate": 1e-06, "loss": -0.0101, "step": 1762 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4275357723236084, "epoch": 4.639473684210526, "grad_norm": 0.0034015884157270193, "learning_rate": 1e-06, "loss": -0.0096, "step": 1763 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42807987332344055, "epoch": 4.6421052631578945, "grad_norm": 0.02369544468820095, "learning_rate": 1e-06, "loss": 0.0048, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15944.0, "completions/max_terminated_length": 15944.0, "completions/mean_length": 5743.640625, "completions/mean_terminated_length": 5743.640625, "completions/min_length": 1270.0, "completions/min_terminated_length": 1270.0, "entropy": 0.41847366094589233, "epoch": 4.644736842105263, "frac_reward_zero_std": 0.90625, "grad_norm": 0.006833137013018131, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 986306968.0, "reward": 0.8921875357627869, "reward_std": 0.023446178063750267, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9934895634651184, "rewards/symbolic_reward_partial_score/std": 0.0770525261759758, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.100651502609253, "sampling/importance_sampling_ratio/min": 0.001315418747253716, "sampling/sampling_logp_difference/max": 6.633600234985352, "sampling/sampling_logp_difference/mean": 0.16649366915225983, "step": 1765 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.422316312789917, "epoch": 4.647368421052631, "grad_norm": 0.03135828673839569, "learning_rate": 1e-06, "loss": 0.0087, "step": 1766 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.42030397057533264, "epoch": 4.65, "grad_norm": 0.012496300972998142, "learning_rate": 1e-06, "loss": -0.0082, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4223172664642334, "epoch": 4.652631578947369, "grad_norm": 0.005074888467788696, "learning_rate": 1e-06, "loss": 0.0133, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14611.0, "completions/mean_length": 6149.064453125, "completions/mean_terminated_length": 6129.03515625, "completions/min_length": 1197.0, "completions/min_terminated_length": 1197.0, "entropy": 0.4221375733613968, "epoch": 4.655263157894737, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004442637786269188, "learning_rate": 1e-06, "loss": -0.0124, "num_tokens": 989868281.0, "reward": 0.895458996295929, "reward_std": 0.01816406287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9972330331802368, "rewards/symbolic_reward_partial_score/std": 0.04610797390341759, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1002521514892578, "sampling/importance_sampling_ratio/min": 0.002037106081843376, "sampling/sampling_logp_difference/max": 6.196225166320801, "sampling/sampling_logp_difference/mean": 0.16634932160377502, "step": 1769 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.4193985015153885, "epoch": 4.657894736842105, "grad_norm": 0.003967350348830223, "learning_rate": 1e-06, "loss": -0.012, "step": 1770 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4207218885421753, "epoch": 4.660526315789474, "grad_norm": 0.0035787681117653847, "learning_rate": 1e-06, "loss": -0.0112, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41778165102005005, "epoch": 4.663157894736842, "grad_norm": 0.003998721018433571, "learning_rate": 1e-06, "loss": 0.0519, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15599.0, "completions/max_terminated_length": 15599.0, "completions/mean_length": 6235.953125, "completions/mean_terminated_length": 6235.953125, "completions/min_length": 1424.0, "completions/min_terminated_length": 1424.0, "entropy": 0.41705767810344696, "epoch": 4.66578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 993489761.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1004341840744019, "sampling/importance_sampling_ratio/min": 0.00038307326030917466, "sampling/sampling_logp_difference/max": 7.867284297943115, "sampling/sampling_logp_difference/mean": 0.16648200154304504, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4232047498226166, "epoch": 4.668421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42157651484012604, "epoch": 4.671052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41958342492580414, "epoch": 4.673684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13875.0, "completions/mean_length": 5888.615234375, "completions/mean_terminated_length": 5847.45703125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "entropy": 0.4201970547437668, "epoch": 4.676315789473684, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003767336718738079, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 996902364.0, "reward": 0.8960937857627869, "reward_std": 0.015625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1007671356201172, "sampling/importance_sampling_ratio/min": 0.0020156993996351957, "sampling/sampling_logp_difference/max": 6.206789016723633, "sampling/sampling_logp_difference/mean": 0.16714762151241302, "step": 1777 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4254469573497772, "epoch": 4.678947368421053, "grad_norm": 0.0032893153838813305, "learning_rate": 1e-06, "loss": 0.024, "step": 1778 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4251594543457031, "epoch": 4.681578947368421, "grad_norm": 0.003810209920629859, "learning_rate": 1e-06, "loss": 0.0242, "step": 1779 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4216005504131317, "epoch": 4.684210526315789, "grad_norm": 0.0029099395032972097, "learning_rate": 1e-06, "loss": -0.0079, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13018.0, "completions/mean_length": 5639.482421875, "completions/mean_terminated_length": 5618.4560546875, "completions/min_length": 1634.0, "completions/min_terminated_length": 1634.0, "entropy": 0.43445518612861633, "epoch": 4.686842105263158, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0035394555889070034, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 1000168883.0, "reward": 0.8962891101837158, "reward_std": 0.01484375074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1032869815826416, "sampling/importance_sampling_ratio/min": 0.0014284333446994424, "sampling/sampling_logp_difference/max": 6.551177024841309, "sampling/sampling_logp_difference/mean": 0.1708400547504425, "step": 1781 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4345066547393799, "epoch": 4.689473684210526, "grad_norm": 0.0307772234082222, "learning_rate": 1e-06, "loss": 0.0143, "step": 1782 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.43297117948532104, "epoch": 4.692105263157895, "grad_norm": 0.0025953135918825865, "learning_rate": 1e-06, "loss": -0.0062, "step": 1783 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.4290800094604492, "epoch": 4.6947368421052635, "grad_norm": 0.0015723688993602991, "learning_rate": 1e-06, "loss": 0.0245, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14795.0, "completions/mean_length": 6005.3203125, "completions/mean_terminated_length": 5985.009765625, "completions/min_length": 1532.0, "completions/min_terminated_length": 1532.0, "entropy": 0.42191801965236664, "epoch": 4.697368421052632, "frac_reward_zero_std": 0.90625, "grad_norm": 0.00472025852650404, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 1003631895.0, "reward": 0.89501953125, "reward_std": 0.01992187649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.06285149604082108, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0996088981628418, "sampling/importance_sampling_ratio/min": 0.002038520760834217, "sampling/sampling_logp_difference/max": 6.195530891418457, "sampling/sampling_logp_difference/mean": 0.16503699123859406, "step": 1785 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4165394455194473, "epoch": 4.7, "grad_norm": 0.004029050935059786, "learning_rate": 1e-06, "loss": 0.0191, "step": 1786 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4143212139606476, "epoch": 4.7026315789473685, "grad_norm": 0.024517429992556572, "learning_rate": 1e-06, "loss": 0.0308, "step": 1787 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.4173804223537445, "epoch": 4.705263157894737, "grad_norm": 0.003916588611900806, "learning_rate": 1e-06, "loss": -0.0121, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16356.0, "completions/max_terminated_length": 16356.0, "completions/mean_length": 6006.560546875, "completions/mean_terminated_length": 6006.560546875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "entropy": 0.41274675726890564, "epoch": 4.707894736842105, "frac_reward_zero_std": 0.9375, "grad_norm": 0.021270431578159332, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 1007104886.0, "reward": 0.8937500715255737, "reward_std": 0.017241813242435455, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9947916269302368, "rewards/symbolic_reward_partial_score/std": 0.06656058132648468, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0989975929260254, "sampling/importance_sampling_ratio/min": 0.0023042457178235054, "sampling/sampling_logp_difference/max": 6.073001861572266, "sampling/sampling_logp_difference/mean": 0.16398563981056213, "step": 1789 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4127029776573181, "epoch": 4.7105263157894735, "grad_norm": 0.025960620492696762, "learning_rate": 1e-06, "loss": 0.0081, "step": 1790 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4156576246023178, "epoch": 4.713157894736842, "grad_norm": 0.005964155308902264, "learning_rate": 1e-06, "loss": -0.0171, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4167211502790451, "epoch": 4.715789473684211, "grad_norm": 0.003682361450046301, "learning_rate": 1e-06, "loss": 0.0174, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14764.0, "completions/max_terminated_length": 14764.0, "completions/mean_length": 5603.07421875, "completions/mean_terminated_length": 5603.07421875, "completions/min_length": 1730.0, "completions/min_terminated_length": 1730.0, "entropy": 0.42309585213661194, "epoch": 4.718421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021216305904090405, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 1010360124.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1015081405639648, "sampling/importance_sampling_ratio/min": 0.0014691626420244575, "sampling/sampling_logp_difference/max": 6.523062705993652, "sampling/sampling_logp_difference/mean": 0.16811133921146393, "step": 1793 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.42089352011680603, "epoch": 4.721052631578948, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0025, "step": 1794 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.424471378326416, "epoch": 4.723684210526316, "grad_norm": 0.02454499714076519, "learning_rate": 1e-06, "loss": 0.0107, "step": 1795 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.427706241607666, "epoch": 4.726315789473684, "grad_norm": 0.0020945905707776546, "learning_rate": 1e-06, "loss": -0.0024, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13044.0, "completions/max_terminated_length": 13044.0, "completions/mean_length": 5454.7421875, "completions/mean_terminated_length": 5454.7421875, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "entropy": 0.42137929797172546, "epoch": 4.728947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1013556760.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.101433277130127, "sampling/importance_sampling_ratio/min": 0.0028511767741292715, "sampling/sampling_logp_difference/max": 5.860023498535156, "sampling/sampling_logp_difference/mean": 0.16730912029743195, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4213852286338806, "epoch": 4.731578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4226205050945282, "epoch": 4.734210526315789, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4248412251472473, "epoch": 4.7368421052631575, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14786.0, "completions/max_terminated_length": 14786.0, "completions/mean_length": 5348.390625, "completions/mean_terminated_length": 5348.390625, "completions/min_length": 1823.0, "completions/min_terminated_length": 1823.0, "entropy": 0.4181142747402191, "epoch": 4.739473684210527, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003662986448034644, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 1016700608.0, "reward": 0.896484375, "reward_std": 0.01406250149011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1013845205307007, "sampling/importance_sampling_ratio/min": 0.0023322992492467165, "sampling/sampling_logp_difference/max": 6.060900688171387, "sampling/sampling_logp_difference/mean": 0.16688010096549988, "step": 1801 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42347392439842224, "epoch": 4.742105263157895, "grad_norm": 0.026112349703907967, "learning_rate": 1e-06, "loss": 0.0093, "step": 1802 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42271293699741364, "epoch": 4.744736842105263, "grad_norm": 0.003840529127046466, "learning_rate": 1e-06, "loss": -0.0083, "step": 1803 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41917718946933746, "epoch": 4.747368421052632, "grad_norm": 0.0033010016195476055, "learning_rate": 1e-06, "loss": 0.0008, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11331.0, "completions/max_terminated_length": 11331.0, "completions/mean_length": 4988.30078125, "completions/mean_terminated_length": 4988.30078125, "completions/min_length": 1228.0, "completions/min_terminated_length": 1228.0, "entropy": 0.4234064817428589, "epoch": 4.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1019673946.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1018203496932983, "sampling/importance_sampling_ratio/min": 0.0016644904389977455, "sampling/sampling_logp_difference/max": 6.398236274719238, "sampling/sampling_logp_difference/mean": 0.16775867342948914, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42225438356399536, "epoch": 4.752631578947368, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4225133806467056, "epoch": 4.755263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.423874631524086, "epoch": 4.757894736842105, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14649.0, "completions/mean_length": 4774.296875, "completions/mean_terminated_length": 4751.5771484375, "completions/min_length": 1111.0, "completions/min_terminated_length": 1111.0, "entropy": 0.4202171713113785, "epoch": 4.760526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00228870939463377, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 1022508178.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1006393432617188, "sampling/importance_sampling_ratio/min": 0.0017279875464737415, "sampling/sampling_logp_difference/max": 6.360797882080078, "sampling/sampling_logp_difference/mean": 0.16665740311145782, "step": 1809 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42041245102882385, "epoch": 4.7631578947368425, "grad_norm": 0.0016010698163881898, "learning_rate": 1e-06, "loss": -0.0033, "step": 1810 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41893942654132843, "epoch": 4.765789473684211, "grad_norm": 0.002436969429254532, "learning_rate": 1e-06, "loss": 0.0292, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42019888758659363, "epoch": 4.768421052631579, "grad_norm": 0.0023796483874320984, "learning_rate": 1e-06, "loss": -0.0032, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12237.0, "completions/max_terminated_length": 12237.0, "completions/mean_length": 4487.822265625, "completions/mean_terminated_length": 4487.822265625, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "entropy": 0.4267707020044327, "epoch": 4.771052631578947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1025195735.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1021645069122314, "sampling/importance_sampling_ratio/min": 0.004417083691805601, "sampling/sampling_logp_difference/max": 5.422275543212891, "sampling/sampling_logp_difference/mean": 0.16884681582450867, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42350026965141296, "epoch": 4.773684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4263763129711151, "epoch": 4.776315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4271865487098694, "epoch": 4.778947368421052, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16214.0, "completions/mean_length": 4990.013671875, "completions/mean_terminated_length": 4967.71630859375, "completions/min_length": 1176.0, "completions/min_terminated_length": 1176.0, "entropy": 0.42124783992767334, "epoch": 4.781578947368421, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0031755820382386446, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 1028157118.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1013004779815674, "sampling/importance_sampling_ratio/min": 0.0011738245375454426, "sampling/sampling_logp_difference/max": 6.747488021850586, "sampling/sampling_logp_difference/mean": 0.1670088768005371, "step": 1817 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4198300987482071, "epoch": 4.784210526315789, "grad_norm": 0.002093652728945017, "learning_rate": 1e-06, "loss": -0.006, "step": 1818 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41897939145565033, "epoch": 4.786842105263158, "grad_norm": 0.03322795405983925, "learning_rate": 1e-06, "loss": 0.0276, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4199965000152588, "epoch": 4.7894736842105265, "grad_norm": 0.002599254949018359, "learning_rate": 1e-06, "loss": -0.005, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15782.0, "completions/max_terminated_length": 15782.0, "completions/mean_length": 5030.734375, "completions/mean_terminated_length": 5030.734375, "completions/min_length": 1858.0, "completions/min_terminated_length": 1858.0, "entropy": 0.41677914559841156, "epoch": 4.792105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002943596802651882, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 1031137142.0, "reward": 0.8987305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9996744394302368, "rewards/symbolic_reward_partial_score/std": 0.007365696597844362, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1011899709701538, "sampling/importance_sampling_ratio/min": 0.0014168955385684967, "sampling/sampling_logp_difference/max": 6.559287071228027, "sampling/sampling_logp_difference/mean": 0.1671377718448639, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.42033596336841583, "epoch": 4.794736842105263, "grad_norm": 0.0019201026298105717, "learning_rate": 1e-06, "loss": 0.0088, "step": 1822 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.418594628572464, "epoch": 4.7973684210526315, "grad_norm": 0.00219871592707932, "learning_rate": 1e-06, "loss": -0.0057, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4197862297296524, "epoch": 4.8, "grad_norm": 0.0034701847471296787, "learning_rate": 1e-06, "loss": -0.0062, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14201.0, "completions/mean_length": 4741.9140625, "completions/mean_terminated_length": 4696.25927734375, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "entropy": 0.41448909044265747, "epoch": 4.802631578947368, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005095763597637415, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 1033959786.0, "reward": 0.890576183795929, "reward_std": 0.019037172198295593, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.99267578125, "rewards/symbolic_reward_partial_score/std": 0.07863853871822357, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1007022857666016, "sampling/importance_sampling_ratio/min": 0.0010352793615311384, "sampling/sampling_logp_difference/max": 6.87308406829834, "sampling/sampling_logp_difference/mean": 0.16622817516326904, "step": 1825 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.41738876700401306, "epoch": 4.8052631578947365, "grad_norm": 0.027588602155447006, "learning_rate": 1e-06, "loss": 0.008, "step": 1826 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41756127774715424, "epoch": 4.807894736842105, "grad_norm": 0.00932508148252964, "learning_rate": 1e-06, "loss": 0.0175, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4169604033231735, "epoch": 4.810526315789474, "grad_norm": 0.004513036925345659, "learning_rate": 1e-06, "loss": 0.0176, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13231.0, "completions/mean_length": 4946.751953125, "completions/mean_terminated_length": 4924.36962890625, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "entropy": 0.41412411630153656, "epoch": 4.813157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0036981627345085144, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 1036901323.0, "reward": 0.8978515863418579, "reward_std": 0.008593750186264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.099937915802002, "sampling/importance_sampling_ratio/min": 0.0012380169937387109, "sampling/sampling_logp_difference/max": 6.694244384765625, "sampling/sampling_logp_difference/mean": 0.16443777084350586, "step": 1829 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.41186849772930145, "epoch": 4.815789473684211, "grad_norm": 0.002778484718874097, "learning_rate": 1e-06, "loss": 0.0012, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4086800813674927, "epoch": 4.818421052631579, "grad_norm": 0.003878846997395158, "learning_rate": 1e-06, "loss": -0.0089, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.409384623169899, "epoch": 4.821052631578947, "grad_norm": 0.0031435037963092327, "learning_rate": 1e-06, "loss": 0.0242, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15860.0, "completions/max_terminated_length": 15860.0, "completions/mean_length": 5247.94140625, "completions/mean_terminated_length": 5247.94140625, "completions/min_length": 1159.0, "completions/min_terminated_length": 1159.0, "entropy": 0.4083153158426285, "epoch": 4.823684210526316, "frac_reward_zero_std": 0.96875, "grad_norm": 0.018791064620018005, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 1040007757.0, "reward": 0.8949218988418579, "reward_std": 0.009084025397896767, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9986978769302368, "rewards/symbolic_reward_partial_score/std": 0.014688086695969105, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.098818063735962, "sampling/importance_sampling_ratio/min": 0.001938261091709137, "sampling/sampling_logp_difference/max": 6.245964050292969, "sampling/sampling_logp_difference/mean": 0.1635073721408844, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4119107872247696, "epoch": 4.826315789473684, "grad_norm": 0.0073483288288116455, "learning_rate": 1e-06, "loss": -0.0142, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40948253870010376, "epoch": 4.828947368421053, "grad_norm": 0.006414026953279972, "learning_rate": 1e-06, "loss": -0.0129, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40543854236602783, "epoch": 4.831578947368421, "grad_norm": 0.0031001681927591562, "learning_rate": 1e-06, "loss": 0.0152, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15810.0, "completions/mean_length": 5591.357421875, "completions/mean_terminated_length": 5527.74658203125, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "entropy": 0.41380156576633453, "epoch": 4.83421052631579, "frac_reward_zero_std": 0.90625, "grad_norm": 0.009498282335698605, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 1043272356.0, "reward": 0.8850586414337158, "reward_std": 0.026405753567814827, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9873046875, "rewards/symbolic_reward_partial_score/std": 0.10836639255285263, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0996696949005127, "sampling/importance_sampling_ratio/min": 6.373844371410087e-05, "sampling/sampling_logp_difference/max": 9.660722732543945, "sampling/sampling_logp_difference/mean": 0.16511842608451843, "step": 1837 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41262662410736084, "epoch": 4.836842105263158, "grad_norm": 0.0446275994181633, "learning_rate": 1e-06, "loss": 0.0382, "step": 1838 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4088703989982605, "epoch": 4.839473684210526, "grad_norm": 0.014532266184687614, "learning_rate": 1e-06, "loss": -0.0011, "step": 1839 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4135838449001312, "epoch": 4.842105263157895, "grad_norm": 0.017805378884077072, "learning_rate": 1e-06, "loss": 0.0045, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13001.0, "completions/max_terminated_length": 13001.0, "completions/mean_length": 5067.646484375, "completions/mean_terminated_length": 5067.646484375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "entropy": 0.4197660982608795, "epoch": 4.844736842105263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1046239599.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1019878387451172, "sampling/importance_sampling_ratio/min": 0.0013290561037138104, "sampling/sampling_logp_difference/max": 6.623286247253418, "sampling/sampling_logp_difference/mean": 0.16853979229927063, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.420428991317749, "epoch": 4.847368421052631, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4233607202768326, "epoch": 4.85, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4192558228969574, "epoch": 4.852631578947369, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13559.0, "completions/max_terminated_length": 13559.0, "completions/mean_length": 4588.462890625, "completions/mean_terminated_length": 4588.462890625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.4230364263057709, "epoch": 4.855263157894737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1048995548.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1032750606536865, "sampling/importance_sampling_ratio/min": 0.0035867649130523205, "sampling/sampling_logp_difference/max": 5.630504608154297, "sampling/sampling_logp_difference/mean": 0.16935646533966064, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4231020212173462, "epoch": 4.8578947368421055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42159895598888397, "epoch": 4.860526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.422979474067688, "epoch": 4.863157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16011.0, "completions/mean_length": 4764.5703125, "completions/mean_terminated_length": 4719.00439453125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "entropy": 0.4147234559059143, "epoch": 4.86578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004372838884592056, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 1051829952.0, "reward": 0.894726574420929, "reward_std": 0.01622215285897255, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9954427480697632, "rewards/symbolic_reward_partial_score/std": 0.06411336362361908, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1012895107269287, "sampling/importance_sampling_ratio/min": 0.002354616764932871, "sampling/sampling_logp_difference/max": 6.051377296447754, "sampling/sampling_logp_difference/mean": 0.16688959300518036, "step": 1849 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.41656024754047394, "epoch": 4.868421052631579, "grad_norm": 0.00268099014647305, "learning_rate": 1e-06, "loss": 0.0299, "step": 1850 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.41994088888168335, "epoch": 4.871052631578947, "grad_norm": 0.0024931104853749275, "learning_rate": 1e-06, "loss": -0.0069, "step": 1851 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.415420725941658, "epoch": 4.873684210526315, "grad_norm": 0.017212405800819397, "learning_rate": 1e-06, "loss": 0.0114, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12259.0, "completions/max_terminated_length": 12259.0, "completions/mean_length": 5496.720703125, "completions/mean_terminated_length": 5496.720703125, "completions/min_length": 1531.0, "completions/min_terminated_length": 1531.0, "entropy": 0.4071143716573715, "epoch": 4.876315789473685, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001858607167378068, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 1055063281.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1000921726226807, "sampling/importance_sampling_ratio/min": 0.0007913529407233, "sampling/sampling_logp_difference/max": 7.141766548156738, "sampling/sampling_logp_difference/mean": 0.164865180850029, "step": 1853 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4069135785102844, "epoch": 4.878947368421053, "grad_norm": 0.0015915363328531384, "learning_rate": 1e-06, "loss": 0.0181, "step": 1854 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40932293236255646, "epoch": 4.881578947368421, "grad_norm": 0.0015356827061623335, "learning_rate": 1e-06, "loss": -0.0029, "step": 1855 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4124646335840225, "epoch": 4.88421052631579, "grad_norm": 0.0012307771248742938, "learning_rate": 1e-06, "loss": -0.0024, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15018.0, "completions/mean_length": 5135.388671875, "completions/mean_terminated_length": 5091.27685546875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.40472041070461273, "epoch": 4.886842105263158, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0066587249748408794, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 1058115096.0, "reward": 0.8921387195587158, "reward_std": 0.023189274594187737, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.99462890625, "rewards/symbolic_reward_partial_score/std": 0.06520672142505646, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0994518995285034, "sampling/importance_sampling_ratio/min": 0.002227945951744914, "sampling/sampling_logp_difference/max": 6.106675148010254, "sampling/sampling_logp_difference/mean": 0.16302888095378876, "step": 1857 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40252819657325745, "epoch": 4.889473684210526, "grad_norm": 0.005474721547216177, "learning_rate": 1e-06, "loss": 0.0165, "step": 1858 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4054529368877411, "epoch": 4.8921052631578945, "grad_norm": 0.005351050291210413, "learning_rate": 1e-06, "loss": 0.0113, "step": 1859 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4061288982629776, "epoch": 4.894736842105263, "grad_norm": 0.00564099894836545, "learning_rate": 1e-06, "loss": -0.0149, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11346.0, "completions/max_terminated_length": 11346.0, "completions/mean_length": 4943.892578125, "completions/mean_terminated_length": 4943.892578125, "completions/min_length": 1270.0, "completions/min_terminated_length": 1270.0, "entropy": 0.4189876317977905, "epoch": 4.897368421052631, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1061014305.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.10298752784729, "sampling/importance_sampling_ratio/min": 0.0022381297312676907, "sampling/sampling_logp_difference/max": 6.102114677429199, "sampling/sampling_logp_difference/mean": 0.16874951124191284, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4216906875371933, "epoch": 4.9, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42171426117420197, "epoch": 4.902631578947369, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4235544800758362, "epoch": 4.905263157894737, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12292.0, "completions/max_terminated_length": 12292.0, "completions/mean_length": 5125.125, "completions/mean_terminated_length": 5125.125, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "entropy": 0.4159326106309891, "epoch": 4.907894736842105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1064039169.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1020094156265259, "sampling/importance_sampling_ratio/min": 0.004233494866639376, "sampling/sampling_logp_difference/max": 5.464727401733398, "sampling/sampling_logp_difference/mean": 0.16753841936588287, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.417013019323349, "epoch": 4.910526315789474, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41526539623737335, "epoch": 4.913157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4190913736820221, "epoch": 4.91578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13104.0, "completions/mean_length": 5798.11328125, "completions/mean_terminated_length": 5756.60009765625, "completions/min_length": 1531.0, "completions/min_terminated_length": 1531.0, "entropy": 0.4080495685338974, "epoch": 4.918421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002552908379584551, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 1067440987.0, "reward": 0.8980469107627869, "reward_std": 0.0078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1004188060760498, "sampling/importance_sampling_ratio/min": 0.001327339792624116, "sampling/sampling_logp_difference/max": 6.624578475952148, "sampling/sampling_logp_difference/mean": 0.1642836332321167, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4084668755531311, "epoch": 4.921052631578947, "grad_norm": 0.0029575773514807224, "learning_rate": 1e-06, "loss": 0.028, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40933088958263397, "epoch": 4.923684210526316, "grad_norm": 0.0028016483411192894, "learning_rate": 1e-06, "loss": -0.0056, "step": 1871 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.40506620705127716, "epoch": 4.926315789473684, "grad_norm": 0.0019874332938343287, "learning_rate": 1e-06, "loss": -0.0044, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11739.0, "completions/max_terminated_length": 11739.0, "completions/mean_length": 5096.03515625, "completions/mean_terminated_length": 5096.03515625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "entropy": 0.41956058144569397, "epoch": 4.928947368421053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1070428941.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1031776666641235, "sampling/importance_sampling_ratio/min": 0.0033364673145115376, "sampling/sampling_logp_difference/max": 5.702842712402344, "sampling/sampling_logp_difference/mean": 0.16856572031974792, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42255792021751404, "epoch": 4.931578947368421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41967572271823883, "epoch": 4.934210526315789, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4202282428741455, "epoch": 4.936842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13916.0, "completions/max_terminated_length": 13916.0, "completions/mean_length": 5596.33984375, "completions/mean_terminated_length": 5596.33984375, "completions/min_length": 1901.0, "completions/min_terminated_length": 1901.0, "entropy": 0.41141530871391296, "epoch": 4.939473684210526, "frac_reward_zero_std": 0.96875, "grad_norm": 0.028281377628445625, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 1073712091.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1021140813827515, "sampling/importance_sampling_ratio/min": 6.899942377458501e-07, "sampling/sampling_logp_difference/max": 14.186582565307617, "sampling/sampling_logp_difference/mean": 0.16654615104198456, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41413505375385284, "epoch": 4.942105263157895, "grad_norm": 0.002298869891092181, "learning_rate": 1e-06, "loss": -0.0032, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41438594460487366, "epoch": 4.9447368421052635, "grad_norm": 0.0022998114582151175, "learning_rate": 1e-06, "loss": -0.0036, "step": 1879 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.41090287268161774, "epoch": 4.947368421052632, "grad_norm": 0.0020206754561513662, "learning_rate": 1e-06, "loss": -0.0041, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13720.0, "completions/max_terminated_length": 13720.0, "completions/mean_length": 5165.421875, "completions/mean_terminated_length": 5165.421875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.41653770208358765, "epoch": 4.95, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0026710904203355312, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 1076769619.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.101952314376831, "sampling/importance_sampling_ratio/min": 0.0027743689715862274, "sampling/sampling_logp_difference/max": 5.887331962585449, "sampling/sampling_logp_difference/mean": 0.16684381663799286, "step": 1881 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.4142802208662033, "epoch": 4.9526315789473685, "grad_norm": 0.0019588549621403217, "learning_rate": 1e-06, "loss": -0.0051, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4092801809310913, "epoch": 4.955263157894737, "grad_norm": 0.0023243881296366453, "learning_rate": 1e-06, "loss": -0.0035, "step": 1883 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4138130843639374, "epoch": 4.957894736842105, "grad_norm": 0.0016379705630242825, "learning_rate": 1e-06, "loss": 0.0191, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13990.0, "completions/max_terminated_length": 13990.0, "completions/mean_length": 5468.6640625, "completions/mean_terminated_length": 5468.6640625, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.41013991832733154, "epoch": 4.9605263157894735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1079975207.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1011567115783691, "sampling/importance_sampling_ratio/min": 0.0015834070509299636, "sampling/sampling_logp_difference/max": 6.448176383972168, "sampling/sampling_logp_difference/mean": 0.16624604165554047, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4088575690984726, "epoch": 4.963157894736842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4118724763393402, "epoch": 4.965789473684211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4086661636829376, "epoch": 4.968421052631579, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15787.0, "completions/max_terminated_length": 15787.0, "completions/mean_length": 6343.203125, "completions/mean_terminated_length": 6343.203125, "completions/min_length": 1531.0, "completions/min_terminated_length": 1531.0, "entropy": 0.39697757363319397, "epoch": 4.971052631578948, "frac_reward_zero_std": 0.9375, "grad_norm": 0.020623186603188515, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 1083657775.0, "reward": 0.89013671875, "reward_std": 0.018373416736721992, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9944661259651184, "rewards/symbolic_reward_partial_score/std": 0.06445936113595963, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0986179113388062, "sampling/importance_sampling_ratio/min": 0.0009520654566586018, "sampling/sampling_logp_difference/max": 6.956876754760742, "sampling/sampling_logp_difference/mean": 0.1618214100599289, "step": 1889 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.399369016289711, "epoch": 4.973684210526316, "grad_norm": 0.011686233803629875, "learning_rate": 1e-06, "loss": -0.0128, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.39961469173431396, "epoch": 4.976315789473684, "grad_norm": 0.008714032359421253, "learning_rate": 1e-06, "loss": 0.0032, "step": 1891 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3952711671590805, "epoch": 4.978947368421053, "grad_norm": 0.011127989739179611, "learning_rate": 1e-06, "loss": -0.0085, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15865.0, "completions/max_terminated_length": 15865.0, "completions/mean_length": 5735.08984375, "completions/mean_terminated_length": 5735.08984375, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.40476512908935547, "epoch": 4.981578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03821815550327301, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 1086983613.0, "reward": 0.8939453363418579, "reward_std": 0.020503725856542587, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, "rewards/symbolic_reward_partial_score/std": 0.06325981765985489, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.100600004196167, "sampling/importance_sampling_ratio/min": 0.002097643446177244, "sampling/sampling_logp_difference/max": 6.166940689086914, "sampling/sampling_logp_difference/mean": 0.16543450951576233, "step": 1893 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4117436110973358, "epoch": 4.984210526315789, "grad_norm": 0.00405893474817276, "learning_rate": 1e-06, "loss": 0.0027, "step": 1894 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.40815281867980957, "epoch": 4.9868421052631575, "grad_norm": 0.005387400276958942, "learning_rate": 1e-06, "loss": -0.017, "step": 1895 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.40890026092529297, "epoch": 4.989473684210527, "grad_norm": 0.005877798423171043, "learning_rate": 1e-06, "loss": 0.0031, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16324.0, "completions/mean_length": 5756.1015625, "completions/mean_terminated_length": 5693.4619140625, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.4195985049009323, "epoch": 4.992105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.017484664916992188, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 1090315697.0, "reward": 0.8935546875, "reward_std": 0.011645604856312275, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.051725368946790695, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.1030421257019043, "sampling/importance_sampling_ratio/min": 0.0006267816643230617, "sampling/sampling_logp_difference/max": 7.374912261962891, "sampling/sampling_logp_difference/mean": 0.16903811693191528, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4194973558187485, "epoch": 4.994736842105263, "grad_norm": 0.0057156882248818874, "learning_rate": 1e-06, "loss": -0.0115, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41878555715084076, "epoch": 4.997368421052632, "grad_norm": 0.0061633591540157795, "learning_rate": 1e-06, "loss": 0.0063, "step": 1899 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.42264990508556366, "epoch": 5.0, "grad_norm": 0.005547456443309784, "learning_rate": 1e-06, "loss": 0.0059, "step": 1900 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 11290.28125, "eval_completions/max_terminated_length": 11290.28125, "eval_completions/mean_length": 4926.05712890625, "eval_completions/mean_terminated_length": 4926.05712890625, "eval_completions/min_length": 1160.375, "eval_completions/min_terminated_length": 1160.375, "eval_entropy": 0.4272125642746687, "eval_frac_reward_zero_std": 0.99609375, "eval_loss": -6.615129677811638e-05, "eval_num_tokens": 1090315697.0, "eval_reward": 0.8997803088277578, "eval_reward_std": 0.0008789063431322575, "eval_rewards/progression_diversity/mean": 0.0, "eval_rewards/progression_diversity/std": 0.0, "eval_rewards/symbolic_reward_accuracy/mean": 0.999755859375, "eval_rewards/symbolic_reward_accuracy/std": 0.0027621358167380095, "eval_rewards/symbolic_reward_partial_score/mean": 0.999755859375, "eval_rewards/symbolic_reward_partial_score/std": 0.0027621358167380095, "eval_rewards/tag_count_reward/mean": 0.0, "eval_rewards/tag_count_reward/std": 0.0, "eval_runtime": 6121.2017, "eval_samples_per_second": 0.041, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.1036077477037907, "eval_sampling/importance_sampling_ratio/min": 0.003590909782909435, "eval_sampling/sampling_logp_difference/max": 5.984421730041504, "eval_sampling/sampling_logp_difference/mean": 0.17040536273270845, "eval_steps_per_second": 0.0, "step": 1900 }, { "epoch": 5.0, "step": 1900, "total_flos": 0.0, "train_loss": 0.0027289845211903173, "train_runtime": 187388.7148, "train_samples_per_second": 0.081, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 1900, "num_input_tokens_seen": 1090315697, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }