diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 2.0, "eval_steps": 500, - "global_step": 624, + "global_step": 1248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -11904,19 +11904,11913 @@ "step": 624 }, { - "epoch": 1.0, - "step": 624, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7572.095703125, + "completions/mean_terminated_length": 7554.85107421875, + "completions/min_length": 3083.0, + "completions/min_terminated_length": 3083.0, + "entropy": 0.3038008362054825, + "epoch": 1.001602564102564, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.01717405952513218, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 833551563.0, + "reward": 0.891308605670929, + "reward_std": 0.03100288100540638, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.9951171875, + "rewards/symbolic_reward_partial_score/std": 0.06366384774446487, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701267719268799, + "sampling/importance_sampling_ratio/min": 4.460942818695912e-06, + "sampling/sampling_logp_difference/max": 12.320150375366211, + "sampling/sampling_logp_difference/mean": 0.12167361378669739, + "step": 625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3047471046447754, + "epoch": 1.0032051282051282, + "grad_norm": 0.0346001572906971, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3034920394420624, + "epoch": 1.0048076923076923, + "grad_norm": 0.006263586226850748, + "learning_rate": 1e-06, + "loss": -0.0261, + "step": 627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3038324564695358, + "epoch": 1.0064102564102564, + "grad_norm": 0.005815045442432165, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15214.0, + "completions/max_terminated_length": 15214.0, + "completions/mean_length": 7960.427734375, + "completions/mean_terminated_length": 7960.427734375, + "completions/min_length": 3501.0, + "completions/min_terminated_length": 3501.0, + "entropy": 0.30271387100219727, + "epoch": 1.0080128205128205, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.007918753661215305, + "learning_rate": 1e-06, + "loss": -0.0069, + "num_tokens": 838452422.0, + "reward": 0.8900195956230164, + "reward_std": 0.024737173691391945, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9979817867279053, + "rewards/symbolic_reward_partial_score/std": 0.01871746964752674, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0692280530929565, + "sampling/importance_sampling_ratio/min": 8.266865188488737e-05, + "sampling/sampling_logp_difference/max": 9.400670051574707, + "sampling/sampling_logp_difference/mean": 0.12023815512657166, + "step": 629 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29866258800029755, + "epoch": 1.0096153846153846, + "grad_norm": 0.012961495667696, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 630 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.30047234892845154, + "epoch": 1.0112179487179487, + "grad_norm": 0.025549575686454773, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 631 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29905661940574646, + "epoch": 1.0128205128205128, + "grad_norm": 0.02982858195900917, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14581.0, + "completions/max_terminated_length": 14581.0, + "completions/mean_length": 7970.181640625, + "completions/mean_terminated_length": 7970.181640625, + "completions/min_length": 4062.0, + "completions/min_terminated_length": 4062.0, + "entropy": 0.2968658357858658, + "epoch": 1.0144230769230769, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005726849194616079, + "learning_rate": 1e-06, + "loss": -0.0211, + "num_tokens": 843435523.0, + "reward": 0.8917480707168579, + "reward_std": 0.02855183184146881, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9920247793197632, + "rewards/symbolic_reward_partial_score/std": 0.08819098025560379, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0682308673858643, + "sampling/importance_sampling_ratio/min": 0.0020298007875680923, + "sampling/sampling_logp_difference/max": 6.199817657470703, + "sampling/sampling_logp_difference/mean": 0.11880014836788177, + "step": 633 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2927432805299759, + "epoch": 1.016025641025641, + "grad_norm": 0.019428346306085587, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 634 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2961471676826477, + "epoch": 1.017628205128205, + "grad_norm": 0.004945802036672831, + "learning_rate": 1e-06, + "loss": -0.0203, + "step": 635 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29704806208610535, + "epoch": 1.0192307692307692, + "grad_norm": 0.03395095467567444, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13490.0, + "completions/max_terminated_length": 13490.0, + "completions/mean_length": 7647.15625, + "completions/mean_terminated_length": 7647.15625, + "completions/min_length": 2218.0, + "completions/min_terminated_length": 2218.0, + "entropy": 0.29956918954849243, + "epoch": 1.0208333333333333, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002243980998173356, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 848224323.0, + "reward": 0.8982422351837158, + "reward_std": 0.00703125074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.998046875, + "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0689256191253662, + "sampling/importance_sampling_ratio/min": 0.001507283071987331, + "sampling/sampling_logp_difference/max": 6.497446537017822, + "sampling/sampling_logp_difference/mean": 0.11993230879306793, + "step": 637 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29689641296863556, + "epoch": 1.0224358974358974, + "grad_norm": 0.0027434667572379112, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2978483736515045, + "epoch": 1.0240384615384615, + "grad_norm": 0.0023496190551668406, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30301567912101746, + "epoch": 1.0256410256410255, + "grad_norm": 0.002188858576118946, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14745.0, + "completions/max_terminated_length": 14745.0, + "completions/mean_length": 7850.69921875, + "completions/mean_terminated_length": 7850.69921875, + "completions/min_length": 2627.0, + "completions/min_terminated_length": 2627.0, + "entropy": 0.3039587587118149, + "epoch": 1.0272435897435896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 853077513.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0695734024047852, + "sampling/importance_sampling_ratio/min": 0.0015758582158014178, + "sampling/sampling_logp_difference/max": 6.45295524597168, + "sampling/sampling_logp_difference/mean": 0.12085698544979095, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3003881871700287, + "epoch": 1.0288461538461537, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30047284066677094, + "epoch": 1.0304487179487178, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29807648062705994, + "epoch": 1.032051282051282, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15055.0, + "completions/max_terminated_length": 15055.0, + "completions/mean_length": 8018.087890625, + "completions/mean_terminated_length": 8018.087890625, + "completions/min_length": 2745.0, + "completions/min_terminated_length": 2745.0, + "entropy": 0.29161442816257477, + "epoch": 1.0336538461538463, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.01743435487151146, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 858024662.0, + "reward": 0.8869141340255737, + "reward_std": 0.030482003465294838, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98046875, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.9954427480697632, + "rewards/symbolic_reward_partial_score/std": 0.05087684467434883, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067512035369873, + "sampling/importance_sampling_ratio/min": 0.000799354340415448, + "sampling/sampling_logp_difference/max": 7.131706237792969, + "sampling/sampling_logp_difference/mean": 0.11742247641086578, + "step": 645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.29346978664398193, + "epoch": 1.0352564102564104, + "grad_norm": 0.006802158895879984, + "learning_rate": 1e-06, + "loss": -0.0234, + "step": 646 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29226499795913696, + "epoch": 1.0368589743589745, + "grad_norm": 0.008012805134057999, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 647 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.29208989441394806, + "epoch": 1.0384615384615385, + "grad_norm": 0.021944493055343628, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13799.0, + "completions/mean_length": 8456.9765625, + "completions/mean_terminated_length": 8410.255859375, + "completions/min_length": 3337.0, + "completions/min_terminated_length": 3337.0, + "entropy": 0.2881511002779007, + "epoch": 1.0400641025641026, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.036767441779375076, + "learning_rate": 1e-06, + "loss": 0.0646, + "num_tokens": 863227770.0, + "reward": 0.8824414014816284, + "reward_std": 0.044292449951171875, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9863932132720947, + "rewards/symbolic_reward_partial_score/std": 0.11026550084352493, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0669612884521484, + "sampling/importance_sampling_ratio/min": 0.0011750105768442154, + "sampling/sampling_logp_difference/max": 6.746478080749512, + "sampling/sampling_logp_difference/mean": 0.11578300595283508, + "step": 649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2874348312616348, + "epoch": 1.0416666666666667, + "grad_norm": 0.008486775681376457, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 650 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2900143712759018, + "epoch": 1.0432692307692308, + "grad_norm": 0.01539598498493433, + "learning_rate": 1e-06, + "loss": -0.024, + "step": 651 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.28809335827827454, + "epoch": 1.044871794871795, + "grad_norm": 0.014531200751662254, + "learning_rate": 1e-06, + "loss": -0.0172, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13522.0, + "completions/max_terminated_length": 13522.0, + "completions/mean_length": 8069.59375, + "completions/mean_terminated_length": 8069.59375, + "completions/min_length": 4101.0, + "completions/min_terminated_length": 4101.0, + "entropy": 0.28572797775268555, + "epoch": 1.046474358974359, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.02514318935573101, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 868277498.0, + "reward": 0.8922950029373169, + "reward_std": 0.027725880965590477, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.997753918170929, + "rewards/symbolic_reward_partial_score/std": 0.026105698198080063, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0669407844543457, + "sampling/importance_sampling_ratio/min": 0.00189927127212286, + "sampling/sampling_logp_difference/max": 6.266284942626953, + "sampling/sampling_logp_difference/mean": 0.11612759530544281, + "step": 653 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.28615590929985046, + "epoch": 1.0480769230769231, + "grad_norm": 0.030614567920565605, + "learning_rate": 1e-06, + "loss": 0.0051, + "step": 654 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2873466908931732, + "epoch": 1.0496794871794872, + "grad_norm": 0.024968182668089867, + "learning_rate": 1e-06, + "loss": -0.0074, + "step": 655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.28713513910770416, + "epoch": 1.0512820512820513, + "grad_norm": 0.033524587750434875, + "learning_rate": 1e-06, + "loss": 0.0105, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12987.0, + "completions/max_terminated_length": 12987.0, + "completions/mean_length": 7668.623046875, + "completions/mean_terminated_length": 7668.623046875, + "completions/min_length": 2465.0, + "completions/min_terminated_length": 2465.0, + "entropy": 0.29844947159290314, + "epoch": 1.0528846153846154, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.034264422953128815, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 873047097.0, + "reward": 0.8963037729263306, + "reward_std": 0.014785156585276127, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9993977546691895, + "rewards/symbolic_reward_partial_score/std": 0.007960735820233822, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0682570934295654, + "sampling/importance_sampling_ratio/min": 0.004175328183919191, + "sampling/sampling_logp_difference/max": 5.478562355041504, + "sampling/sampling_logp_difference/mean": 0.11900214105844498, + "step": 657 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29740945994853973, + "epoch": 1.0544871794871795, + "grad_norm": 0.004157658200711012, + "learning_rate": 1e-06, + "loss": -0.0124, + "step": 658 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29370294511318207, + "epoch": 1.0560897435897436, + "grad_norm": 0.004620330408215523, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 659 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2955019474029541, + "epoch": 1.0576923076923077, + "grad_norm": 0.0033308968413621187, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14623.0, + "completions/max_terminated_length": 14623.0, + "completions/mean_length": 7852.88671875, + "completions/mean_terminated_length": 7852.88671875, + "completions/min_length": 3324.0, + "completions/min_terminated_length": 3324.0, + "entropy": 0.2860754281282425, + "epoch": 1.0592948717948718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 877923455.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066422700881958, + "sampling/importance_sampling_ratio/min": 0.001346131437458098, + "sampling/sampling_logp_difference/max": 6.610520362854004, + "sampling/sampling_logp_difference/mean": 0.11576870828866959, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2884225696325302, + "epoch": 1.060897435897436, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.28944897651672363, + "epoch": 1.0625, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.28575797379016876, + "epoch": 1.064102564102564, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12896.0, + "completions/max_terminated_length": 12896.0, + "completions/mean_length": 7736.224609375, + "completions/mean_terminated_length": 7736.224609375, + "completions/min_length": 2776.0, + "completions/min_terminated_length": 2776.0, + "entropy": 0.2841587960720062, + "epoch": 1.0657051282051282, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.014903567731380463, + "learning_rate": 1e-06, + "loss": -0.0349, + "num_tokens": 882813394.0, + "reward": 0.8854101896286011, + "reward_std": 0.04905132204294205, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.994335949420929, + "rewards/symbolic_reward_partial_score/std": 0.06388040632009506, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0652308464050293, + "sampling/importance_sampling_ratio/min": 0.0015026178443804383, + "sampling/sampling_logp_difference/max": 6.500546455383301, + "sampling/sampling_logp_difference/mean": 0.11406149715185165, + "step": 665 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.28192271292209625, + "epoch": 1.0673076923076923, + "grad_norm": 0.03575403615832329, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 666 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.28293684124946594, + "epoch": 1.0689102564102564, + "grad_norm": 0.03577136993408203, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 667 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.282418817281723, + "epoch": 1.0705128205128205, + "grad_norm": 0.01990150660276413, + "learning_rate": 1e-06, + "loss": -0.0136, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13928.0, + "completions/max_terminated_length": 13928.0, + "completions/mean_length": 7708.44140625, + "completions/mean_terminated_length": 7708.44140625, + "completions/min_length": 2868.0, + "completions/min_terminated_length": 2868.0, + "entropy": 0.28265492618083954, + "epoch": 1.0721153846153846, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.030519738793373108, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 887610468.0, + "reward": 0.8933585286140442, + "reward_std": 0.02347153052687645, + "rewards/progression_diversity/mean": -8.656460704514757e-05, + "rewards/progression_diversity/std": 0.0019587334245443344, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, + "rewards/symbolic_reward_partial_score/std": 0.04477177560329437, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0654292106628418, + "sampling/importance_sampling_ratio/min": 0.003087102435529232, + "sampling/sampling_logp_difference/max": 5.780522346496582, + "sampling/sampling_logp_difference/mean": 0.11411414295434952, + "step": 669 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2829418480396271, + "epoch": 1.0737179487179487, + "grad_norm": 0.03352310508489609, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 670 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.281900018453598, + "epoch": 1.0753205128205128, + "grad_norm": 0.005207178648561239, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 671 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2834167182445526, + "epoch": 1.0769230769230769, + "grad_norm": 0.006832549348473549, + "learning_rate": 1e-06, + "loss": -0.0133, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13941.0, + "completions/max_terminated_length": 13941.0, + "completions/mean_length": 8134.1484375, + "completions/mean_terminated_length": 8134.1484375, + "completions/min_length": 4374.0, + "completions/min_terminated_length": 4374.0, + "entropy": 0.2623938024044037, + "epoch": 1.078525641025641, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.017798224464058876, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 892704352.0, + "reward": 0.8955559730529785, + "reward_std": 0.01373315043747425, + "rewards/progression_diversity/mean": -7.182326226029545e-05, + "rewards/progression_diversity/std": 0.0012371126795187593, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, + "rewards/symbolic_reward_partial_score/std": 0.04949941113591194, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0605111122131348, + "sampling/importance_sampling_ratio/min": 0.0017345065716654062, + "sampling/sampling_logp_difference/max": 6.357032299041748, + "sampling/sampling_logp_difference/mean": 0.10625582188367844, + "step": 673 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2608880549669266, + "epoch": 1.080128205128205, + "grad_norm": 0.003827364183962345, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.25945594906806946, + "epoch": 1.0817307692307692, + "grad_norm": 0.004291590768843889, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 675 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2584989666938782, + "epoch": 1.0833333333333333, + "grad_norm": 0.029342349618673325, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15488.0, + "completions/mean_length": 7833.21484375, + "completions/mean_terminated_length": 7816.4814453125, + "completions/min_length": 3418.0, + "completions/min_terminated_length": 3418.0, + "entropy": 0.27159421145915985, + "epoch": 1.0849358974358974, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.02988434210419655, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 897556846.0, + "reward": 0.8891208171844482, + "reward_std": 0.027503937482833862, + "rewards/progression_diversity/mean": -2.9832091968273744e-05, + "rewards/progression_diversity/std": 0.0006708584260195494, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9956380128860474, + "rewards/symbolic_reward_partial_score/std": 0.051393549889326096, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062467336654663, + "sampling/importance_sampling_ratio/min": 0.0017893225885927677, + "sampling/sampling_logp_difference/max": 6.325918197631836, + "sampling/sampling_logp_difference/mean": 0.10957305133342743, + "step": 677 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.26841987669467926, + "epoch": 1.0865384615384615, + "grad_norm": 0.012452783063054085, + "learning_rate": 1e-06, + "loss": -0.0202, + "step": 678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.27052880823612213, + "epoch": 1.0881410256410255, + "grad_norm": 0.017503680661320686, + "learning_rate": 1e-06, + "loss": -0.0126, + "step": 679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2656054198741913, + "epoch": 1.0897435897435896, + "grad_norm": 0.025547392666339874, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15721.0, + "completions/mean_length": 7773.837890625, + "completions/mean_terminated_length": 7740.0732421875, + "completions/min_length": 2609.0, + "completions/min_terminated_length": 2609.0, + "entropy": 0.2652207016944885, + "epoch": 1.0913461538461537, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.007663046941161156, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 902448811.0, + "reward": 0.8927390575408936, + "reward_std": 0.02515510842204094, + "rewards/progression_diversity/mean": -2.1295189071679488e-05, + "rewards/progression_diversity/std": 0.00048185509513132274, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9972819089889526, + "rewards/symbolic_reward_partial_score/std": 0.04502078890800476, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0612239837646484, + "sampling/importance_sampling_ratio/min": 0.002235043328255415, + "sampling/sampling_logp_difference/max": 6.103494644165039, + "sampling/sampling_logp_difference/mean": 0.10745704174041748, + "step": 681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2610100209712982, + "epoch": 1.092948717948718, + "grad_norm": 0.038671281188726425, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 682 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.26077619194984436, + "epoch": 1.094551282051282, + "grad_norm": 0.01831183023750782, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 683 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.26326900720596313, + "epoch": 1.0961538461538463, + "grad_norm": 0.005235353950411081, + "learning_rate": 1e-06, + "loss": -0.0075, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13355.0, + "completions/max_terminated_length": 13355.0, + "completions/mean_length": 7589.1484375, + "completions/mean_terminated_length": 7589.1484375, + "completions/min_length": 2987.0, + "completions/min_terminated_length": 2987.0, + "entropy": 0.26450496912002563, + "epoch": 1.0977564102564104, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.03519367799162865, + "learning_rate": 1e-06, + "loss": 0.043, + "num_tokens": 907172343.0, + "reward": 0.8846186399459839, + "reward_std": 0.046957530081272125, + "rewards/progression_diversity/mean": -5.9372316172812134e-05, + "rewards/progression_diversity/std": 0.0009608343243598938, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.99169921875, + "rewards/symbolic_reward_partial_score/std": 0.07279406487941742, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0615901947021484, + "sampling/importance_sampling_ratio/min": 0.0027771987952291965, + "sampling/sampling_logp_difference/max": 5.886312484741211, + "sampling/sampling_logp_difference/mean": 0.1085769385099411, + "step": 685 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2648194283246994, + "epoch": 1.0993589743589745, + "grad_norm": 0.017659511417150497, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 686 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.26925028860569, + "epoch": 1.1009615384615385, + "grad_norm": 0.006832172628492117, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 687 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.25970420241355896, + "epoch": 1.1025641025641026, + "grad_norm": 0.008957959711551666, + "learning_rate": 1e-06, + "loss": -0.009, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13287.0, + "completions/mean_length": 7613.986328125, + "completions/mean_terminated_length": 7579.5947265625, + "completions/min_length": 2514.0, + "completions/min_terminated_length": 2514.0, + "entropy": 0.25761424005031586, + "epoch": 1.1041666666666667, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.02380330301821232, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 911937200.0, + "reward": 0.8876208066940308, + "reward_std": 0.04614756256341934, + "rewards/progression_diversity/mean": -0.0001286342740058899, + "rewards/progression_diversity/std": 0.0029106612782925367, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9912923574447632, + "rewards/symbolic_reward_partial_score/std": 0.08297867327928543, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609831809997559, + "sampling/importance_sampling_ratio/min": 0.0015549632953479886, + "sampling/sampling_logp_difference/max": 6.46630334854126, + "sampling/sampling_logp_difference/mean": 0.10692013055086136, + "step": 689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2630904018878937, + "epoch": 1.1057692307692308, + "grad_norm": 0.015310103073716164, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 690 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.26416918635368347, + "epoch": 1.107371794871795, + "grad_norm": 0.01843937300145626, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 691 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2623608559370041, + "epoch": 1.108974358974359, + "grad_norm": 0.006605224683880806, + "learning_rate": 1e-06, + "loss": -0.0174, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14738.0, + "completions/max_terminated_length": 14738.0, + "completions/mean_length": 7770.841796875, + "completions/mean_terminated_length": 7770.841796875, + "completions/min_length": 2381.0, + "completions/min_terminated_length": 2381.0, + "entropy": 0.2696673274040222, + "epoch": 1.1105769230769231, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.003520580241456628, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 916765599.0, + "reward": 0.89599609375, + "reward_std": 0.01601562649011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, + "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0617854595184326, + "sampling/importance_sampling_ratio/min": 0.0025482482742518187, + "sampling/sampling_logp_difference/max": 5.972349166870117, + "sampling/sampling_logp_difference/mean": 0.10866742581129074, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.26511339843273163, + "epoch": 1.1121794871794872, + "grad_norm": 0.020077740773558617, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 694 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2642548382282257, + "epoch": 1.1137820512820513, + "grad_norm": 0.0034260887186974287, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 695 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.26379284262657166, + "epoch": 1.1153846153846154, + "grad_norm": 0.0034673307090997696, + "learning_rate": 1e-06, + "loss": -0.0128, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14377.0, + "completions/max_terminated_length": 14377.0, + "completions/mean_length": 7378.373046875, + "completions/mean_terminated_length": 7378.373046875, + "completions/min_length": 2058.0, + "completions/min_terminated_length": 2058.0, + "entropy": 0.2753664702177048, + "epoch": 1.1169871794871795, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.02176603674888611, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 921404366.0, + "reward": 0.8903019428253174, + "reward_std": 0.022549103945493698, + "rewards/progression_diversity/mean": -8.592897211201489e-05, + "rewards/progression_diversity/std": 0.001514117349870503, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9950195550918579, + "rewards/symbolic_reward_partial_score/std": 0.06346382945775986, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0625696182250977, + "sampling/importance_sampling_ratio/min": 0.0005904682911932468, + "sampling/sampling_logp_difference/max": 7.434594631195068, + "sampling/sampling_logp_difference/mean": 0.11063844710588455, + "step": 697 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2678314447402954, + "epoch": 1.1185897435897436, + "grad_norm": 0.007807692512869835, + "learning_rate": 1e-06, + "loss": -0.0285, + "step": 698 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.2715350091457367, + "epoch": 1.1201923076923077, + "grad_norm": 0.012418070808053017, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 699 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2664782702922821, + "epoch": 1.1217948717948718, + "grad_norm": 0.023162173107266426, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14574.0, + "completions/mean_length": 7569.21484375, + "completions/mean_terminated_length": 7534.6474609375, + "completions/min_length": 1983.0, + "completions/min_terminated_length": 1983.0, + "entropy": 0.2645212560892105, + "epoch": 1.123397435897436, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.02870558202266693, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 926171516.0, + "reward": 0.8899610042572021, + "reward_std": 0.0324556939303875, + "rewards/progression_diversity/mean": -5.122274160385132e-07, + "rewards/progression_diversity/std": 1.1590383110160474e-05, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9945312738418579, + "rewards/symbolic_reward_partial_score/std": 0.06467118859291077, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0612825155258179, + "sampling/importance_sampling_ratio/min": 0.0015629983972758055, + "sampling/sampling_logp_difference/max": 6.461149215698242, + "sampling/sampling_logp_difference/mean": 0.10810823738574982, + "step": 701 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2682636231184006, + "epoch": 1.125, + "grad_norm": 0.005539908539503813, + "learning_rate": 1e-06, + "loss": 0.0636, + "step": 702 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2629767805337906, + "epoch": 1.126602564102564, + "grad_norm": 0.0060573117807507515, + "learning_rate": 1e-06, + "loss": -0.0252, + "step": 703 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2614084780216217, + "epoch": 1.1282051282051282, + "grad_norm": 0.005166211631149054, + "learning_rate": 1e-06, + "loss": -0.0221, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14494.0, + "completions/mean_length": 7480.783203125, + "completions/mean_terminated_length": 7463.35986328125, + "completions/min_length": 2870.0, + "completions/min_terminated_length": 2870.0, + "entropy": 0.2685774117708206, + "epoch": 1.1298076923076923, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.008230074308812618, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 930796077.0, + "reward": 0.8889119625091553, + "reward_std": 0.04125416278839111, + "rewards/progression_diversity/mean": -0.00040413427632302046, + "rewards/progression_diversity/std": 0.004968197550624609, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9949544668197632, + "rewards/symbolic_reward_partial_score/std": 0.06327638775110245, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0628039836883545, + "sampling/importance_sampling_ratio/min": 0.0042944494634866714, + "sampling/sampling_logp_difference/max": 5.450431823730469, + "sampling/sampling_logp_difference/mean": 0.11043090373277664, + "step": 705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.27061736583709717, + "epoch": 1.1314102564102564, + "grad_norm": 0.006081659346818924, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.27226053178310394, + "epoch": 1.1330128205128205, + "grad_norm": 0.019175931811332703, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 707 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.27494950592517853, + "epoch": 1.1346153846153846, + "grad_norm": 0.019779911264777184, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 7486.875, + "completions/mean_terminated_length": 7451.98486328125, + "completions/min_length": 1773.0, + "completions/min_terminated_length": 1773.0, + "entropy": 0.2697181850671768, + "epoch": 1.1362179487179487, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.00622901227325201, + "learning_rate": 1e-06, + "loss": -0.0269, + "num_tokens": 935515357.0, + "reward": 0.8910156488418579, + "reward_std": 0.03098640777170658, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.9947916865348816, + "rewards/symbolic_reward_partial_score/std": 0.06532405316829681, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0624651908874512, + "sampling/importance_sampling_ratio/min": 0.0015064862091094255, + "sampling/sampling_logp_difference/max": 6.4979753494262695, + "sampling/sampling_logp_difference/mean": 0.10939180850982666, + "step": 709 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2662170082330704, + "epoch": 1.1378205128205128, + "grad_norm": 0.03273667022585869, + "learning_rate": 1e-06, + "loss": 0.0403, + "step": 710 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2710031718015671, + "epoch": 1.1394230769230769, + "grad_norm": 0.006321210414171219, + "learning_rate": 1e-06, + "loss": -0.0072, + "step": 711 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2719338834285736, + "epoch": 1.141025641025641, + "grad_norm": 0.02884542942047119, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12891.0, + "completions/max_terminated_length": 12891.0, + "completions/mean_length": 6856.3828125, + "completions/mean_terminated_length": 6856.3828125, + "completions/min_length": 2791.0, + "completions/min_terminated_length": 2791.0, + "entropy": 0.2899167388677597, + "epoch": 1.142628205128205, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.005113643128424883, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 939780225.0, + "reward": 0.8940412998199463, + "reward_std": 0.020740434527397156, + "rewards/progression_diversity/mean": -0.0001685535826254636, + "rewards/progression_diversity/std": 0.00280111120082438, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, + "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066714882850647, + "sampling/importance_sampling_ratio/min": 0.0023814705200493336, + "sampling/sampling_logp_difference/max": 6.040037155151367, + "sampling/sampling_logp_difference/mean": 0.11683158576488495, + "step": 713 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2890501618385315, + "epoch": 1.1442307692307692, + "grad_norm": 0.025946229696273804, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 714 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.28599461913108826, + "epoch": 1.1458333333333333, + "grad_norm": 0.009792243130505085, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 715 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28565630316734314, + "epoch": 1.1474358974358974, + "grad_norm": 0.004287914838641882, + "learning_rate": 1e-06, + "loss": -0.0057, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14698.0, + "completions/mean_length": 6872.3984375, + "completions/mean_terminated_length": 6853.78466796875, + "completions/min_length": 1888.0, + "completions/min_terminated_length": 1888.0, + "entropy": 0.28366775810718536, + "epoch": 1.1490384615384615, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.016366643831133842, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 944098141.0, + "reward": 0.8756428956985474, + "reward_std": 0.04919969290494919, + "rewards/progression_diversity/mean": -0.0006512624095194042, + "rewards/progression_diversity/std": 0.014736386016011238, + "rewards/symbolic_reward_accuracy/mean": 0.96484375, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.9904459714889526, + "rewards/symbolic_reward_partial_score/std": 0.07924753427505493, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0652272701263428, + "sampling/importance_sampling_ratio/min": 0.0023581604473292828, + "sampling/sampling_logp_difference/max": 6.049873352050781, + "sampling/sampling_logp_difference/mean": 0.11434115469455719, + "step": 717 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2845660150051117, + "epoch": 1.1506410256410255, + "grad_norm": 0.021158399060368538, + "learning_rate": 1e-06, + "loss": -0.0064, + "step": 718 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28018422424793243, + "epoch": 1.1522435897435896, + "grad_norm": 0.015796631574630737, + "learning_rate": 1e-06, + "loss": 0.0102, + "step": 719 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2849064916372299, + "epoch": 1.1538461538461537, + "grad_norm": 0.012021268717944622, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16245.0, + "completions/mean_length": 8046.552734375, + "completions/mean_terminated_length": 8013.857421875, + "completions/min_length": 3661.0, + "completions/min_terminated_length": 3661.0, + "entropy": 0.25592416524887085, + "epoch": 1.155448717948718, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.010126116685569286, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 949275272.0, + "reward": 0.8764989376068115, + "reward_std": 0.06346848607063293, + "rewards/progression_diversity/mean": -1.028479528031312e-05, + "rewards/progression_diversity/std": 0.00023271834652405232, + "rewards/symbolic_reward_accuracy/mean": 0.966796875, + "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, + "rewards/symbolic_reward_partial_score/mean": 0.9893717169761658, + "rewards/symbolic_reward_partial_score/std": 0.08977557718753815, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0593876838684082, + "sampling/importance_sampling_ratio/min": 0.00013644673163071275, + "sampling/sampling_logp_difference/max": 8.899576187133789, + "sampling/sampling_logp_difference/mean": 0.10469293594360352, + "step": 721 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2578028440475464, + "epoch": 1.157051282051282, + "grad_norm": 0.012335097417235374, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 722 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2576797902584076, + "epoch": 1.1586538461538463, + "grad_norm": 0.02756766602396965, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 723 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.25286468863487244, + "epoch": 1.1602564102564104, + "grad_norm": 0.02716866135597229, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13557.0, + "completions/mean_length": 7729.5390625, + "completions/mean_terminated_length": 7678.53076171875, + "completions/min_length": 3962.0, + "completions/min_terminated_length": 3962.0, + "entropy": 0.2743615508079529, + "epoch": 1.1618589743589745, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.012762167491018772, + "learning_rate": 1e-06, + "loss": -0.0301, + "num_tokens": 954120252.0, + "reward": 0.8848584890365601, + "reward_std": 0.041579727083444595, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9944498538970947, + "rewards/symbolic_reward_partial_score/std": 0.06356307864189148, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.063340663909912, + "sampling/importance_sampling_ratio/min": 0.0027780013624578714, + "sampling/sampling_logp_difference/max": 5.88602352142334, + "sampling/sampling_logp_difference/mean": 0.11120878905057907, + "step": 725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2750166952610016, + "epoch": 1.1634615384615385, + "grad_norm": 0.02495425008237362, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 726 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2725471556186676, + "epoch": 1.1650641025641026, + "grad_norm": 0.0408598817884922, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 727 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.27416326105594635, + "epoch": 1.1666666666666667, + "grad_norm": 0.009657352231442928, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14030.0, + "completions/mean_length": 7417.345703125, + "completions/mean_terminated_length": 7328.91748046875, + "completions/min_length": 3062.0, + "completions/min_terminated_length": 3062.0, + "entropy": 0.2749616950750351, + "epoch": 1.1682692307692308, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.023918617516756058, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 958729549.0, + "reward": 0.890673816204071, + "reward_std": 0.02003018744289875, + "rewards/progression_diversity/mean": -1.871625840976776e-06, + "rewards/progression_diversity/std": 4.235006053932011e-05, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.99169921875, + "rewards/symbolic_reward_partial_score/std": 0.08877533674240112, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0633912086486816, + "sampling/importance_sampling_ratio/min": 0.002192102139815688, + "sampling/sampling_logp_difference/max": 6.122894287109375, + "sampling/sampling_logp_difference/mean": 0.11154164373874664, + "step": 729 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.27727919816970825, + "epoch": 1.169871794871795, + "grad_norm": 0.01305021345615387, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 730 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.27219584584236145, + "epoch": 1.171474358974359, + "grad_norm": 0.008659505285322666, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 731 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2767972946166992, + "epoch": 1.1730769230769231, + "grad_norm": 0.0262775719165802, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15356.0, + "completions/mean_length": 7562.185546875, + "completions/mean_terminated_length": 7544.921875, + "completions/min_length": 1964.0, + "completions/min_terminated_length": 1964.0, + "entropy": 0.2714828997850418, + "epoch": 1.1746794871794872, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.026525314897298813, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 963491644.0, + "reward": 0.8770654201507568, + "reward_std": 0.06552450358867645, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.96875, + "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, + "rewards/symbolic_reward_partial_score/mean": 0.9867024421691895, + "rewards/symbolic_reward_partial_score/std": 0.09885963052511215, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0629959106445312, + "sampling/importance_sampling_ratio/min": 8.583244925830513e-05, + "sampling/sampling_logp_difference/max": 9.363113403320312, + "sampling/sampling_logp_difference/mean": 0.11046160757541656, + "step": 733 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.2706471383571625, + "epoch": 1.1762820512820513, + "grad_norm": 0.022599341347813606, + "learning_rate": 1e-06, + "loss": -0.0197, + "step": 734 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2723312973976135, + "epoch": 1.1778846153846154, + "grad_norm": 0.026649735867977142, + "learning_rate": 1e-06, + "loss": 0.0485, + "step": 735 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.27444125711917877, + "epoch": 1.1794871794871795, + "grad_norm": 0.01070537231862545, + "learning_rate": 1e-06, + "loss": -0.0264, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13870.0, + "completions/max_terminated_length": 13870.0, + "completions/mean_length": 7185.0625, + "completions/mean_terminated_length": 7185.0625, + "completions/min_length": 2770.0, + "completions/min_terminated_length": 2770.0, + "entropy": 0.2854379415512085, + "epoch": 1.1810897435897436, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0061562443152070045, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 968018860.0, + "reward": 0.8901855945587158, + "reward_std": 0.02915896289050579, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.99462890625, + "rewards/symbolic_reward_partial_score/std": 0.06525672227144241, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065285563468933, + "sampling/importance_sampling_ratio/min": 0.0010467558167874813, + "sampling/sampling_logp_difference/max": 6.862059593200684, + "sampling/sampling_logp_difference/mean": 0.11443185061216354, + "step": 737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28180165588855743, + "epoch": 1.1826923076923077, + "grad_norm": 0.007420375477522612, + "learning_rate": 1e-06, + "loss": -0.0116, + "step": 738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28390932083129883, + "epoch": 1.1842948717948718, + "grad_norm": 0.005746510811150074, + "learning_rate": 1e-06, + "loss": -0.0073, + "step": 739 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2805173099040985, + "epoch": 1.185897435897436, + "grad_norm": 0.010871785692870617, + "learning_rate": 1e-06, + "loss": -0.0052, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14873.0, + "completions/max_terminated_length": 14873.0, + "completions/mean_length": 7421.39453125, + "completions/mean_terminated_length": 7421.39453125, + "completions/min_length": 2053.0, + "completions/min_terminated_length": 2053.0, + "entropy": 0.2859332859516144, + "epoch": 1.1875, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.015115713700652122, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 972645830.0, + "reward": 0.8908447623252869, + "reward_std": 0.029325148090720177, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.9935709834098816, + "rewards/symbolic_reward_partial_score/std": 0.07672799378633499, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065746784210205, + "sampling/importance_sampling_ratio/min": 0.0030088303610682487, + "sampling/sampling_logp_difference/max": 5.806203842163086, + "sampling/sampling_logp_difference/mean": 0.11504846811294556, + "step": 741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2846004366874695, + "epoch": 1.189102564102564, + "grad_norm": 0.02620539255440235, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28722913563251495, + "epoch": 1.1907051282051282, + "grad_norm": 0.021206529811024666, + "learning_rate": 1e-06, + "loss": -0.0107, + "step": 743 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28755249083042145, + "epoch": 1.1923076923076923, + "grad_norm": 0.014355083927512169, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15088.0, + "completions/max_terminated_length": 15088.0, + "completions/mean_length": 7411.068359375, + "completions/mean_terminated_length": 7411.068359375, + "completions/min_length": 2300.0, + "completions/min_terminated_length": 2300.0, + "entropy": 0.287930428981781, + "epoch": 1.1939102564102564, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0049547115340828896, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 977202697.0, + "reward": 0.8944531679153442, + "reward_std": 0.018543953076004982, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9971354007720947, + "rewards/symbolic_reward_partial_score/std": 0.04583593085408211, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0666286945343018, + "sampling/importance_sampling_ratio/min": 0.0015383908757939935, + "sampling/sampling_logp_difference/max": 6.477018356323242, + "sampling/sampling_logp_difference/mean": 0.11660847067832947, + "step": 745 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29048343002796173, + "epoch": 1.1955128205128205, + "grad_norm": 0.02166350558400154, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 746 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2873390465974808, + "epoch": 1.1971153846153846, + "grad_norm": 0.0050978874787688255, + "learning_rate": 1e-06, + "loss": -0.0144, + "step": 747 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29117564857006073, + "epoch": 1.1987179487179487, + "grad_norm": 0.017066776752471924, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15377.0, + "completions/mean_length": 7712.28125, + "completions/mean_terminated_length": 7695.31103515625, + "completions/min_length": 2399.0, + "completions/min_terminated_length": 2399.0, + "entropy": 0.27627554535865784, + "epoch": 1.2003205128205128, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.024169372394680977, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 982081785.0, + "reward": 0.8903414011001587, + "reward_std": 0.018626626580953598, + "rewards/progression_diversity/mean": -4.191866173641756e-05, + "rewards/progression_diversity/std": 0.00094851094763726, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9964518547058105, + "rewards/symbolic_reward_partial_score/std": 0.048222340643405914, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0641599893569946, + "sampling/importance_sampling_ratio/min": 0.0002658157900441438, + "sampling/sampling_logp_difference/max": 8.232707023620605, + "sampling/sampling_logp_difference/mean": 0.11234469711780548, + "step": 749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2774530202150345, + "epoch": 1.2019230769230769, + "grad_norm": 0.01591247320175171, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 750 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.27769312262535095, + "epoch": 1.203525641025641, + "grad_norm": 0.005623947829008102, + "learning_rate": 1e-06, + "loss": -0.0104, + "step": 751 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.27439242601394653, + "epoch": 1.205128205128205, + "grad_norm": 0.011445503681898117, + "learning_rate": 1e-06, + "loss": -0.0085, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15583.0, + "completions/mean_length": 7719.01171875, + "completions/mean_terminated_length": 7685.03173828125, + "completions/min_length": 2215.0, + "completions/min_terminated_length": 2215.0, + "entropy": 0.2831403911113739, + "epoch": 1.2067307692307692, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.012956078164279461, + "learning_rate": 1e-06, + "loss": -0.026, + "num_tokens": 986945087.0, + "reward": 0.8807443380355835, + "reward_std": 0.057317137718200684, + "rewards/progression_diversity/mean": -0.0002774419845081866, + "rewards/progression_diversity/std": 0.006277794949710369, + "rewards/symbolic_reward_accuracy/mean": 0.97265625, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.9918131828308105, + "rewards/symbolic_reward_partial_score/std": 0.06508524715900421, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0649077892303467, + "sampling/importance_sampling_ratio/min": 0.002849755110219121, + "sampling/sampling_logp_difference/max": 5.860522270202637, + "sampling/sampling_logp_difference/mean": 0.11349144577980042, + "step": 753 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28055115044116974, + "epoch": 1.2083333333333333, + "grad_norm": 0.02566658891737461, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 754 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.27944353222846985, + "epoch": 1.2099358974358974, + "grad_norm": 0.028652476146817207, + "learning_rate": 1e-06, + "loss": 0.0305, + "step": 755 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.28025032579898834, + "epoch": 1.2115384615384615, + "grad_norm": 0.017124585807323456, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14722.0, + "completions/mean_length": 8052.341796875, + "completions/mean_terminated_length": 8036.037109375, + "completions/min_length": 3876.0, + "completions/min_terminated_length": 3876.0, + "entropy": 0.277658149600029, + "epoch": 1.2131410256410255, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.03241975978016853, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 992013422.0, + "reward": 0.8829687833786011, + "reward_std": 0.05664096772670746, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9765625, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.9907552003860474, + "rewards/symbolic_reward_partial_score/std": 0.08320756256580353, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0649545192718506, + "sampling/importance_sampling_ratio/min": 0.0018924587639048696, + "sampling/sampling_logp_difference/max": 6.269878387451172, + "sampling/sampling_logp_difference/mean": 0.11347892135381699, + "step": 757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.277600422501564, + "epoch": 1.2147435897435896, + "grad_norm": 0.033942461013793945, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2830449342727661, + "epoch": 1.2163461538461537, + "grad_norm": 0.009825375862419605, + "learning_rate": 1e-06, + "loss": -0.0332, + "step": 759 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.282797247171402, + "epoch": 1.217948717948718, + "grad_norm": 0.009744114242494106, + "learning_rate": 1e-06, + "loss": -0.0309, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15098.0, + "completions/max_terminated_length": 15098.0, + "completions/mean_length": 7623.50390625, + "completions/mean_terminated_length": 7623.50390625, + "completions/min_length": 3855.0, + "completions/min_terminated_length": 3855.0, + "entropy": 0.2917735129594803, + "epoch": 1.219551282051282, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0076350378803908825, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 996748624.0, + "reward": 0.8948044776916504, + "reward_std": 0.01744021661579609, + "rewards/progression_diversity/mean": -2.3829052224755287e-05, + "rewards/progression_diversity/std": 0.0005391899030655622, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9983072876930237, + "rewards/symbolic_reward_partial_score/std": 0.019349031150341034, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0678482055664062, + "sampling/importance_sampling_ratio/min": 0.001051880419254303, + "sampling/sampling_logp_difference/max": 6.857175827026367, + "sampling/sampling_logp_difference/mean": 0.11811263114213943, + "step": 761 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2913740575313568, + "epoch": 1.2211538461538463, + "grad_norm": 0.004803275689482689, + "learning_rate": 1e-06, + "loss": -0.0062, + "step": 762 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.28947630524635315, + "epoch": 1.2227564102564104, + "grad_norm": 0.004451874177902937, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 763 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2937081605195999, + "epoch": 1.2243589743589745, + "grad_norm": 0.028366444632411003, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13416.0, + "completions/mean_length": 7715.34765625, + "completions/mean_terminated_length": 7698.38330078125, + "completions/min_length": 2870.0, + "completions/min_terminated_length": 2870.0, + "entropy": 0.293381005525589, + "epoch": 1.2259615384615385, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004608824849128723, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 1001542498.0, + "reward": 0.8941406607627869, + "reward_std": 0.02034306898713112, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9967448115348816, + "rewards/symbolic_reward_partial_score/std": 0.04962607100605965, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0679861307144165, + "sampling/importance_sampling_ratio/min": 0.0015377042582258582, + "sampling/sampling_logp_difference/max": 6.47746467590332, + "sampling/sampling_logp_difference/mean": 0.11805201321840286, + "step": 765 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2906278818845749, + "epoch": 1.2275641025641026, + "grad_norm": 0.03442588075995445, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29679957032203674, + "epoch": 1.2291666666666667, + "grad_norm": 0.0050870683044195175, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 767 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29432834684848785, + "epoch": 1.2307692307692308, + "grad_norm": 0.004784552846103907, + "learning_rate": 1e-06, + "loss": -0.0157, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13010.0, + "completions/max_terminated_length": 13010.0, + "completions/mean_length": 7732.798828125, + "completions/mean_terminated_length": 7732.798828125, + "completions/min_length": 2400.0, + "completions/min_terminated_length": 2400.0, + "entropy": 0.29547902941703796, + "epoch": 1.232371794871795, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.012175563722848892, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 1006352315.0, + "reward": 0.8856445550918579, + "reward_std": 0.03763294965028763, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9951171875, + "rewards/symbolic_reward_partial_score/std": 0.051642172038555145, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0691790580749512, + "sampling/importance_sampling_ratio/min": 0.0011769047705456614, + "sampling/sampling_logp_difference/max": 6.744867324829102, + "sampling/sampling_logp_difference/mean": 0.11996014416217804, + "step": 769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2981226444244385, + "epoch": 1.233974358974359, + "grad_norm": 0.010661444626748562, + "learning_rate": 1e-06, + "loss": -0.0197, + "step": 770 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2980218082666397, + "epoch": 1.2355769230769231, + "grad_norm": 0.011630581691861153, + "learning_rate": 1e-06, + "loss": -0.0277, + "step": 771 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.29873549938201904, + "epoch": 1.2371794871794872, + "grad_norm": 0.012972732074558735, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14956.0, + "completions/max_terminated_length": 14956.0, + "completions/mean_length": 7541.69921875, + "completions/mean_terminated_length": 7541.69921875, + "completions/min_length": 3751.0, + "completions/min_terminated_length": 3751.0, + "entropy": 0.29750749468803406, + "epoch": 1.2387820512820513, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.031947024166584015, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 1011072049.0, + "reward": 0.8877832293510437, + "reward_std": 0.038269512355327606, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.982421875, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.994433581829071, + "rewards/symbolic_reward_partial_score/std": 0.06408015638589859, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0681447982788086, + "sampling/importance_sampling_ratio/min": 0.0004270588397048414, + "sampling/sampling_logp_difference/max": 7.758588790893555, + "sampling/sampling_logp_difference/mean": 0.11879236251115799, + "step": 773 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2897566109895706, + "epoch": 1.2403846153846154, + "grad_norm": 0.03253176808357239, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 774 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29233625531196594, + "epoch": 1.2419871794871795, + "grad_norm": 0.028909306973218918, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 775 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28808602690696716, + "epoch": 1.2435897435897436, + "grad_norm": 0.03054192289710045, + "learning_rate": 1e-06, + "loss": -0.0073, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 7913.501953125, + "completions/mean_terminated_length": 7829.966796875, + "completions/min_length": 3547.0, + "completions/min_terminated_length": 3547.0, + "entropy": 0.2850220203399658, + "epoch": 1.2451923076923077, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.014626838266849518, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 1016047730.0, + "reward": 0.8811328411102295, + "reward_std": 0.0615340992808342, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9765625, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.9872395992279053, + "rewards/symbolic_reward_partial_score/std": 0.10803718864917755, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0665440559387207, + "sampling/importance_sampling_ratio/min": 2.4944363758550026e-05, + "sampling/sampling_logp_difference/max": 10.598862648010254, + "sampling/sampling_logp_difference/mean": 0.11551474034786224, + "step": 777 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2869865745306015, + "epoch": 1.2467948717948718, + "grad_norm": 0.03926634415984154, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 778 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2825475335121155, + "epoch": 1.248397435897436, + "grad_norm": 0.03181926906108856, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 779 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.28822551667690277, + "epoch": 1.25, + "grad_norm": 0.013265200890600681, + "learning_rate": 1e-06, + "loss": -0.0185, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14623.0, + "completions/max_terminated_length": 14623.0, + "completions/mean_length": 7728.783203125, + "completions/mean_terminated_length": 7728.783203125, + "completions/min_length": 3294.0, + "completions/min_terminated_length": 3294.0, + "entropy": 0.29114533960819244, + "epoch": 1.251602564102564, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.030667047947645187, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 1020910579.0, + "reward": 0.8891504406929016, + "reward_std": 0.02797744609415531, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9950846433639526, + "rewards/symbolic_reward_partial_score/std": 0.06306508928537369, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068063735961914, + "sampling/importance_sampling_ratio/min": 0.0007180717075243592, + "sampling/sampling_logp_difference/max": 7.238941192626953, + "sampling/sampling_logp_difference/mean": 0.11831213533878326, + "step": 781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2906217724084854, + "epoch": 1.2532051282051282, + "grad_norm": 0.006525769364088774, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29298603534698486, + "epoch": 1.2548076923076923, + "grad_norm": 0.007502012420445681, + "learning_rate": 1e-06, + "loss": -0.0171, + "step": 783 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29166728258132935, + "epoch": 1.2564102564102564, + "grad_norm": 0.020680276677012444, + "learning_rate": 1e-06, + "loss": -0.0086, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14149.0, + "completions/max_terminated_length": 14149.0, + "completions/mean_length": 7387.083984375, + "completions/mean_terminated_length": 7387.083984375, + "completions/min_length": 2680.0, + "completions/min_terminated_length": 2680.0, + "entropy": 0.3060101866722107, + "epoch": 1.2580128205128205, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0034628999419510365, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 1025487758.0, + "reward": 0.8975389003753662, + "reward_std": 0.006725301966071129, + "rewards/progression_diversity/mean": -1.852334025898017e-05, + "rewards/progression_diversity/std": 0.0004191353218629956, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996093511581421, + "rewards/symbolic_reward_partial_score/std": 0.006243883166462183, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.070400595664978, + "sampling/importance_sampling_ratio/min": 0.0005534536903724074, + "sampling/sampling_logp_difference/max": 7.499332427978516, + "sampling/sampling_logp_difference/mean": 0.12200077623128891, + "step": 785 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3041597902774811, + "epoch": 1.2596153846153846, + "grad_norm": 0.01983814872801304, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.304287388920784, + "epoch": 1.2612179487179487, + "grad_norm": 0.0035897772759199142, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 787 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2998305410146713, + "epoch": 1.2628205128205128, + "grad_norm": 0.005747531540691853, + "learning_rate": 1e-06, + "loss": -0.0041, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11941.0, + "completions/max_terminated_length": 11941.0, + "completions/mean_length": 7136.01171875, + "completions/mean_terminated_length": 7136.01171875, + "completions/min_length": 2860.0, + "completions/min_terminated_length": 2860.0, + "entropy": 0.3156207203865051, + "epoch": 1.2644230769230769, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0063291979022324085, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 1029842004.0, + "reward": 0.8918652534484863, + "reward_std": 0.025776326656341553, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.9963216185569763, + "rewards/symbolic_reward_partial_score/std": 0.050049230456352234, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.072809100151062, + "sampling/importance_sampling_ratio/min": 0.001212042523548007, + "sampling/sampling_logp_difference/max": 6.715448379516602, + "sampling/sampling_logp_difference/mean": 0.12579187750816345, + "step": 789 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3099760115146637, + "epoch": 1.266025641025641, + "grad_norm": 0.006072794087231159, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 790 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.31477801501750946, + "epoch": 1.267628205128205, + "grad_norm": 0.005829320289194584, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 791 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.31682026386260986, + "epoch": 1.2692307692307692, + "grad_norm": 0.01667245663702488, + "learning_rate": 1e-06, + "loss": -0.0101, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15053.0, + "completions/max_terminated_length": 15053.0, + "completions/mean_length": 7788.76171875, + "completions/mean_terminated_length": 7788.76171875, + "completions/min_length": 2657.0, + "completions/min_terminated_length": 2657.0, + "entropy": 0.29877328872680664, + "epoch": 1.2708333333333333, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.027668794617056847, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 1034711162.0, + "reward": 0.8842675685882568, + "reward_std": 0.04109036177396774, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9765625, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.9944336414337158, + "rewards/symbolic_reward_partial_score/std": 0.06344074010848999, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068732500076294, + "sampling/importance_sampling_ratio/min": 0.003076533554121852, + "sampling/sampling_logp_difference/max": 5.783951759338379, + "sampling/sampling_logp_difference/mean": 0.11873342841863632, + "step": 793 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2952185273170471, + "epoch": 1.2724358974358974, + "grad_norm": 0.011909693479537964, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 794 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.29329799115657806, + "epoch": 1.2740384615384617, + "grad_norm": 0.008952183648943901, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 795 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29316699504852295, + "epoch": 1.2756410256410255, + "grad_norm": 0.03064666874706745, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14643.0, + "completions/max_terminated_length": 14643.0, + "completions/mean_length": 7695.822265625, + "completions/mean_terminated_length": 7695.822265625, + "completions/min_length": 3069.0, + "completions/min_terminated_length": 3069.0, + "entropy": 0.30592451989650726, + "epoch": 1.2772435897435899, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.008664174936711788, + "learning_rate": 1e-06, + "loss": -0.0229, + "num_tokens": 1039480127.0, + "reward": 0.8838672637939453, + "reward_std": 0.04563511162996292, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9891927242279053, + "rewards/symbolic_reward_partial_score/std": 0.09879883378744125, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071099042892456, + "sampling/importance_sampling_ratio/min": 0.0011856004130095243, + "sampling/sampling_logp_difference/max": 6.737505912780762, + "sampling/sampling_logp_difference/mean": 0.12261907756328583, + "step": 797 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30264943838119507, + "epoch": 1.2788461538461537, + "grad_norm": 0.020319325849413872, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 798 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.3089882731437683, + "epoch": 1.280448717948718, + "grad_norm": 0.013271676376461983, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 799 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30357275903224945, + "epoch": 1.282051282051282, + "grad_norm": 0.022130422294139862, + "learning_rate": 1e-06, + "loss": -0.0088, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12987.0, + "completions/mean_length": 7707.5078125, + "completions/mean_terminated_length": 7690.5283203125, + "completions/min_length": 2717.0, + "completions/min_terminated_length": 2717.0, + "entropy": 0.3025813400745392, + "epoch": 1.2836538461538463, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.009830499067902565, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 1044275923.0, + "reward": 0.8870117664337158, + "reward_std": 0.022976521402597427, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98046875, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, + "rewards/symbolic_reward_partial_score/std": 0.045904021710157394, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0703761577606201, + "sampling/importance_sampling_ratio/min": 0.0024872496724128723, + "sampling/sampling_logp_difference/max": 5.996577739715576, + "sampling/sampling_logp_difference/mean": 0.12130090594291687, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3026769459247589, + "epoch": 1.2852564102564101, + "grad_norm": 0.00772315775975585, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3021104037761688, + "epoch": 1.2868589743589745, + "grad_norm": 0.01086731068789959, + "learning_rate": 1e-06, + "loss": 0.0311, + "step": 803 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.299283891916275, + "epoch": 1.2884615384615383, + "grad_norm": 0.012979859486222267, + "learning_rate": 1e-06, + "loss": -0.0104, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 8625.98046875, + "completions/mean_terminated_length": 8625.98046875, + "completions/min_length": 3964.0, + "completions/min_terminated_length": 3964.0, + "entropy": 0.29120542109012604, + "epoch": 1.2900641025641026, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005270938854664564, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 1049633401.0, + "reward": 0.8940331935882568, + "reward_std": 0.02386718988418579, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9957356452941895, + "rewards/symbolic_reward_partial_score/std": 0.06268040835857391, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0678813457489014, + "sampling/importance_sampling_ratio/min": 0.0018456288380548358, + "sampling/sampling_logp_difference/max": 6.29493522644043, + "sampling/sampling_logp_difference/mean": 0.11715521663427353, + "step": 805 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29076600074768066, + "epoch": 1.2916666666666667, + "grad_norm": 0.024370063096284866, + "learning_rate": 1e-06, + "loss": -0.0091, + "step": 806 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2884713411331177, + "epoch": 1.2932692307692308, + "grad_norm": 0.005116340704262257, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 807 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.28749142587184906, + "epoch": 1.294871794871795, + "grad_norm": 0.005040435586124659, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16067.0, + "completions/mean_length": 8135.697265625, + "completions/mean_terminated_length": 8103.3515625, + "completions/min_length": 2674.0, + "completions/min_terminated_length": 2674.0, + "entropy": 0.2949254959821701, + "epoch": 1.296474358974359, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.008695917204022408, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 1054715534.0, + "reward": 0.8852832317352295, + "reward_std": 0.037169456481933594, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.978515625, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.9945638179779053, + "rewards/symbolic_reward_partial_score/std": 0.06339205801486969, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068376064300537, + "sampling/importance_sampling_ratio/min": 0.0008881228277459741, + "sampling/sampling_logp_difference/max": 7.026400566101074, + "sampling/sampling_logp_difference/mean": 0.11818452924489975, + "step": 809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2952611595392227, + "epoch": 1.2980769230769231, + "grad_norm": 0.015448620542883873, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 810 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2906734049320221, + "epoch": 1.2996794871794872, + "grad_norm": 0.01636776700615883, + "learning_rate": 1e-06, + "loss": -0.0312, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2910599410533905, + "epoch": 1.3012820512820513, + "grad_norm": 0.020219558849930763, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15860.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 8440.0234375, + "completions/mean_terminated_length": 8440.0234375, + "completions/min_length": 4233.0, + "completions/min_terminated_length": 4233.0, + "entropy": 0.29061928391456604, + "epoch": 1.3028846153846154, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006722176913172007, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 1059909306.0, + "reward": 0.8908935785293579, + "reward_std": 0.02562917396426201, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9969889521598816, + "rewards/symbolic_reward_partial_score/std": 0.045237038284540176, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0675506591796875, + "sampling/importance_sampling_ratio/min": 0.0014303690986707807, + "sampling/sampling_logp_difference/max": 6.549822807312012, + "sampling/sampling_logp_difference/mean": 0.11663618683815002, + "step": 813 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28442804515361786, + "epoch": 1.3044871794871795, + "grad_norm": 0.028145821765065193, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 814 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28922364115715027, + "epoch": 1.3060897435897436, + "grad_norm": 0.018946906551718712, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28923703730106354, + "epoch": 1.3076923076923077, + "grad_norm": 0.005810855887830257, + "learning_rate": 1e-06, + "loss": -0.0143, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15852.0, + "completions/mean_length": 8810.791015625, + "completions/mean_terminated_length": 8751.1591796875, + "completions/min_length": 4011.0, + "completions/min_terminated_length": 4011.0, + "entropy": 0.26773253083229065, + "epoch": 1.3092948717948718, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.03435671329498291, + "learning_rate": 1e-06, + "loss": 0.078, + "num_tokens": 1065399423.0, + "reward": 0.8867088556289673, + "reward_std": 0.053164899349212646, + "rewards/progression_diversity/mean": -2.084466177620925e-05, + "rewards/progression_diversity/std": 0.0004716608382295817, + "rewards/symbolic_reward_accuracy/mean": 0.982421875, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.9928059577941895, + "rewards/symbolic_reward_partial_score/std": 0.07733853906393051, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0633445978164673, + "sampling/importance_sampling_ratio/min": 7.085214019753039e-05, + "sampling/sampling_logp_difference/max": 9.554915428161621, + "sampling/sampling_logp_difference/mean": 0.10956880450248718, + "step": 817 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.26666541397571564, + "epoch": 1.310897435897436, + "grad_norm": 0.02670695073902607, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2719925343990326, + "epoch": 1.3125, + "grad_norm": 0.026015251874923706, + "learning_rate": 1e-06, + "loss": -0.0272, + "step": 819 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.26630181074142456, + "epoch": 1.314102564102564, + "grad_norm": 0.0247061625123024, + "learning_rate": 1e-06, + "loss": -0.0138, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15426.0, + "completions/mean_length": 7572.466796875, + "completions/mean_terminated_length": 7503.08447265625, + "completions/min_length": 2604.0, + "completions/min_terminated_length": 2604.0, + "entropy": 0.27331072092056274, + "epoch": 1.3157051282051282, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.025832634419202805, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 1070202158.0, + "reward": 0.8901264071464539, + "reward_std": 0.033305682241916656, + "rewards/progression_diversity/mean": -5.895454887649976e-05, + "rewards/progression_diversity/std": 0.0013339892029762268, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9970377683639526, + "rewards/symbolic_reward_partial_score/std": 0.04511844739317894, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0644633769989014, + "sampling/importance_sampling_ratio/min": 0.002112078480422497, + "sampling/sampling_logp_difference/max": 6.160082817077637, + "sampling/sampling_logp_difference/mean": 0.1119283139705658, + "step": 821 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.27660806477069855, + "epoch": 1.3173076923076923, + "grad_norm": 0.02830098383128643, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 822 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2817423641681671, + "epoch": 1.3189102564102564, + "grad_norm": 0.020086728036403656, + "learning_rate": 1e-06, + "loss": -0.0156, + "step": 823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.274202823638916, + "epoch": 1.3205128205128205, + "grad_norm": 0.023740731179714203, + "learning_rate": 1e-06, + "loss": 0.0539, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14339.0, + "completions/mean_length": 7367.796875, + "completions/mean_terminated_length": 7350.15234375, + "completions/min_length": 2376.0, + "completions/min_terminated_length": 2376.0, + "entropy": 0.2823334336280823, + "epoch": 1.3221153846153846, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.00718251708894968, + "learning_rate": 1e-06, + "loss": -0.0283, + "num_tokens": 1074893158.0, + "reward": 0.8895508050918579, + "reward_std": 0.031776778399944305, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, + "rewards/symbolic_reward_partial_score/std": 0.04641921818256378, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0652776956558228, + "sampling/importance_sampling_ratio/min": 4.965396328771021e-06, + "sampling/sampling_logp_difference/max": 12.213017463684082, + "sampling/sampling_logp_difference/mean": 0.11385728418827057, + "step": 825 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28175710141658783, + "epoch": 1.3237179487179487, + "grad_norm": 0.005745855160057545, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 826 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28442713618278503, + "epoch": 1.3253205128205128, + "grad_norm": 0.007170431315898895, + "learning_rate": 1e-06, + "loss": -0.017, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2808009684085846, + "epoch": 1.3269230769230769, + "grad_norm": 0.031063031405210495, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13142.0, + "completions/max_terminated_length": 13142.0, + "completions/mean_length": 6885.845703125, + "completions/mean_terminated_length": 6885.845703125, + "completions/min_length": 1827.0, + "completions/min_terminated_length": 1827.0, + "entropy": 0.2910860776901245, + "epoch": 1.328525641025641, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002486742567270994, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 1079341143.0, + "reward": 0.8982422351837158, + "reward_std": 0.00703125074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.998046875, + "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0668704509735107, + "sampling/importance_sampling_ratio/min": 0.0020832736045122147, + "sampling/sampling_logp_difference/max": 6.17381477355957, + "sampling/sampling_logp_difference/mean": 0.1167755201458931, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2902107387781143, + "epoch": 1.330128205128205, + "grad_norm": 0.002582802902907133, + "learning_rate": 1e-06, + "loss": -0.0052, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29032963514328003, + "epoch": 1.3317307692307692, + "grad_norm": 0.0026250346563756466, + "learning_rate": 1e-06, + "loss": -0.0057, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2910378873348236, + "epoch": 1.3333333333333333, + "grad_norm": 0.02715476043522358, + "learning_rate": 1e-06, + "loss": 0.0168, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12247.0, + "completions/max_terminated_length": 12247.0, + "completions/mean_length": 6663.154296875, + "completions/mean_terminated_length": 6663.154296875, + "completions/min_length": 2193.0, + "completions/min_terminated_length": 2193.0, + "entropy": 0.2914612591266632, + "epoch": 1.3349358974358974, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004975066985934973, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 1083612246.0, + "reward": 0.8934961557388306, + "reward_std": 0.021559644490480423, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, + "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066861629486084, + "sampling/importance_sampling_ratio/min": 0.0018572451081126928, + "sampling/sampling_logp_difference/max": 6.288661003112793, + "sampling/sampling_logp_difference/mean": 0.11649785935878754, + "step": 833 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2909384220838547, + "epoch": 1.3365384615384617, + "grad_norm": 0.00430506095290184, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 834 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2926383912563324, + "epoch": 1.3381410256410255, + "grad_norm": 0.0038858274929225445, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2900676280260086, + "epoch": 1.3397435897435899, + "grad_norm": 0.0030539829749614, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14258.0, + "completions/max_terminated_length": 14258.0, + "completions/mean_length": 6737.3671875, + "completions/mean_terminated_length": 6737.3671875, + "completions/min_length": 2386.0, + "completions/min_terminated_length": 2386.0, + "entropy": 0.2902229428291321, + "epoch": 1.3413461538461537, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.00329815992154181, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 1087951362.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804688394069672, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0671701431274414, + "sampling/importance_sampling_ratio/min": 0.002303540473803878, + "sampling/sampling_logp_difference/max": 6.073307991027832, + "sampling/sampling_logp_difference/mean": 0.11671570688486099, + "step": 837 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29137638211250305, + "epoch": 1.342948717948718, + "grad_norm": 0.003731071949005127, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 838 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.292741596698761, + "epoch": 1.344551282051282, + "grad_norm": 0.002997028874233365, + "learning_rate": 1e-06, + "loss": -0.008, + "step": 839 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2919839918613434, + "epoch": 1.3461538461538463, + "grad_norm": 0.023894716054201126, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14003.0, + "completions/max_terminated_length": 14003.0, + "completions/mean_length": 6641.5, + "completions/mean_terminated_length": 6641.5, + "completions/min_length": 2245.0, + "completions/min_terminated_length": 2245.0, + "entropy": 0.29364730417728424, + "epoch": 1.3477564102564101, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.006924602203071117, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 1092244098.0, + "reward": 0.8913965225219727, + "reward_std": 0.03441406041383743, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9986653327941895, + "rewards/symbolic_reward_partial_score/std": 0.011836127378046513, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0672986507415771, + "sampling/importance_sampling_ratio/min": 0.0001825806830311194, + "sampling/sampling_logp_difference/max": 8.608318328857422, + "sampling/sampling_logp_difference/mean": 0.11722898483276367, + "step": 841 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2935139685869217, + "epoch": 1.3493589743589745, + "grad_norm": 0.02281988225877285, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 842 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2943962812423706, + "epoch": 1.3509615384615383, + "grad_norm": 0.00623247679322958, + "learning_rate": 1e-06, + "loss": -0.0121, + "step": 843 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29411324858665466, + "epoch": 1.3525641025641026, + "grad_norm": 0.006375083699822426, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11405.0, + "completions/max_terminated_length": 11405.0, + "completions/mean_length": 7117.671875, + "completions/mean_terminated_length": 7117.671875, + "completions/min_length": 2860.0, + "completions/min_terminated_length": 2860.0, + "entropy": 0.29216840863227844, + "epoch": 1.3541666666666667, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.01883051171898842, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 1096780922.0, + "reward": 0.8945703506469727, + "reward_std": 0.0186243187636137, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9975260496139526, + "rewards/symbolic_reward_partial_score/std": 0.044694118201732635, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0677756071090698, + "sampling/importance_sampling_ratio/min": 0.0024964665062725544, + "sampling/sampling_logp_difference/max": 5.9928789138793945, + "sampling/sampling_logp_difference/mean": 0.11734534054994583, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29655005037784576, + "epoch": 1.3557692307692308, + "grad_norm": 0.004739716649055481, + "learning_rate": 1e-06, + "loss": -0.0133, + "step": 846 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29636050760746, + "epoch": 1.357371794871795, + "grad_norm": 0.004647532943636179, + "learning_rate": 1e-06, + "loss": -0.0138, + "step": 847 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2918502986431122, + "epoch": 1.358974358974359, + "grad_norm": 0.030562305822968483, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13102.0, + "completions/max_terminated_length": 13102.0, + "completions/mean_length": 7432.736328125, + "completions/mean_terminated_length": 7432.736328125, + "completions/min_length": 2530.0, + "completions/min_terminated_length": 2530.0, + "entropy": 0.30163420736789703, + "epoch": 1.3605769230769231, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0024499939754605293, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 1101412211.0, + "reward": 0.8987793326377869, + "reward_std": 0.004882812965661287, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0680015087127686, + "sampling/importance_sampling_ratio/min": 0.0015895604155957699, + "sampling/sampling_logp_difference/max": 6.444297790527344, + "sampling/sampling_logp_difference/mean": 0.11842922866344452, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29871611297130585, + "epoch": 1.3621794871794872, + "grad_norm": 0.002597298938781023, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 850 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2991296947002411, + "epoch": 1.3637820512820513, + "grad_norm": 0.02783992327749729, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2933713495731354, + "epoch": 1.3653846153846154, + "grad_norm": 0.0024502624291926622, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14793.0, + "completions/max_terminated_length": 14793.0, + "completions/mean_length": 7011.0234375, + "completions/mean_terminated_length": 7011.0234375, + "completions/min_length": 2970.0, + "completions/min_terminated_length": 2970.0, + "entropy": 0.31015750765800476, + "epoch": 1.3669871794871795, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0022398389410227537, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 1105784511.0, + "reward": 0.8987793326377869, + "reward_std": 0.004882812965661287, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706534385681152, + "sampling/importance_sampling_ratio/min": 0.0033097388222813606, + "sampling/sampling_logp_difference/max": 5.710886001586914, + "sampling/sampling_logp_difference/mean": 0.12256957590579987, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30707795917987823, + "epoch": 1.3685897435897436, + "grad_norm": 0.002265053801238537, + "learning_rate": 1e-06, + "loss": -0.0042, + "step": 854 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3124368190765381, + "epoch": 1.3701923076923077, + "grad_norm": 0.001633352367207408, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3110050559043884, + "epoch": 1.3717948717948718, + "grad_norm": 0.0022995418403297663, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14868.0, + "completions/max_terminated_length": 14868.0, + "completions/mean_length": 7329.43359375, + "completions/mean_terminated_length": 7329.43359375, + "completions/min_length": 2311.0, + "completions/min_terminated_length": 2311.0, + "entropy": 0.29253579676151276, + "epoch": 1.373397435897436, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005778716877102852, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 1110410605.0, + "reward": 0.8926172256469727, + "reward_std": 0.0187346413731575, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.998828113079071, + "rewards/symbolic_reward_partial_score/std": 0.011265556327998638, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066795825958252, + "sampling/importance_sampling_ratio/min": 0.003194189630448818, + "sampling/sampling_logp_difference/max": 5.746421813964844, + "sampling/sampling_logp_difference/mean": 0.11596965044736862, + "step": 857 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.29146991670131683, + "epoch": 1.375, + "grad_norm": 0.005454590544104576, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2908618450164795, + "epoch": 1.376602564102564, + "grad_norm": 0.014141952618956566, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 859 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2932679355144501, + "epoch": 1.3782051282051282, + "grad_norm": 0.012503272853791714, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11966.0, + "completions/max_terminated_length": 11966.0, + "completions/mean_length": 7037.12109375, + "completions/mean_terminated_length": 7037.12109375, + "completions/min_length": 3731.0, + "completions/min_terminated_length": 3731.0, + "entropy": 0.30436907708644867, + "epoch": 1.3798076923076923, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005315630231052637, + "learning_rate": 1e-06, + "loss": -0.0167, + "num_tokens": 1114885243.0, + "reward": 0.8938672542572021, + "reward_std": 0.018317628651857376, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9990885257720947, + "rewards/symbolic_reward_partial_score/std": 0.009222573600709438, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696780681610107, + "sampling/importance_sampling_ratio/min": 0.0011150204809382558, + "sampling/sampling_logp_difference/max": 6.798882484436035, + "sampling/sampling_logp_difference/mean": 0.1209033951163292, + "step": 861 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30297212302684784, + "epoch": 1.3814102564102564, + "grad_norm": 0.01675940863788128, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 862 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.304256334900856, + "epoch": 1.3830128205128205, + "grad_norm": 0.016471805050969124, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30140577256679535, + "epoch": 1.3846153846153846, + "grad_norm": 0.017290934920310974, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14854.0, + "completions/max_terminated_length": 14854.0, + "completions/mean_length": 7528.865234375, + "completions/mean_terminated_length": 7528.865234375, + "completions/min_length": 2487.0, + "completions/min_terminated_length": 2487.0, + "entropy": 0.29779052734375, + "epoch": 1.3862179487179487, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.016191750764846802, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 1119569030.0, + "reward": 0.8950879573822021, + "reward_std": 0.012819098308682442, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9992513060569763, + "rewards/symbolic_reward_partial_score/std": 0.008469752036035061, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06855046749115, + "sampling/importance_sampling_ratio/min": 0.0019925027154386044, + "sampling/sampling_logp_difference/max": 6.2183637619018555, + "sampling/sampling_logp_difference/mean": 0.11855703592300415, + "step": 865 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2984299212694168, + "epoch": 1.3878205128205128, + "grad_norm": 0.004113992676138878, + "learning_rate": 1e-06, + "loss": -0.0144, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29633837938308716, + "epoch": 1.3894230769230769, + "grad_norm": 0.004194690380245447, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2999422550201416, + "epoch": 1.391025641025641, + "grad_norm": 0.005653866101056337, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13589.0, + "completions/max_terminated_length": 13589.0, + "completions/mean_length": 7836.474609375, + "completions/mean_terminated_length": 7836.474609375, + "completions/min_length": 3547.0, + "completions/min_terminated_length": 3547.0, + "entropy": 0.2959325313568115, + "epoch": 1.392628205128205, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.031015831977128983, + "learning_rate": 1e-06, + "loss": 0.0285, + "num_tokens": 1124527769.0, + "reward": 0.8963379263877869, + "reward_std": 0.0146484375, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067394733428955, + "sampling/importance_sampling_ratio/min": 0.002571824938058853, + "sampling/sampling_logp_difference/max": 5.963139533996582, + "sampling/sampling_logp_difference/mean": 0.11667640507221222, + "step": 869 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2887873947620392, + "epoch": 1.3942307692307692, + "grad_norm": 0.00414247065782547, + "learning_rate": 1e-06, + "loss": -0.0135, + "step": 870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2930924892425537, + "epoch": 1.3958333333333333, + "grad_norm": 0.003337219590321183, + "learning_rate": 1e-06, + "loss": -0.0123, + "step": 871 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29419972002506256, + "epoch": 1.3974358974358974, + "grad_norm": 0.004136944655328989, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12698.0, + "completions/max_terminated_length": 12698.0, + "completions/mean_length": 7294.765625, + "completions/mean_terminated_length": 7294.765625, + "completions/min_length": 3576.0, + "completions/min_terminated_length": 3576.0, + "entropy": 0.29781144857406616, + "epoch": 1.3990384615384617, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.025357889011502266, + "learning_rate": 1e-06, + "loss": 0.0257, + "num_tokens": 1129125905.0, + "reward": 0.8950977325439453, + "reward_std": 0.016514942049980164, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9992839097976685, + "rewards/symbolic_reward_partial_score/std": 0.00811202172189951, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0683658123016357, + "sampling/importance_sampling_ratio/min": 0.00021021001157350838, + "sampling/sampling_logp_difference/max": 8.467403411865234, + "sampling/sampling_logp_difference/mean": 0.11810927093029022, + "step": 873 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2955755293369293, + "epoch": 1.4006410256410255, + "grad_norm": 0.004411085043102503, + "learning_rate": 1e-06, + "loss": -0.013, + "step": 874 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29726772010326385, + "epoch": 1.4022435897435899, + "grad_norm": 0.026978353038430214, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.29815293848514557, + "epoch": 1.4038461538461537, + "grad_norm": 0.003415036480873823, + "learning_rate": 1e-06, + "loss": -0.0142, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14252.0, + "completions/max_terminated_length": 14252.0, + "completions/mean_length": 7402.025390625, + "completions/mean_terminated_length": 7402.025390625, + "completions/min_length": 3103.0, + "completions/min_terminated_length": 3103.0, + "entropy": 0.2916039377450943, + "epoch": 1.405448717948718, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003685388946905732, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 1133740766.0, + "reward": 0.8975586295127869, + "reward_std": 0.009765625, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, + "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066849946975708, + "sampling/importance_sampling_ratio/min": 0.0012651029974222183, + "sampling/sampling_logp_difference/max": 6.672601699829102, + "sampling/sampling_logp_difference/mean": 0.11604957282543182, + "step": 877 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2909359931945801, + "epoch": 1.407051282051282, + "grad_norm": 0.0035066467244178057, + "learning_rate": 1e-06, + "loss": -0.0093, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2898210138082504, + "epoch": 1.4086538461538463, + "grad_norm": 0.0033086894545704126, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 879 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29252733290195465, + "epoch": 1.4102564102564101, + "grad_norm": 0.02426670864224434, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15130.0, + "completions/max_terminated_length": 15130.0, + "completions/mean_length": 7525.994140625, + "completions/mean_terminated_length": 7525.994140625, + "completions/min_length": 2851.0, + "completions/min_terminated_length": 2851.0, + "entropy": 0.28987959027290344, + "epoch": 1.4118589743589745, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.007168716751039028, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 1138505355.0, + "reward": 0.8906446099281311, + "reward_std": 0.034327443689107895, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9961588382720947, + "rewards/symbolic_reward_partial_score/std": 0.0501725897192955, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065748691558838, + "sampling/importance_sampling_ratio/min": 0.003183236112818122, + "sampling/sampling_logp_difference/max": 5.749856948852539, + "sampling/sampling_logp_difference/mean": 0.11428984254598618, + "step": 881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2841818630695343, + "epoch": 1.4134615384615383, + "grad_norm": 0.026367392390966415, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 882 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2883693128824234, + "epoch": 1.4150641025641026, + "grad_norm": 0.019833385944366455, + "learning_rate": 1e-06, + "loss": 0.0391, + "step": 883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28537461161613464, + "epoch": 1.4166666666666667, + "grad_norm": 0.00673852302134037, + "learning_rate": 1e-06, + "loss": -0.0287, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14382.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 7235.87109375, + "completions/mean_terminated_length": 7235.87109375, + "completions/min_length": 3265.0, + "completions/min_terminated_length": 3265.0, + "entropy": 0.29649393260478973, + "epoch": 1.4182692307692308, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0026662563905119896, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 1143070409.0, + "reward": 0.898681640625, + "reward_std": 0.0052734375931322575, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068642497062683, + "sampling/importance_sampling_ratio/min": 0.00319400685839355, + "sampling/sampling_logp_difference/max": 5.746479034423828, + "sampling/sampling_logp_difference/mean": 0.11909361928701401, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3017093986272812, + "epoch": 1.419871794871795, + "grad_norm": 0.0022972896695137024, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30264417827129364, + "epoch": 1.421474358974359, + "grad_norm": 0.0025393194518983364, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 887 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2955441474914551, + "epoch": 1.4230769230769231, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12235.0, + "completions/max_terminated_length": 12235.0, + "completions/mean_length": 7418.005859375, + "completions/mean_terminated_length": 7418.005859375, + "completions/min_length": 2399.0, + "completions/min_terminated_length": 2399.0, + "entropy": 0.29583777487277985, + "epoch": 1.4246794871794872, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005485072731971741, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 1147788892.0, + "reward": 0.8933594226837158, + "reward_std": 0.02346806786954403, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, + "rewards/symbolic_reward_partial_score/std": 0.04477177560329437, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0678550004959106, + "sampling/importance_sampling_ratio/min": 2.4652272259118035e-05, + "sampling/sampling_logp_difference/max": 10.610641479492188, + "sampling/sampling_logp_difference/mean": 0.11751426756381989, + "step": 889 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2936114966869354, + "epoch": 1.4262820512820513, + "grad_norm": 0.019580816850066185, + "learning_rate": 1e-06, + "loss": -0.0057, + "step": 890 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2942151129245758, + "epoch": 1.4278846153846154, + "grad_norm": 0.04152410104870796, + "learning_rate": 1e-06, + "loss": 0.0524, + "step": 891 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.298058420419693, + "epoch": 1.4294871794871795, + "grad_norm": 0.005063659977167845, + "learning_rate": 1e-06, + "loss": -0.0209, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13606.0, + "completions/mean_length": 7480.78515625, + "completions/mean_terminated_length": 7445.87109375, + "completions/min_length": 2894.0, + "completions/min_terminated_length": 2894.0, + "entropy": 0.2844327837228775, + "epoch": 1.4310897435897436, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.0049340990372002125, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 1152608126.0, + "reward": 0.8936182260513306, + "reward_std": 0.0255273450165987, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.995654284954071, + "rewards/symbolic_reward_partial_score/std": 0.06281018257141113, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065583348274231, + "sampling/importance_sampling_ratio/min": 0.0014057016232982278, + "sampling/sampling_logp_difference/max": 6.567218780517578, + "sampling/sampling_logp_difference/mean": 0.11391088366508484, + "step": 893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2806769758462906, + "epoch": 1.4326923076923077, + "grad_norm": 0.040655478835105896, + "learning_rate": 1e-06, + "loss": 0.0593, + "step": 894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2880139797925949, + "epoch": 1.4342948717948718, + "grad_norm": 0.004034819081425667, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 895 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28479818999767303, + "epoch": 1.435897435897436, + "grad_norm": 0.004762680269777775, + "learning_rate": 1e-06, + "loss": -0.0189, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12721.0, + "completions/max_terminated_length": 12721.0, + "completions/mean_length": 7157.248046875, + "completions/mean_terminated_length": 7157.248046875, + "completions/min_length": 1919.0, + "completions/min_terminated_length": 1919.0, + "entropy": 0.29515235126018524, + "epoch": 1.4375, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.002855000551789999, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 1157202189.0, + "reward": 0.8975586295127869, + "reward_std": 0.009765625, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, + "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067905306816101, + "sampling/importance_sampling_ratio/min": 0.0016314449021592736, + "sampling/sampling_logp_difference/max": 6.4182891845703125, + "sampling/sampling_logp_difference/mean": 0.11758951842784882, + "step": 897 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2956536114215851, + "epoch": 1.439102564102564, + "grad_norm": 0.001475250581279397, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 898 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29977406561374664, + "epoch": 1.4407051282051282, + "grad_norm": 0.0027920929715037346, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.294843852519989, + "epoch": 1.4423076923076923, + "grad_norm": 0.0032186834141612053, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11521.0, + "completions/max_terminated_length": 11521.0, + "completions/mean_length": 6777.044921875, + "completions/mean_terminated_length": 6777.044921875, + "completions/min_length": 2950.0, + "completions/min_terminated_length": 2950.0, + "entropy": 0.3089780658483505, + "epoch": 1.4439102564102564, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0031850591767579317, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 1161592100.0, + "reward": 0.8970215320587158, + "reward_std": 0.008288709446787834, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9978841543197632, + "rewards/symbolic_reward_partial_score/std": 0.04434017464518547, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706193447113037, + "sampling/importance_sampling_ratio/min": 0.001069498248398304, + "sampling/sampling_logp_difference/max": 6.8405656814575195, + "sampling/sampling_logp_difference/mean": 0.12223009765148163, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3052385300397873, + "epoch": 1.4455128205128205, + "grad_norm": 0.002852542558684945, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 902 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3105472922325134, + "epoch": 1.4471153846153846, + "grad_norm": 0.002079383237287402, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3094532638788223, + "epoch": 1.4487179487179487, + "grad_norm": 0.003387544071301818, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13264.0, + "completions/max_terminated_length": 13264.0, + "completions/mean_length": 7214.5078125, + "completions/mean_terminated_length": 7214.5078125, + "completions/min_length": 3526.0, + "completions/min_terminated_length": 3526.0, + "entropy": 0.3008831590414047, + "epoch": 1.4503205128205128, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006545086856931448, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 1166184456.0, + "reward": 0.890869140625, + "reward_std": 0.026565231382846832, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, + "rewards/symbolic_reward_partial_score/std": 0.04549367353320122, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0693827867507935, + "sampling/importance_sampling_ratio/min": 0.0020341959316283464, + "sampling/sampling_logp_difference/max": 6.197654724121094, + "sampling/sampling_logp_difference/mean": 0.11990463733673096, + "step": 905 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3034283220767975, + "epoch": 1.4519230769230769, + "grad_norm": 0.025849303230643272, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 906 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2999170422554016, + "epoch": 1.453525641025641, + "grad_norm": 0.005960631184279919, + "learning_rate": 1e-06, + "loss": -0.0245, + "step": 907 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30265505611896515, + "epoch": 1.455128205128205, + "grad_norm": 0.018048452213406563, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13220.0, + "completions/max_terminated_length": 13220.0, + "completions/mean_length": 6950.375, + "completions/mean_terminated_length": 6950.375, + "completions/min_length": 2652.0, + "completions/min_terminated_length": 2652.0, + "entropy": 0.301173597574234, + "epoch": 1.4567307692307692, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.006944697350263596, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 1170566440.0, + "reward": 0.8882812857627869, + "reward_std": 0.040093328803777695, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.984375, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.9921875, + "rewards/symbolic_reward_partial_score/std": 0.08017498254776001, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0687031745910645, + "sampling/importance_sampling_ratio/min": 0.00040110392728820443, + "sampling/sampling_logp_difference/max": 7.821290016174316, + "sampling/sampling_logp_difference/mean": 0.11923378705978394, + "step": 909 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2987861931324005, + "epoch": 1.4583333333333333, + "grad_norm": 0.020350439473986626, + "learning_rate": 1e-06, + "loss": 0.0218, + "step": 910 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2985246926546097, + "epoch": 1.4599358974358974, + "grad_norm": 0.030750460922718048, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 911 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2979954779148102, + "epoch": 1.4615384615384617, + "grad_norm": 0.01273882295936346, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11232.0, + "completions/max_terminated_length": 11232.0, + "completions/mean_length": 6890.78125, + "completions/mean_terminated_length": 6890.78125, + "completions/min_length": 2828.0, + "completions/min_terminated_length": 2828.0, + "entropy": 0.29950951039791107, + "epoch": 1.4631410256410255, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.004619269166141748, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 1174936248.0, + "reward": 0.8934961557388306, + "reward_std": 0.02601562812924385, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, + "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0690256357192993, + "sampling/importance_sampling_ratio/min": 0.0026924286503344774, + "sampling/sampling_logp_difference/max": 5.917311668395996, + "sampling/sampling_logp_difference/mean": 0.1194806694984436, + "step": 913 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3017955720424652, + "epoch": 1.4647435897435899, + "grad_norm": 0.023214569315314293, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 914 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.30300796031951904, + "epoch": 1.4663461538461537, + "grad_norm": 0.004014391452074051, + "learning_rate": 1e-06, + "loss": -0.0151, + "step": 915 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3013370633125305, + "epoch": 1.467948717948718, + "grad_norm": 0.004451179411262274, + "learning_rate": 1e-06, + "loss": -0.0157, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11576.0, + "completions/max_terminated_length": 11576.0, + "completions/mean_length": 6749.005859375, + "completions/mean_terminated_length": 6749.005859375, + "completions/min_length": 3037.0, + "completions/min_terminated_length": 3037.0, + "entropy": 0.305722251534462, + "epoch": 1.469551282051282, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0044407895766198635, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 1179280027.0, + "reward": 0.89404296875, + "reward_std": 0.02382812649011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, + "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0697790384292603, + "sampling/importance_sampling_ratio/min": 0.0032580127008259296, + "sampling/sampling_logp_difference/max": 5.726637840270996, + "sampling/sampling_logp_difference/mean": 0.12138458341360092, + "step": 917 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3042749762535095, + "epoch": 1.4711538461538463, + "grad_norm": 0.004154628608375788, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 918 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3077068626880646, + "epoch": 1.4727564102564101, + "grad_norm": 0.021867236122488976, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 919 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30803389847278595, + "epoch": 1.4743589743589745, + "grad_norm": 0.003819097066298127, + "learning_rate": 1e-06, + "loss": -0.0163, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11396.0, + "completions/max_terminated_length": 11396.0, + "completions/mean_length": 6802.4296875, + "completions/mean_terminated_length": 6802.4296875, + "completions/min_length": 2662.0, + "completions/min_terminated_length": 2662.0, + "entropy": 0.31033851206302643, + "epoch": 1.4759615384615383, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.004083678126335144, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 1183613159.0, + "reward": 0.8967773914337158, + "reward_std": 0.008847462944686413, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9970703125, + "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0699421167373657, + "sampling/importance_sampling_ratio/min": 0.002445993945002556, + "sampling/sampling_logp_difference/max": 6.013303756713867, + "sampling/sampling_logp_difference/mean": 0.12115398049354553, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30309993028640747, + "epoch": 1.4775641025641026, + "grad_norm": 0.015554931946098804, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.307483434677124, + "epoch": 1.4791666666666667, + "grad_norm": 0.0038423407822847366, + "learning_rate": 1e-06, + "loss": -0.0062, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30638065934181213, + "epoch": 1.4807692307692308, + "grad_norm": 0.0036329433787614107, + "learning_rate": 1e-06, + "loss": -0.0063, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12225.0, + "completions/max_terminated_length": 12225.0, + "completions/mean_length": 6971.83984375, + "completions/mean_terminated_length": 6971.83984375, + "completions/min_length": 3684.0, + "completions/min_terminated_length": 3684.0, + "entropy": 0.3044200539588928, + "epoch": 1.482371794871795, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.00529886968433857, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 1188058277.0, + "reward": 0.8922510147094727, + "reward_std": 0.03099609725177288, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.993701159954071, + "rewards/symbolic_reward_partial_score/std": 0.076689213514328, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0690126419067383, + "sampling/importance_sampling_ratio/min": 0.002685052575543523, + "sampling/sampling_logp_difference/max": 5.920054912567139, + "sampling/sampling_logp_difference/mean": 0.11991654336452484, + "step": 925 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30244874954223633, + "epoch": 1.483974358974359, + "grad_norm": 0.03052562102675438, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 926 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3017626106739044, + "epoch": 1.4855769230769231, + "grad_norm": 0.005101877264678478, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 927 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.30204281210899353, + "epoch": 1.4871794871794872, + "grad_norm": 0.004146016668528318, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12020.0, + "completions/mean_length": 6972.96484375, + "completions/mean_terminated_length": 6954.5478515625, + "completions/min_length": 2702.0, + "completions/min_terminated_length": 2702.0, + "entropy": 0.3108600080013275, + "epoch": 1.4887820512820513, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004198870155960321, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 1192476643.0, + "reward": 0.8955957293510437, + "reward_std": 0.01684584841132164, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9976887702941895, + "rewards/symbolic_reward_partial_score/std": 0.0445505827665329, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0708767175674438, + "sampling/importance_sampling_ratio/min": 0.0016421438194811344, + "sampling/sampling_logp_difference/max": 6.411752700805664, + "sampling/sampling_logp_difference/mean": 0.12295570969581604, + "step": 929 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3109143078327179, + "epoch": 1.4903846153846154, + "grad_norm": 0.004327795933932066, + "learning_rate": 1e-06, + "loss": -0.009, + "step": 930 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3126075565814972, + "epoch": 1.4919871794871795, + "grad_norm": 0.003698945278301835, + "learning_rate": 1e-06, + "loss": -0.0132, + "step": 931 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31349894404411316, + "epoch": 1.4935897435897436, + "grad_norm": 0.0040559847839176655, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12509.0, + "completions/max_terminated_length": 12509.0, + "completions/mean_length": 6793.474609375, + "completions/mean_terminated_length": 6793.474609375, + "completions/min_length": 3194.0, + "completions/min_terminated_length": 3194.0, + "entropy": 0.30804505944252014, + "epoch": 1.4951923076923077, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.028041956946253777, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 1196736406.0, + "reward": 0.8957666754722595, + "reward_std": 0.016933593899011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.997607409954071, + "rewards/symbolic_reward_partial_score/std": 0.04473654553294182, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07077956199646, + "sampling/importance_sampling_ratio/min": 0.0015226604882627726, + "sampling/sampling_logp_difference/max": 6.487296104431152, + "sampling/sampling_logp_difference/mean": 0.1224517673254013, + "step": 933 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30982083082199097, + "epoch": 1.4967948717948718, + "grad_norm": 0.027506569400429726, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 934 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.31032316386699677, + "epoch": 1.498397435897436, + "grad_norm": 0.0033966144546866417, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 935 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30926352739334106, + "epoch": 1.5, + "grad_norm": 0.024733208119869232, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11656.0, + "completions/max_terminated_length": 11656.0, + "completions/mean_length": 6906.4140625, + "completions/mean_terminated_length": 6906.4140625, + "completions/min_length": 3048.0, + "completions/min_terminated_length": 3048.0, + "entropy": 0.3099443316459656, + "epoch": 1.501602564102564, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.025104807689785957, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 1201122746.0, + "reward": 0.8987793326377869, + "reward_std": 0.0048828125, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07002854347229, + "sampling/importance_sampling_ratio/min": 0.0023560093250125647, + "sampling/sampling_logp_difference/max": 6.050786018371582, + "sampling/sampling_logp_difference/mean": 0.12201914191246033, + "step": 937 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30576586723327637, + "epoch": 1.5032051282051282, + "grad_norm": 0.0017299925675615668, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 938 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30867621302604675, + "epoch": 1.5048076923076923, + "grad_norm": 0.0017178656999021769, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 939 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31071707606315613, + "epoch": 1.5064102564102564, + "grad_norm": 0.0020263735204935074, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12147.0, + "completions/max_terminated_length": 12147.0, + "completions/mean_length": 7048.5234375, + "completions/mean_terminated_length": 7048.5234375, + "completions/min_length": 2555.0, + "completions/min_terminated_length": 2555.0, + "entropy": 0.30272747576236725, + "epoch": 1.5080128205128205, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0034703791607171297, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 1205583014.0, + "reward": 0.8975098133087158, + "reward_std": 0.00996093824505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.008228649385273457, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0694454908370972, + "sampling/importance_sampling_ratio/min": 0.0011049419408664107, + "sampling/sampling_logp_difference/max": 6.807962417602539, + "sampling/sampling_logp_difference/mean": 0.12039151787757874, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3034730553627014, + "epoch": 1.5096153846153846, + "grad_norm": 0.003563554957509041, + "learning_rate": 1e-06, + "loss": -0.0098, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30564577877521515, + "epoch": 1.5112179487179487, + "grad_norm": 0.0264495387673378, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3048667460680008, + "epoch": 1.5128205128205128, + "grad_norm": 0.028183914721012115, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11975.0, + "completions/max_terminated_length": 11975.0, + "completions/mean_length": 7280.6484375, + "completions/mean_terminated_length": 7280.6484375, + "completions/min_length": 2948.0, + "completions/min_terminated_length": 2948.0, + "entropy": 0.2988281399011612, + "epoch": 1.5144230769230769, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004710206296294928, + "learning_rate": 1e-06, + "loss": -0.0147, + "num_tokens": 1210179298.0, + "reward": 0.8951172232627869, + "reward_std": 0.01643681712448597, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, + "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0679197311401367, + "sampling/importance_sampling_ratio/min": 0.0007556354394182563, + "sampling/sampling_logp_difference/max": 7.187951564788818, + "sampling/sampling_logp_difference/mean": 0.11771701276302338, + "step": 945 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2950664162635803, + "epoch": 1.516025641025641, + "grad_norm": 0.00515699153766036, + "learning_rate": 1e-06, + "loss": 0.032, + "step": 946 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29714447259902954, + "epoch": 1.5176282051282053, + "grad_norm": 0.004459913820028305, + "learning_rate": 1e-06, + "loss": -0.0161, + "step": 947 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.297834575176239, + "epoch": 1.5192307692307692, + "grad_norm": 0.00498380558565259, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12741.0, + "completions/max_terminated_length": 12741.0, + "completions/mean_length": 7402.451171875, + "completions/mean_terminated_length": 7402.451171875, + "completions/min_length": 3591.0, + "completions/min_terminated_length": 3591.0, + "entropy": 0.296127051115036, + "epoch": 1.5208333333333335, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.004056466277688742, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 1214899529.0, + "reward": 0.8970117568969727, + "reward_std": 0.008309577591717243, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.997851550579071, + "rewards/symbolic_reward_partial_score/std": 0.0444059856235981, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068399429321289, + "sampling/importance_sampling_ratio/min": 0.0013273690128698945, + "sampling/sampling_logp_difference/max": 6.624556541442871, + "sampling/sampling_logp_difference/mean": 0.11876702308654785, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30057577788829803, + "epoch": 1.5224358974358974, + "grad_norm": 0.003635054687038064, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3005140721797943, + "epoch": 1.5240384615384617, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30061452090740204, + "epoch": 1.5256410256410255, + "grad_norm": 0.014728136360645294, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12208.0, + "completions/max_terminated_length": 12208.0, + "completions/mean_length": 7126.783203125, + "completions/mean_terminated_length": 7126.783203125, + "completions/min_length": 2316.0, + "completions/min_terminated_length": 2316.0, + "entropy": 0.30513474345207214, + "epoch": 1.5272435897435899, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002268366049975157, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 1219390618.0, + "reward": 0.8987793326377869, + "reward_std": 0.004882812965661287, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0697507858276367, + "sampling/importance_sampling_ratio/min": 0.0018025912577286363, + "sampling/sampling_logp_difference/max": 6.318530082702637, + "sampling/sampling_logp_difference/mean": 0.12075302749872208, + "step": 953 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30484290421009064, + "epoch": 1.5288461538461537, + "grad_norm": 0.0017726526129990816, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30700714886188507, + "epoch": 1.530448717948718, + "grad_norm": 0.002408325904980302, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 955 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30840322375297546, + "epoch": 1.532051282051282, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13056.0, + "completions/mean_length": 7702.8828125, + "completions/mean_terminated_length": 7685.89404296875, + "completions/min_length": 3550.0, + "completions/min_terminated_length": 3550.0, + "entropy": 0.3104442358016968, + "epoch": 1.5336538461538463, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.004158318508416414, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 1224167902.0, + "reward": 0.8956055045127869, + "reward_std": 0.01448369212448597, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, + "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701613426208496, + "sampling/importance_sampling_ratio/min": 1.4257681701934644e-09, + "sampling/sampling_logp_difference/max": 20.368555068969727, + "sampling/sampling_logp_difference/mean": 0.12146088480949402, + "step": 957 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3077358156442642, + "epoch": 1.5352564102564101, + "grad_norm": 0.004114874638617039, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 958 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30623532831668854, + "epoch": 1.5368589743589745, + "grad_norm": 0.001660304144024849, + "learning_rate": 1e-06, + "loss": -0.0103, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30911290645599365, + "epoch": 1.5384615384615383, + "grad_norm": 0.003970026969909668, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12576.0, + "completions/mean_length": 7168.396484375, + "completions/mean_terminated_length": 7150.36181640625, + "completions/min_length": 2494.0, + "completions/min_terminated_length": 2494.0, + "entropy": 0.30951225757598877, + "epoch": 1.5400641025641026, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.03025236912071705, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 1228666761.0, + "reward": 0.8961328268051147, + "reward_std": 0.015468751080334187, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9994791746139526, + "rewards/symbolic_reward_partial_score/std": 0.006817440502345562, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0703372955322266, + "sampling/importance_sampling_ratio/min": 0.0028708188328891993, + "sampling/sampling_logp_difference/max": 5.853157997131348, + "sampling/sampling_logp_difference/mean": 0.12191639095544815, + "step": 961 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3105301558971405, + "epoch": 1.5416666666666665, + "grad_norm": 0.00395960221067071, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 962 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30917251110076904, + "epoch": 1.5432692307692308, + "grad_norm": 0.003832736052572727, + "learning_rate": 1e-06, + "loss": -0.0135, + "step": 963 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3083617687225342, + "epoch": 1.5448717948717947, + "grad_norm": 0.003537870245054364, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12432.0, + "completions/max_terminated_length": 12432.0, + "completions/mean_length": 7791.91796875, + "completions/mean_terminated_length": 7791.91796875, + "completions/min_length": 4424.0, + "completions/min_terminated_length": 4424.0, + "entropy": 0.2928401529788971, + "epoch": 1.546474358974359, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005985740572214127, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 1233547311.0, + "reward": 0.89306640625, + "reward_std": 0.02463994361460209, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, + "rewards/symbolic_reward_partial_score/std": 0.05014854669570923, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0672111511230469, + "sampling/importance_sampling_ratio/min": 0.0024554759729653597, + "sampling/sampling_logp_difference/max": 6.009434700012207, + "sampling/sampling_logp_difference/mean": 0.11611609160900116, + "step": 965 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29419833421707153, + "epoch": 1.5480769230769231, + "grad_norm": 0.019973278045654297, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 966 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2934655100107193, + "epoch": 1.5496794871794872, + "grad_norm": 0.00576686579734087, + "learning_rate": 1e-06, + "loss": -0.0064, + "step": 967 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.29317569732666016, + "epoch": 1.5512820512820513, + "grad_norm": 0.03426339477300644, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12050.0, + "completions/max_terminated_length": 12050.0, + "completions/mean_length": 7161.708984375, + "completions/mean_terminated_length": 7161.708984375, + "completions/min_length": 2797.0, + "completions/min_terminated_length": 2797.0, + "entropy": 0.3102886974811554, + "epoch": 1.5528846153846154, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0047947121784091, + "learning_rate": 1e-06, + "loss": -0.0141, + "num_tokens": 1238009850.0, + "reward": 0.8939453363418579, + "reward_std": 0.021000541746616364, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, + "rewards/symbolic_reward_partial_score/std": 0.06325981765985489, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706758499145508, + "sampling/importance_sampling_ratio/min": 0.0009838519617915154, + "sampling/sampling_logp_difference/max": 6.92403507232666, + "sampling/sampling_logp_difference/mean": 0.12199265509843826, + "step": 969 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.31009647250175476, + "epoch": 1.5544871794871795, + "grad_norm": 0.02185901068150997, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 970 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30998343229293823, + "epoch": 1.5560897435897436, + "grad_norm": 0.005330664571374655, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 971 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30864378809928894, + "epoch": 1.5576923076923077, + "grad_norm": 0.004321873653680086, + "learning_rate": 1e-06, + "loss": -0.0145, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12500.0, + "completions/max_terminated_length": 12500.0, + "completions/mean_length": 7311.296875, + "completions/mean_terminated_length": 7311.296875, + "completions/min_length": 2664.0, + "completions/min_terminated_length": 2664.0, + "entropy": 0.2941237837076187, + "epoch": 1.5592948717948718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1242603026.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0677077770233154, + "sampling/importance_sampling_ratio/min": 0.002588698174804449, + "sampling/sampling_logp_difference/max": 5.956600189208984, + "sampling/sampling_logp_difference/mean": 0.11709814518690109, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29679691791534424, + "epoch": 1.560897435897436, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2933846265077591, + "epoch": 1.5625, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29794517159461975, + "epoch": 1.564102564102564, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12324.0, + "completions/max_terminated_length": 12324.0, + "completions/mean_length": 7672.275390625, + "completions/mean_terminated_length": 7672.275390625, + "completions/min_length": 2624.0, + "completions/min_terminated_length": 2624.0, + "entropy": 0.2908579856157303, + "epoch": 1.5657051282051282, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002392068738117814, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 1247454351.0, + "reward": 0.8982422351837158, + "reward_std": 0.00703125074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.998046875, + "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.066673994064331, + "sampling/importance_sampling_ratio/min": 0.0019470222759991884, + "sampling/sampling_logp_difference/max": 6.241454124450684, + "sampling/sampling_logp_difference/mean": 0.1153443232178688, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.28915299475193024, + "epoch": 1.5673076923076923, + "grad_norm": 0.0020979184191673994, + "learning_rate": 1e-06, + "loss": -0.0041, + "step": 978 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29180823266506195, + "epoch": 1.5689102564102564, + "grad_norm": 0.0016745133325457573, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2878534197807312, + "epoch": 1.5705128205128205, + "grad_norm": 0.023565437644720078, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13424.0, + "completions/max_terminated_length": 13424.0, + "completions/mean_length": 7492.591796875, + "completions/mean_terminated_length": 7492.591796875, + "completions/min_length": 3296.0, + "completions/min_terminated_length": 3296.0, + "entropy": 0.302181601524353, + "epoch": 1.5721153846153846, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.004688016138970852, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 1252190958.0, + "reward": 0.8957812786102295, + "reward_std": 0.01375581230968237, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9976562261581421, + "rewards/symbolic_reward_partial_score/std": 0.044615939259529114, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0691838264465332, + "sampling/importance_sampling_ratio/min": 0.0014014577027410269, + "sampling/sampling_logp_difference/max": 6.570242404937744, + "sampling/sampling_logp_difference/mean": 0.11948972940444946, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30064649879932404, + "epoch": 1.5737179487179487, + "grad_norm": 0.004267314448952675, + "learning_rate": 1e-06, + "loss": -0.0106, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30243119597435, + "epoch": 1.5753205128205128, + "grad_norm": 0.003907489590346813, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 983 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30112794041633606, + "epoch": 1.5769230769230769, + "grad_norm": 0.0035790693946182728, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14669.0, + "completions/mean_length": 7357.505859375, + "completions/mean_terminated_length": 7339.84130859375, + "completions/min_length": 3741.0, + "completions/min_terminated_length": 3741.0, + "entropy": 0.3018489480018616, + "epoch": 1.578525641025641, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003833780298009515, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 1256820097.0, + "reward": 0.8965234756469727, + "reward_std": 0.013906250707805157, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.996874988079071, + "rewards/symbolic_reward_partial_score/std": 0.051494304090738297, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0690256357192993, + "sampling/importance_sampling_ratio/min": 0.00016220644465647638, + "sampling/sampling_logp_difference/max": 8.726640701293945, + "sampling/sampling_logp_difference/mean": 0.11971522867679596, + "step": 985 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30244719982147217, + "epoch": 1.5801282051282053, + "grad_norm": 0.0026512236800044775, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 986 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30103157460689545, + "epoch": 1.5817307692307692, + "grad_norm": 0.0024182642810046673, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30128341913223267, + "epoch": 1.5833333333333335, + "grad_norm": 0.00353293027728796, + "learning_rate": 1e-06, + "loss": -0.009, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11739.0, + "completions/max_terminated_length": 11739.0, + "completions/mean_length": 7189.322265625, + "completions/mean_terminated_length": 7189.322265625, + "completions/min_length": 3961.0, + "completions/min_terminated_length": 3961.0, + "entropy": 0.308343768119812, + "epoch": 1.5849358974358974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1261283766.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706350803375244, + "sampling/importance_sampling_ratio/min": 0.0024685210082679987, + "sampling/sampling_logp_difference/max": 6.004136085510254, + "sampling/sampling_logp_difference/mean": 0.12226317822933197, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30838756263256073, + "epoch": 1.5865384615384617, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3085535615682602, + "epoch": 1.5881410256410255, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30801253020763397, + "epoch": 1.5897435897435899, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14298.0, + "completions/max_terminated_length": 14298.0, + "completions/mean_length": 7401.361328125, + "completions/mean_terminated_length": 7401.361328125, + "completions/min_length": 3344.0, + "completions/min_terminated_length": 3344.0, + "entropy": 0.30265168845653534, + "epoch": 1.5913461538461537, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.016912033781409264, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 1265987423.0, + "reward": 0.8945801258087158, + "reward_std": 0.014904549345374107, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.99755859375, + "rewards/symbolic_reward_partial_score/std": 0.044628970324993134, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696299076080322, + "sampling/importance_sampling_ratio/min": 0.0019413894042372704, + "sampling/sampling_logp_difference/max": 6.244351387023926, + "sampling/sampling_logp_difference/mean": 0.12015168368816376, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3008352816104889, + "epoch": 1.592948717948718, + "grad_norm": 0.005295754410326481, + "learning_rate": 1e-06, + "loss": -0.0123, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30222633481025696, + "epoch": 1.594551282051282, + "grad_norm": 0.014241804368793964, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 995 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30317962169647217, + "epoch": 1.5961538461538463, + "grad_norm": 0.003911672160029411, + "learning_rate": 1e-06, + "loss": -0.0128, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14579.0, + "completions/max_terminated_length": 14579.0, + "completions/mean_length": 7341.966796875, + "completions/mean_terminated_length": 7341.966796875, + "completions/min_length": 3543.0, + "completions/min_terminated_length": 3543.0, + "entropy": 0.3111606240272522, + "epoch": 1.5977564102564101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1270548686.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0711815357208252, + "sampling/importance_sampling_ratio/min": 1.5573759729742287e-12, + "sampling/sampling_logp_difference/max": 27.188018798828125, + "sampling/sampling_logp_difference/mean": 0.12292777001857758, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3086211085319519, + "epoch": 1.5993589743589745, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30883246660232544, + "epoch": 1.6009615384615383, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31077563762664795, + "epoch": 1.6025641025641026, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12309.0, + "completions/max_terminated_length": 12309.0, + "completions/mean_length": 7422.494140625, + "completions/mean_terminated_length": 7422.494140625, + "completions/min_length": 3676.0, + "completions/min_terminated_length": 3676.0, + "entropy": 0.2989673614501953, + "epoch": 1.6041666666666665, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002359275473281741, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 1275273659.0, + "reward": 0.8982422351837158, + "reward_std": 0.00703125074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.998046875, + "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0692410469055176, + "sampling/importance_sampling_ratio/min": 0.002691791858524084, + "sampling/sampling_logp_difference/max": 5.917548179626465, + "sampling/sampling_logp_difference/mean": 0.11995130777359009, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30219927430152893, + "epoch": 1.6057692307692308, + "grad_norm": 0.0021961370948702097, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.301533043384552, + "epoch": 1.6073717948717947, + "grad_norm": 0.0021459702402353287, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3046448081731796, + "epoch": 1.608974358974359, + "grad_norm": 0.002298478502780199, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12258.0, + "completions/max_terminated_length": 12258.0, + "completions/mean_length": 7409.1484375, + "completions/mean_terminated_length": 7409.1484375, + "completions/min_length": 3620.0, + "completions/min_terminated_length": 3620.0, + "entropy": 0.305515393614769, + "epoch": 1.6105769230769231, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006075072567909956, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 1279892007.0, + "reward": 0.893310546875, + "reward_std": 0.02366338111460209, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9972330927848816, + "rewards/symbolic_reward_partial_score/std": 0.04513990134000778, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702705383300781, + "sampling/importance_sampling_ratio/min": 9.090582153703508e-08, + "sampling/sampling_logp_difference/max": 16.213441848754883, + "sampling/sampling_logp_difference/mean": 0.12114111334085464, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30623990297317505, + "epoch": 1.6121794871794872, + "grad_norm": 0.01973203755915165, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 1006 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3060086518526077, + "epoch": 1.6137820512820513, + "grad_norm": 0.004733169451355934, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 1007 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3062121272087097, + "epoch": 1.6153846153846154, + "grad_norm": 0.0320599228143692, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13785.0, + "completions/max_terminated_length": 13785.0, + "completions/mean_length": 7285.33203125, + "completions/mean_terminated_length": 7285.33203125, + "completions/min_length": 2963.0, + "completions/min_terminated_length": 2963.0, + "entropy": 0.31654123961925507, + "epoch": 1.6169871794871795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1284390481.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0720587968826294, + "sampling/importance_sampling_ratio/min": 0.0013273663353174925, + "sampling/sampling_logp_difference/max": 6.624558448791504, + "sampling/sampling_logp_difference/mean": 0.12469251453876495, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31280288100242615, + "epoch": 1.6185897435897436, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3145320415496826, + "epoch": 1.6201923076923077, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3184860795736313, + "epoch": 1.6217948717948718, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12617.0, + "completions/max_terminated_length": 12617.0, + "completions/mean_length": 7735.546875, + "completions/mean_terminated_length": 7735.546875, + "completions/min_length": 3578.0, + "completions/min_terminated_length": 3578.0, + "entropy": 0.30475133657455444, + "epoch": 1.623397435897436, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004162474535405636, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 1289199177.0, + "reward": 0.8960742950439453, + "reward_std": 0.015703124925494194, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9986327886581421, + "rewards/symbolic_reward_partial_score/std": 0.022945649921894073, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0703797340393066, + "sampling/importance_sampling_ratio/min": 0.0022015005815774202, + "sampling/sampling_logp_difference/max": 6.118616104125977, + "sampling/sampling_logp_difference/mean": 0.1216018795967102, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30561186373233795, + "epoch": 1.625, + "grad_norm": 0.004154075402766466, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 1014 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30855441093444824, + "epoch": 1.626602564102564, + "grad_norm": 0.00390764232724905, + "learning_rate": 1e-06, + "loss": -0.0126, + "step": 1015 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30875104665756226, + "epoch": 1.6282051282051282, + "grad_norm": 0.0035740600433200598, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13899.0, + "completions/max_terminated_length": 13899.0, + "completions/mean_length": 7892.951171875, + "completions/mean_terminated_length": 7892.951171875, + "completions/min_length": 2880.0, + "completions/min_terminated_length": 2880.0, + "entropy": 0.3002883791923523, + "epoch": 1.6298076923076923, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0051344577223062515, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 1294171984.0, + "reward": 0.8945556879043579, + "reward_std": 0.02177734486758709, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9974772334098816, + "rewards/symbolic_reward_partial_score/std": 0.04481436312198639, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068955421447754, + "sampling/importance_sampling_ratio/min": 0.00017753004794940352, + "sampling/sampling_logp_difference/max": 8.636370658874512, + "sampling/sampling_logp_difference/mean": 0.11876966059207916, + "step": 1017 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30114126205444336, + "epoch": 1.6314102564102564, + "grad_norm": 0.025489378720521927, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29750263690948486, + "epoch": 1.6330128205128205, + "grad_norm": 0.004476260859519243, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2999611049890518, + "epoch": 1.6346153846153846, + "grad_norm": 0.03016292303800583, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12380.0, + "completions/max_terminated_length": 12380.0, + "completions/mean_length": 7600.552734375, + "completions/mean_terminated_length": 7600.552734375, + "completions/min_length": 3388.0, + "completions/min_terminated_length": 3388.0, + "entropy": 0.31947872042655945, + "epoch": 1.6362179487179487, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0039420961402356625, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 1298850939.0, + "reward": 0.8975342512130737, + "reward_std": 0.00986328162252903, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9995931386947632, + "rewards/symbolic_reward_partial_score/std": 0.006633348762989044, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.072902798652649, + "sampling/importance_sampling_ratio/min": 0.0011838971404358745, + "sampling/sampling_logp_difference/max": 6.738943576812744, + "sampling/sampling_logp_difference/mean": 0.1259896457195282, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3202350288629532, + "epoch": 1.6378205128205128, + "grad_norm": 0.003697711741551757, + "learning_rate": 1e-06, + "loss": -0.0088, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3217895179986954, + "epoch": 1.6394230769230769, + "grad_norm": 0.027309000492095947, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3194885402917862, + "epoch": 1.641025641025641, + "grad_norm": 0.0037369539495557547, + "learning_rate": 1e-06, + "loss": -0.0094, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14350.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 7857.98046875, + "completions/mean_terminated_length": 7857.98046875, + "completions/min_length": 4121.0, + "completions/min_terminated_length": 4121.0, + "entropy": 0.316010519862175, + "epoch": 1.6426282051282053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1303731377.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07135808467865, + "sampling/importance_sampling_ratio/min": 0.0011098433751612902, + "sampling/sampling_logp_difference/max": 6.803536415100098, + "sampling/sampling_logp_difference/mean": 0.12318795174360275, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3135620355606079, + "epoch": 1.6442307692307692, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30700941383838654, + "epoch": 1.6458333333333335, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3093920797109604, + "epoch": 1.6474358974358974, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13836.0, + "completions/max_terminated_length": 13836.0, + "completions/mean_length": 7814.375, + "completions/mean_terminated_length": 7814.375, + "completions/min_length": 3011.0, + "completions/min_terminated_length": 3011.0, + "entropy": 0.3088514804840088, + "epoch": 1.6490384615384617, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0025735602248460054, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 1308590833.0, + "reward": 0.8987793326377869, + "reward_std": 0.0048828125, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0705108642578125, + "sampling/importance_sampling_ratio/min": 0.003677040571346879, + "sampling/sampling_logp_difference/max": 5.605647087097168, + "sampling/sampling_logp_difference/mean": 0.12232539057731628, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3050430715084076, + "epoch": 1.6506410256410255, + "grad_norm": 0.026151379570364952, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3086225986480713, + "epoch": 1.6522435897435899, + "grad_norm": 0.0025779418647289276, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3103097528219223, + "epoch": 1.6538461538461537, + "grad_norm": 0.002719617448747158, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14775.0, + "completions/max_terminated_length": 14775.0, + "completions/mean_length": 8091.33984375, + "completions/mean_terminated_length": 8091.33984375, + "completions/min_length": 4480.0, + "completions/min_terminated_length": 4480.0, + "entropy": 0.2884677052497864, + "epoch": 1.655448717948718, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0037412128876894712, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 1313699583.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804688394069672, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996419548988342, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0662262439727783, + "sampling/importance_sampling_ratio/min": 0.005839198362082243, + "sampling/sampling_logp_difference/max": 5.143161773681641, + "sampling/sampling_logp_difference/mean": 0.1148136556148529, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28683267533779144, + "epoch": 1.657051282051282, + "grad_norm": 0.003717794781550765, + "learning_rate": 1e-06, + "loss": -0.0095, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2895479053258896, + "epoch": 1.6586538461538463, + "grad_norm": 0.00363312684930861, + "learning_rate": 1e-06, + "loss": -0.0098, + "step": 1035 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28496386110782623, + "epoch": 1.6602564102564101, + "grad_norm": 0.027082180604338646, + "learning_rate": 1e-06, + "loss": 0.029, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11484.0, + "completions/max_terminated_length": 11484.0, + "completions/mean_length": 7448.791015625, + "completions/mean_terminated_length": 7448.791015625, + "completions/min_length": 3219.0, + "completions/min_terminated_length": 3219.0, + "entropy": 0.3116293400526047, + "epoch": 1.6618589743589745, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0021872571669518948, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 1318367204.0, + "reward": 0.8987696170806885, + "reward_std": 0.004921874962747097, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.999804675579071, + "rewards/symbolic_reward_partial_score/std": 0.004419418517500162, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0708409547805786, + "sampling/importance_sampling_ratio/min": 0.0022659434471279383, + "sampling/sampling_logp_difference/max": 6.08976411819458, + "sampling/sampling_logp_difference/mean": 0.12293624877929688, + "step": 1037 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30697862803936005, + "epoch": 1.6634615384615383, + "grad_norm": 0.0015544932102784514, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31002984941005707, + "epoch": 1.6650641025641026, + "grad_norm": 0.023099038749933243, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31141215562820435, + "epoch": 1.6666666666666665, + "grad_norm": 0.0024219758342951536, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12306.0, + "completions/max_terminated_length": 12306.0, + "completions/mean_length": 7141.751953125, + "completions/mean_terminated_length": 7141.751953125, + "completions/min_length": 3117.0, + "completions/min_terminated_length": 3117.0, + "entropy": 0.3169794976711273, + "epoch": 1.6682692307692308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1322892181.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0715605020523071, + "sampling/importance_sampling_ratio/min": 0.0019754667300730944, + "sampling/sampling_logp_difference/max": 6.226950645446777, + "sampling/sampling_logp_difference/mean": 0.12364828586578369, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31208787858486176, + "epoch": 1.6698717948717947, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3129581958055496, + "epoch": 1.671474358974359, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31092701852321625, + "epoch": 1.6730769230769231, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13112.0, + "completions/max_terminated_length": 13112.0, + "completions/mean_length": 7309.1953125, + "completions/mean_terminated_length": 7309.1953125, + "completions/min_length": 3182.0, + "completions/min_terminated_length": 3182.0, + "entropy": 0.2980504482984543, + "epoch": 1.6746794871794872, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.005616697948426008, + "learning_rate": 1e-06, + "loss": -0.0244, + "num_tokens": 1327557033.0, + "reward": 0.8921875357627869, + "reward_std": 0.0312500037252903, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.98828125, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, + "rewards/symbolic_reward_partial_score/std": 0.0320318341255188, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0682005882263184, + "sampling/importance_sampling_ratio/min": 0.001380621804855764, + "sampling/sampling_logp_difference/max": 6.585221290588379, + "sampling/sampling_logp_difference/mean": 0.1181633397936821, + "step": 1045 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2949713468551636, + "epoch": 1.6762820512820513, + "grad_norm": 0.035627998411655426, + "learning_rate": 1e-06, + "loss": 0.0373, + "step": 1046 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0546875, + "entropy": 0.3000248521566391, + "epoch": 1.6778846153846154, + "grad_norm": 0.004616168327629566, + "learning_rate": 1e-06, + "loss": -0.0122, + "step": 1047 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2983495146036148, + "epoch": 1.6794871794871795, + "grad_norm": 0.005554524250328541, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12885.0, + "completions/max_terminated_length": 12885.0, + "completions/mean_length": 7191.85546875, + "completions/mean_terminated_length": 7191.85546875, + "completions/min_length": 3008.0, + "completions/min_terminated_length": 3008.0, + "entropy": 0.3059154152870178, + "epoch": 1.6810897435897436, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.02501201070845127, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 1332076111.0, + "reward": 0.89404296875, + "reward_std": 0.02382812649011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, + "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069615125656128, + "sampling/importance_sampling_ratio/min": 0.002162045333534479, + "sampling/sampling_logp_difference/max": 6.136700630187988, + "sampling/sampling_logp_difference/mean": 0.12109000235795975, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30635979771614075, + "epoch": 1.6826923076923077, + "grad_norm": 0.004613403230905533, + "learning_rate": 1e-06, + "loss": -0.0175, + "step": 1050 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30721524357795715, + "epoch": 1.6842948717948718, + "grad_norm": 0.025122642517089844, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 1051 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.303785964846611, + "epoch": 1.685897435897436, + "grad_norm": 0.026602037250995636, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12410.0, + "completions/max_terminated_length": 12410.0, + "completions/mean_length": 7009.740234375, + "completions/mean_terminated_length": 7009.740234375, + "completions/min_length": 3039.0, + "completions/min_terminated_length": 3039.0, + "entropy": 0.30636970698833466, + "epoch": 1.6875, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.0045735533349215984, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 1336534170.0, + "reward": 0.893505871295929, + "reward_std": 0.02597656473517418, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9939779043197632, + "rewards/symbolic_reward_partial_score/std": 0.07647283375263214, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696215629577637, + "sampling/importance_sampling_ratio/min": 0.0018621934577822685, + "sampling/sampling_logp_difference/max": 6.2860002517700195, + "sampling/sampling_logp_difference/mean": 0.1207767054438591, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3046419471502304, + "epoch": 1.689102564102564, + "grad_norm": 0.0311747919768095, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 1054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.3092672973871231, + "epoch": 1.6907051282051282, + "grad_norm": 0.026585765182971954, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3052953779697418, + "epoch": 1.6923076923076923, + "grad_norm": 0.004019228275865316, + "learning_rate": 1e-06, + "loss": -0.0155, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12460.0, + "completions/max_terminated_length": 12460.0, + "completions/mean_length": 7024.05078125, + "completions/mean_terminated_length": 7024.05078125, + "completions/min_length": 3136.0, + "completions/min_terminated_length": 3136.0, + "entropy": 0.31111109256744385, + "epoch": 1.6939102564102564, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0032420039642602205, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 1340954756.0, + "reward": 0.8969531059265137, + "reward_std": 0.012187501415610313, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9976562261581421, + "rewards/symbolic_reward_partial_score/std": 0.0450524240732193, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706874132156372, + "sampling/importance_sampling_ratio/min": 0.0010910998098552227, + "sampling/sampling_logp_difference/max": 6.820569038391113, + "sampling/sampling_logp_difference/mean": 0.12288743257522583, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30891941487789154, + "epoch": 1.6955128205128205, + "grad_norm": 0.003183747874572873, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3126651793718338, + "epoch": 1.6971153846153846, + "grad_norm": 0.0023205929901450872, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3115437924861908, + "epoch": 1.6987179487179487, + "grad_norm": 0.0032985415309667587, + "learning_rate": 1e-06, + "loss": -0.0089, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12176.0, + "completions/max_terminated_length": 12176.0, + "completions/mean_length": 7136.126953125, + "completions/mean_terminated_length": 7136.126953125, + "completions/min_length": 3152.0, + "completions/min_terminated_length": 3152.0, + "entropy": 0.31162822246551514, + "epoch": 1.7003205128205128, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0024950928054749966, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 1345428069.0, + "reward": 0.8987793326377869, + "reward_std": 0.0048828125, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0699987411499023, + "sampling/importance_sampling_ratio/min": 0.001176500809378922, + "sampling/sampling_logp_difference/max": 6.745210647583008, + "sampling/sampling_logp_difference/mean": 0.12207436561584473, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30960360169410706, + "epoch": 1.7019230769230769, + "grad_norm": 0.0017172261141240597, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3100372850894928, + "epoch": 1.703525641025641, + "grad_norm": 0.0024772342294454575, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31119243800640106, + "epoch": 1.7051282051282053, + "grad_norm": 0.02614988386631012, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12824.0, + "completions/max_terminated_length": 12824.0, + "completions/mean_length": 7220.416015625, + "completions/mean_terminated_length": 7220.416015625, + "completions/min_length": 2008.0, + "completions/min_terminated_length": 2008.0, + "entropy": 0.31035639345645905, + "epoch": 1.7067307692307692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1349993530.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069838047027588, + "sampling/importance_sampling_ratio/min": 0.00014011569146532565, + "sampling/sampling_logp_difference/max": 8.873042106628418, + "sampling/sampling_logp_difference/mean": 0.12209658324718475, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30622316896915436, + "epoch": 1.7083333333333335, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3089989274740219, + "epoch": 1.7099358974358974, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3127421587705612, + "epoch": 1.7115384615384617, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12989.0, + "completions/max_terminated_length": 12989.0, + "completions/mean_length": 7588.2265625, + "completions/mean_terminated_length": 7588.2265625, + "completions/min_length": 4223.0, + "completions/min_terminated_length": 4223.0, + "entropy": 0.3054191470146179, + "epoch": 1.7131410256410255, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.018871258944272995, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 1354722926.0, + "reward": 0.8963379263877869, + "reward_std": 0.01155400462448597, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0694046020507812, + "sampling/importance_sampling_ratio/min": 0.0024460311979055405, + "sampling/sampling_logp_difference/max": 6.013288497924805, + "sampling/sampling_logp_difference/mean": 0.12116353213787079, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3093167692422867, + "epoch": 1.7147435897435899, + "grad_norm": 0.0049339476972818375, + "learning_rate": 1e-06, + "loss": -0.0122, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30434533953666687, + "epoch": 1.7163461538461537, + "grad_norm": 0.004389526788145304, + "learning_rate": 1e-06, + "loss": -0.0122, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30614908039569855, + "epoch": 1.717948717948718, + "grad_norm": 0.0026515277568250895, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12732.0, + "completions/max_terminated_length": 12732.0, + "completions/mean_length": 7372.23828125, + "completions/mean_terminated_length": 7372.23828125, + "completions/min_length": 2964.0, + "completions/min_terminated_length": 2964.0, + "entropy": 0.31145814061164856, + "epoch": 1.719551282051282, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.006173459812998772, + "learning_rate": 1e-06, + "loss": -0.0209, + "num_tokens": 1359454808.0, + "reward": 0.8932129144668579, + "reward_std": 0.02405400574207306, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, + "rewards/symbolic_reward_partial_score/std": 0.046963535249233246, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0696375370025635, + "sampling/importance_sampling_ratio/min": 0.002213636413216591, + "sampling/sampling_logp_difference/max": 6.113118648529053, + "sampling/sampling_logp_difference/mean": 0.12133283913135529, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3093886077404022, + "epoch": 1.7211538461538463, + "grad_norm": 0.0051024542190134525, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3084931820631027, + "epoch": 1.7227564102564101, + "grad_norm": 0.02555413916707039, + "learning_rate": 1e-06, + "loss": 0.0442, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30458875000476837, + "epoch": 1.7243589743589745, + "grad_norm": 0.005829751957207918, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12915.0, + "completions/max_terminated_length": 12915.0, + "completions/mean_length": 7552.041015625, + "completions/mean_terminated_length": 7552.041015625, + "completions/min_length": 4131.0, + "completions/min_terminated_length": 4131.0, + "entropy": 0.3053240031003952, + "epoch": 1.7259615384615383, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0045834193006157875, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 1364209533.0, + "reward": 0.8963379263877869, + "reward_std": 0.011554005555808544, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0692803859710693, + "sampling/importance_sampling_ratio/min": 4.1937363448596443e-07, + "sampling/sampling_logp_difference/max": 14.684503555297852, + "sampling/sampling_logp_difference/mean": 0.12061621993780136, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30637161433696747, + "epoch": 1.7275641025641026, + "grad_norm": 0.004549259319901466, + "learning_rate": 1e-06, + "loss": -0.0116, + "step": 1078 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3078131079673767, + "epoch": 1.7291666666666665, + "grad_norm": 0.003365602809935808, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30383966863155365, + "epoch": 1.7307692307692308, + "grad_norm": 0.005219892133027315, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13163.0, + "completions/max_terminated_length": 13163.0, + "completions/mean_length": 7803.697265625, + "completions/mean_terminated_length": 7803.697265625, + "completions/min_length": 3165.0, + "completions/min_terminated_length": 3165.0, + "entropy": 0.29921863973140717, + "epoch": 1.7323717948717947, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0048124357126653194, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 1369126098.0, + "reward": 0.8949414491653442, + "reward_std": 0.017115186899900436, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9987630248069763, + "rewards/symbolic_reward_partial_score/std": 0.01657147705554962, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0677688121795654, + "sampling/importance_sampling_ratio/min": 0.0015285428380593657, + "sampling/sampling_logp_difference/max": 6.483440399169922, + "sampling/sampling_logp_difference/mean": 0.11848267912864685, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29725104570388794, + "epoch": 1.733974358974359, + "grad_norm": 0.004948347806930542, + "learning_rate": 1e-06, + "loss": -0.0151, + "step": 1082 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30296705663204193, + "epoch": 1.7355769230769231, + "grad_norm": 0.004423732403665781, + "learning_rate": 1e-06, + "loss": -0.0158, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2992386966943741, + "epoch": 1.7371794871794872, + "grad_norm": 0.01645185425877571, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12400.0, + "completions/mean_length": 7506.873046875, + "completions/mean_terminated_length": 7489.5009765625, + "completions/min_length": 2675.0, + "completions/min_terminated_length": 2675.0, + "entropy": 0.30802831053733826, + "epoch": 1.7387820512820513, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005042756907641888, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 1373828897.0, + "reward": 0.8933007717132568, + "reward_std": 0.02315332740545273, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, + "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06916344165802, + "sampling/importance_sampling_ratio/min": 0.002870687283575535, + "sampling/sampling_logp_difference/max": 5.853203773498535, + "sampling/sampling_logp_difference/mean": 0.12105688452720642, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30676987767219543, + "epoch": 1.7403846153846154, + "grad_norm": 0.019612792879343033, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3068690150976181, + "epoch": 1.7419871794871795, + "grad_norm": 0.03385534510016441, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30766068398952484, + "epoch": 1.7435897435897436, + "grad_norm": 0.004236425273120403, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13133.0, + "completions/max_terminated_length": 13133.0, + "completions/mean_length": 8049.69140625, + "completions/mean_terminated_length": 8049.69140625, + "completions/min_length": 3345.0, + "completions/min_terminated_length": 3345.0, + "entropy": 0.29996953904628754, + "epoch": 1.7451923076923077, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003684076014906168, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 1378946483.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804688394069672, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067392349243164, + "sampling/importance_sampling_ratio/min": 0.0015522823669016361, + "sampling/sampling_logp_difference/max": 6.468029022216797, + "sampling/sampling_logp_difference/mean": 0.11816658079624176, + "step": 1089 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29776372015476227, + "epoch": 1.7467948717948718, + "grad_norm": 0.0024523658212274313, + "learning_rate": 1e-06, + "loss": -0.0091, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2962539494037628, + "epoch": 1.748397435897436, + "grad_norm": 0.029643870890140533, + "learning_rate": 1e-06, + "loss": 0.0333, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3009512573480606, + "epoch": 1.75, + "grad_norm": 0.003705281764268875, + "learning_rate": 1e-06, + "loss": -0.0101, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13541.0, + "completions/max_terminated_length": 13541.0, + "completions/mean_length": 7528.818359375, + "completions/mean_terminated_length": 7528.818359375, + "completions/min_length": 2870.0, + "completions/min_terminated_length": 2870.0, + "entropy": 0.3087153285741806, + "epoch": 1.751602564102564, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.03856688365340233, + "learning_rate": 1e-06, + "loss": 0.0225, + "num_tokens": 1383655846.0, + "reward": 0.8951172232627869, + "reward_std": 0.01643681712448597, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, + "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069844365119934, + "sampling/importance_sampling_ratio/min": 0.0015164552023634315, + "sampling/sampling_logp_difference/max": 6.491379737854004, + "sampling/sampling_logp_difference/mean": 0.12191909551620483, + "step": 1093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3065723031759262, + "epoch": 1.7532051282051282, + "grad_norm": 0.004395030438899994, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 1094 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.31010180711746216, + "epoch": 1.7548076923076923, + "grad_norm": 0.005476230755448341, + "learning_rate": 1e-06, + "loss": -0.0161, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3097592294216156, + "epoch": 1.7564102564102564, + "grad_norm": 0.00418106047436595, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13614.0, + "completions/max_terminated_length": 13614.0, + "completions/mean_length": 7423.01953125, + "completions/mean_terminated_length": 7423.01953125, + "completions/min_length": 3172.0, + "completions/min_terminated_length": 3172.0, + "entropy": 0.3101639300584793, + "epoch": 1.7580128205128205, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.004919133614748716, + "learning_rate": 1e-06, + "loss": -0.0111, + "num_tokens": 1388312768.0, + "reward": 0.8963086605072021, + "reward_std": 0.011646436527371407, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9994140863418579, + "rewards/symbolic_reward_partial_score/std": 0.007639662828296423, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0697271823883057, + "sampling/importance_sampling_ratio/min": 0.0010703522711992264, + "sampling/sampling_logp_difference/max": 6.8397674560546875, + "sampling/sampling_logp_difference/mean": 0.12227816879749298, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31154482066631317, + "epoch": 1.7596153846153846, + "grad_norm": 0.004658720921725035, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.31060969829559326, + "epoch": 1.7612179487179487, + "grad_norm": 0.004687273874878883, + "learning_rate": 1e-06, + "loss": -0.0112, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30701225996017456, + "epoch": 1.7628205128205128, + "grad_norm": 0.004379766061902046, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12550.0, + "completions/max_terminated_length": 12550.0, + "completions/mean_length": 7696.623046875, + "completions/mean_terminated_length": 7696.623046875, + "completions/min_length": 2813.0, + "completions/min_terminated_length": 2813.0, + "entropy": 0.30509281158447266, + "epoch": 1.7644230769230769, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0035905460827052593, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 1393157807.0, + "reward": 0.8975586295127869, + "reward_std": 0.0066711921244859695, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, + "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0688618421554565, + "sampling/importance_sampling_ratio/min": 0.002739542629569769, + "sampling/sampling_logp_difference/max": 5.899964332580566, + "sampling/sampling_logp_difference/mean": 0.1204448714852333, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30465711653232574, + "epoch": 1.766025641025641, + "grad_norm": 0.0038503403775393963, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30177219212055206, + "epoch": 1.7676282051282053, + "grad_norm": 0.00662583764642477, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30572329461574554, + "epoch": 1.7692307692307692, + "grad_norm": 0.019112765789031982, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12681.0, + "completions/max_terminated_length": 12681.0, + "completions/mean_length": 7622.841796875, + "completions/mean_terminated_length": 7622.841796875, + "completions/min_length": 3801.0, + "completions/min_terminated_length": 3801.0, + "entropy": 0.3011849373579025, + "epoch": 1.7708333333333335, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.027583763003349304, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 1397982030.0, + "reward": 0.8950977325439453, + "reward_std": 0.016514942049980164, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9992838501930237, + "rewards/symbolic_reward_partial_score/std": 0.00811202172189951, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0681089162826538, + "sampling/importance_sampling_ratio/min": 0.0003649833088275045, + "sampling/sampling_logp_difference/max": 7.915658950805664, + "sampling/sampling_logp_difference/mean": 0.11929503828287125, + "step": 1105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30475567281246185, + "epoch": 1.7724358974358974, + "grad_norm": 0.005002895370125771, + "learning_rate": 1e-06, + "loss": -0.016, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29859915375709534, + "epoch": 1.7740384615384617, + "grad_norm": 0.005493351258337498, + "learning_rate": 1e-06, + "loss": -0.0167, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3013754189014435, + "epoch": 1.7756410256410255, + "grad_norm": 0.02845832332968712, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13094.0, + "completions/max_terminated_length": 13094.0, + "completions/mean_length": 7377.328125, + "completions/mean_terminated_length": 7377.328125, + "completions/min_length": 4239.0, + "completions/min_terminated_length": 4239.0, + "entropy": 0.30232779681682587, + "epoch": 1.7772435897435899, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003389689838513732, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 1402510950.0, + "reward": 0.8975391387939453, + "reward_std": 0.009843749925494194, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996093511581421, + "rewards/symbolic_reward_partial_score/std": 0.006243883166462183, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069146752357483, + "sampling/importance_sampling_ratio/min": 0.0019089938141405582, + "sampling/sampling_logp_difference/max": 6.261178970336914, + "sampling/sampling_logp_difference/mean": 0.12119454890489578, + "step": 1109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30659207701683044, + "epoch": 1.7788461538461537, + "grad_norm": 0.002910124370828271, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3085610866546631, + "epoch": 1.780448717948718, + "grad_norm": 0.003460104111582041, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30735933780670166, + "epoch": 1.782051282051282, + "grad_norm": 0.0027840950060635805, + "learning_rate": 1e-06, + "loss": -0.0078, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13833.0, + "completions/max_terminated_length": 13833.0, + "completions/mean_length": 7699.49609375, + "completions/mean_terminated_length": 7699.49609375, + "completions/min_length": 2247.0, + "completions/min_terminated_length": 2247.0, + "entropy": 0.3024802505970001, + "epoch": 1.7836538461538463, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0052666435949504375, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 1407318500.0, + "reward": 0.8945801258087158, + "reward_std": 0.01858525536954403, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.99755859375, + "rewards/symbolic_reward_partial_score/std": 0.044628970324993134, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0683372020721436, + "sampling/importance_sampling_ratio/min": 0.00017096490773838013, + "sampling/sampling_logp_difference/max": 8.674052238464355, + "sampling/sampling_logp_difference/mean": 0.11985655128955841, + "step": 1113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.30356265604496, + "epoch": 1.7852564102564101, + "grad_norm": 0.0042807371355593204, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 1114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30395713448524475, + "epoch": 1.7868589743589745, + "grad_norm": 0.004926716443151236, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30027005076408386, + "epoch": 1.7884615384615383, + "grad_norm": 0.019763482734560966, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13435.0, + "completions/max_terminated_length": 13435.0, + "completions/mean_length": 7422.830078125, + "completions/mean_terminated_length": 7422.830078125, + "completions/min_length": 2285.0, + "completions/min_terminated_length": 2285.0, + "entropy": 0.30064651370048523, + "epoch": 1.7900641025641026, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004853361286222935, + "learning_rate": 1e-06, + "loss": -0.0149, + "num_tokens": 1412002157.0, + "reward": 0.8963282108306885, + "reward_std": 0.014687499962747097, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9994791746139526, + "rewards/symbolic_reward_partial_score/std": 0.006817440502345562, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0680367946624756, + "sampling/importance_sampling_ratio/min": 1.495128998385553e-07, + "sampling/sampling_logp_difference/max": 15.715883255004883, + "sampling/sampling_logp_difference/mean": 0.11941249668598175, + "step": 1117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30547034740448, + "epoch": 1.7916666666666665, + "grad_norm": 0.03301442041993141, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.3002014458179474, + "epoch": 1.7932692307692308, + "grad_norm": 0.030427923426032066, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29985660314559937, + "epoch": 1.7948717948717947, + "grad_norm": 0.029749156907200813, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12488.0, + "completions/max_terminated_length": 12488.0, + "completions/mean_length": 7279.498046875, + "completions/mean_terminated_length": 7279.498046875, + "completions/min_length": 3535.0, + "completions/min_terminated_length": 3535.0, + "entropy": 0.3116193860769272, + "epoch": 1.796474358974359, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.002386681968346238, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 1416602988.0, + "reward": 0.8982422351837158, + "reward_std": 0.00703125074505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.998046875, + "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702404975891113, + "sampling/importance_sampling_ratio/min": 0.0015238322084769607, + "sampling/sampling_logp_difference/max": 6.486526966094971, + "sampling/sampling_logp_difference/mean": 0.12285487353801727, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.311521053314209, + "epoch": 1.7980769230769231, + "grad_norm": 0.0017178966663777828, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.31351254880428314, + "epoch": 1.7996794871794872, + "grad_norm": 0.002310063922777772, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3113110065460205, + "epoch": 1.8012820512820513, + "grad_norm": 0.0022971148137003183, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12657.0, + "completions/max_terminated_length": 12657.0, + "completions/mean_length": 7098.353515625, + "completions/mean_terminated_length": 7098.353515625, + "completions/min_length": 2688.0, + "completions/min_terminated_length": 2688.0, + "entropy": 0.30783234536647797, + "epoch": 1.8028846153846154, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005364755168557167, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 1421106401.0, + "reward": 0.8938770294189453, + "reward_std": 0.018278565257787704, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.990234375, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.9991210699081421, + "rewards/symbolic_reward_partial_score/std": 0.008895767852663994, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069352388381958, + "sampling/importance_sampling_ratio/min": 0.0006668087444268167, + "sampling/sampling_logp_difference/max": 7.313007354736328, + "sampling/sampling_logp_difference/mean": 0.12137231230735779, + "step": 1125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30746787786483765, + "epoch": 1.8044871794871795, + "grad_norm": 0.0049836039543151855, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 1126 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.307818204164505, + "epoch": 1.8060897435897436, + "grad_norm": 0.00515682203695178, + "learning_rate": 1e-06, + "loss": -0.0155, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3116377890110016, + "epoch": 1.8076923076923077, + "grad_norm": 0.005574796348810196, + "learning_rate": 1e-06, + "loss": 0.0293, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12872.0, + "completions/max_terminated_length": 12872.0, + "completions/mean_length": 7302.146484375, + "completions/mean_terminated_length": 7302.146484375, + "completions/min_length": 3062.0, + "completions/min_terminated_length": 3062.0, + "entropy": 0.30744411051273346, + "epoch": 1.8092948717948718, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004532939754426479, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 1425751324.0, + "reward": 0.8945116996765137, + "reward_std": 0.0183095782995224, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9973307251930237, + "rewards/symbolic_reward_partial_score/std": 0.04520295187830925, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06903076171875, + "sampling/importance_sampling_ratio/min": 0.0014675172278657556, + "sampling/sampling_logp_difference/max": 6.52418327331543, + "sampling/sampling_logp_difference/mean": 0.12041410058736801, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3038233667612076, + "epoch": 1.810897435897436, + "grad_norm": 0.004973679780960083, + "learning_rate": 1e-06, + "loss": -0.014, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3052085041999817, + "epoch": 1.8125, + "grad_norm": 0.0037389693316072226, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 1131 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.3043749928474426, + "epoch": 1.814102564102564, + "grad_norm": 0.004087743815034628, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15298.0, + "completions/max_terminated_length": 15298.0, + "completions/mean_length": 6917.4140625, + "completions/mean_terminated_length": 6917.4140625, + "completions/min_length": 3192.0, + "completions/min_terminated_length": 3192.0, + "entropy": 0.31978972256183624, + "epoch": 1.8157051282051282, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.029717234894633293, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 1430061392.0, + "reward": 0.8950879573822021, + "reward_std": 0.016529249027371407, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9992513060569763, + "rewards/symbolic_reward_partial_score/std": 0.008469752967357635, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071985125541687, + "sampling/importance_sampling_ratio/min": 0.0019307893235236406, + "sampling/sampling_logp_difference/max": 6.249826431274414, + "sampling/sampling_logp_difference/mean": 0.12560206651687622, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.32091666758060455, + "epoch": 1.8173076923076923, + "grad_norm": 0.01882363110780716, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.32024863362312317, + "epoch": 1.8189102564102564, + "grad_norm": 0.024035094305872917, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3210635632276535, + "epoch": 1.8205128205128205, + "grad_norm": 0.004419194534420967, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12108.0, + "completions/max_terminated_length": 12108.0, + "completions/mean_length": 7406.19140625, + "completions/mean_terminated_length": 7406.19140625, + "completions/min_length": 3186.0, + "completions/min_terminated_length": 3186.0, + "entropy": 0.30747954547405243, + "epoch": 1.8221153846153846, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.027421489357948303, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 1434730626.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804688394069672, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996419548988342, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.069690227508545, + "sampling/importance_sampling_ratio/min": 0.001751437084749341, + "sampling/sampling_logp_difference/max": 6.347318649291992, + "sampling/sampling_logp_difference/mean": 0.12143737822771072, + "step": 1137 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3087593764066696, + "epoch": 1.8237179487179487, + "grad_norm": 0.0031265048310160637, + "learning_rate": 1e-06, + "loss": -0.0087, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3095775842666626, + "epoch": 1.8253205128205128, + "grad_norm": 0.0032832089345902205, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 1139 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3101077526807785, + "epoch": 1.8269230769230769, + "grad_norm": 0.002986360341310501, + "learning_rate": 1e-06, + "loss": -0.0087, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13108.0, + "completions/max_terminated_length": 13108.0, + "completions/mean_length": 7286.09375, + "completions/mean_terminated_length": 7286.09375, + "completions/min_length": 3523.0, + "completions/min_terminated_length": 3523.0, + "entropy": 0.30785150825977325, + "epoch": 1.828525641025641, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.004749401472508907, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 1439318514.0, + "reward": 0.8963379263877869, + "reward_std": 0.01155400462448597, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0699366331100464, + "sampling/importance_sampling_ratio/min": 0.0016831329558044672, + "sampling/sampling_logp_difference/max": 6.38709831237793, + "sampling/sampling_logp_difference/mean": 0.12186126410961151, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3084726333618164, + "epoch": 1.8301282051282053, + "grad_norm": 0.003292299574241042, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3109244406223297, + "epoch": 1.8317307692307692, + "grad_norm": 0.004434341564774513, + "learning_rate": 1e-06, + "loss": -0.0107, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.31032970547676086, + "epoch": 1.8333333333333335, + "grad_norm": 0.003546637948602438, + "learning_rate": 1e-06, + "loss": 0.0189, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13205.0, + "completions/max_terminated_length": 13205.0, + "completions/mean_length": 7583.61328125, + "completions/mean_terminated_length": 7583.61328125, + "completions/min_length": 4315.0, + "completions/min_terminated_length": 4315.0, + "entropy": 0.3044735789299011, + "epoch": 1.8349358974358974, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0023911476600915194, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 1444095996.0, + "reward": 0.8987793326377869, + "reward_std": 0.004882812965661287, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.068371295928955, + "sampling/importance_sampling_ratio/min": 0.0025255377404391766, + "sampling/sampling_logp_difference/max": 5.981301307678223, + "sampling/sampling_logp_difference/mean": 0.11961719393730164, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3037119656801224, + "epoch": 1.8365384615384617, + "grad_norm": 0.0018107750220224261, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2995622456073761, + "epoch": 1.8381410256410255, + "grad_norm": 0.0025102372746914625, + "learning_rate": 1e-06, + "loss": -0.0041, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30536913871765137, + "epoch": 1.8397435897435899, + "grad_norm": 0.0023733393754810095, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14755.0, + "completions/max_terminated_length": 14755.0, + "completions/mean_length": 7358.2578125, + "completions/mean_terminated_length": 7358.2578125, + "completions/min_length": 3661.0, + "completions/min_terminated_length": 3661.0, + "entropy": 0.3135741055011749, + "epoch": 1.8413461538461537, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0034127088729292154, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 1448658560.0, + "reward": 0.8975586295127869, + "reward_std": 0.009765625, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, + "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071128010749817, + "sampling/importance_sampling_ratio/min": 0.0019161163363605738, + "sampling/sampling_logp_difference/max": 6.257454872131348, + "sampling/sampling_logp_difference/mean": 0.12349528074264526, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31672754883766174, + "epoch": 1.842948717948718, + "grad_norm": 0.023975789546966553, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1150 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.31721872091293335, + "epoch": 1.844551282051282, + "grad_norm": 0.0023351602721959352, + "learning_rate": 1e-06, + "loss": -0.0086, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.31144554913043976, + "epoch": 1.8461538461538463, + "grad_norm": 0.025021545588970184, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13811.0, + "completions/max_terminated_length": 13811.0, + "completions/mean_length": 7853.33203125, + "completions/mean_terminated_length": 7853.33203125, + "completions/min_length": 4144.0, + "completions/min_terminated_length": 4144.0, + "entropy": 0.3009275645017624, + "epoch": 1.8477564102564101, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005614398512989283, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 1453604122.0, + "reward": 0.8940234184265137, + "reward_std": 0.0202627032995224, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9957031011581421, + "rewards/symbolic_reward_partial_score/std": 0.06272586435079575, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0684130191802979, + "sampling/importance_sampling_ratio/min": 0.0020829718559980392, + "sampling/sampling_logp_difference/max": 6.173959732055664, + "sampling/sampling_logp_difference/mean": 0.11914074420928955, + "step": 1153 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.3018946796655655, + "epoch": 1.8493589743589745, + "grad_norm": 0.026210306212306023, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3021729290485382, + "epoch": 1.8509615384615383, + "grad_norm": 0.004779895767569542, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 1155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.30005723237991333, + "epoch": 1.8525641025641026, + "grad_norm": 0.0037132585421204567, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12805.0, + "completions/max_terminated_length": 12805.0, + "completions/mean_length": 7567.783203125, + "completions/mean_terminated_length": 7567.783203125, + "completions/min_length": 4338.0, + "completions/min_terminated_length": 4338.0, + "entropy": 0.3199533224105835, + "epoch": 1.8541666666666665, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004679365549236536, + "learning_rate": 1e-06, + "loss": -0.0136, + "num_tokens": 1458243931.0, + "reward": 0.8961426019668579, + "reward_std": 0.01542968861758709, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9988607168197632, + "rewards/symbolic_reward_partial_score/std": 0.016854895278811455, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.071823239326477, + "sampling/importance_sampling_ratio/min": 0.0027870051562786102, + "sampling/sampling_logp_difference/max": 5.882787704467773, + "sampling/sampling_logp_difference/mean": 0.12532271444797516, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.318808451294899, + "epoch": 1.8557692307692308, + "grad_norm": 0.004111420828849077, + "learning_rate": 1e-06, + "loss": -0.0134, + "step": 1158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.3141668140888214, + "epoch": 1.8573717948717947, + "grad_norm": 0.027966901659965515, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3200962245464325, + "epoch": 1.858974358974359, + "grad_norm": 0.004104991443455219, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13143.0, + "completions/max_terminated_length": 13143.0, + "completions/mean_length": 7944.10546875, + "completions/mean_terminated_length": 7944.10546875, + "completions/min_length": 4646.0, + "completions/min_terminated_length": 4646.0, + "entropy": 0.30335795879364014, + "epoch": 1.8605769230769231, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004546971060335636, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 1463237121.0, + "reward": 0.8958008289337158, + "reward_std": 0.01679687574505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, + "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0681650638580322, + "sampling/importance_sampling_ratio/min": 0.002749611856415868, + "sampling/sampling_logp_difference/max": 5.896295547485352, + "sampling/sampling_logp_difference/mean": 0.11904764920473099, + "step": 1161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3040137141942978, + "epoch": 1.8621794871794872, + "grad_norm": 0.004086529370397329, + "learning_rate": 1e-06, + "loss": -0.0139, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30111585557460785, + "epoch": 1.8637820512820513, + "grad_norm": 0.004171317908912897, + "learning_rate": 1e-06, + "loss": -0.0122, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29825007915496826, + "epoch": 1.8653846153846154, + "grad_norm": 0.003949691541492939, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14200.0, + "completions/max_terminated_length": 14200.0, + "completions/mean_length": 7925.0078125, + "completions/mean_terminated_length": 7925.0078125, + "completions/min_length": 4500.0, + "completions/min_terminated_length": 4500.0, + "entropy": 0.29860930144786835, + "epoch": 1.8669871794871795, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.005291419103741646, + "learning_rate": 1e-06, + "loss": -0.0159, + "num_tokens": 1468163589.0, + "reward": 0.8951172232627869, + "reward_std": 0.013342385180294514, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, + "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067738652229309, + "sampling/importance_sampling_ratio/min": 0.0019308445043861866, + "sampling/sampling_logp_difference/max": 6.249797821044922, + "sampling/sampling_logp_difference/mean": 0.11880922317504883, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29944686591625214, + "epoch": 1.8685897435897436, + "grad_norm": 0.022908741608262062, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 1166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3004586547613144, + "epoch": 1.8701923076923077, + "grad_norm": 0.016763806343078613, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 1167 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.29962702095508575, + "epoch": 1.8717948717948718, + "grad_norm": 0.004018325824290514, + "learning_rate": 1e-06, + "loss": -0.015, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14060.0, + "completions/max_terminated_length": 14060.0, + "completions/mean_length": 7686.337890625, + "completions/mean_terminated_length": 7686.337890625, + "completions/min_length": 3536.0, + "completions/min_terminated_length": 3536.0, + "entropy": 0.30295367538928986, + "epoch": 1.873397435897436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1472935090.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0685579776763916, + "sampling/importance_sampling_ratio/min": 0.0032049978617578745, + "sampling/sampling_logp_difference/max": 5.743043899536133, + "sampling/sampling_logp_difference/mean": 0.11951278150081635, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30429041385650635, + "epoch": 1.875, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30121228098869324, + "epoch": 1.876602564102564, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30381765961647034, + "epoch": 1.8782051282051282, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14807.0, + "completions/max_terminated_length": 14807.0, + "completions/mean_length": 8211.1796875, + "completions/mean_terminated_length": 8211.1796875, + "completions/min_length": 2933.0, + "completions/min_terminated_length": 2933.0, + "entropy": 0.2926924079656601, + "epoch": 1.8798076923076923, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003593733301386237, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 1478097134.0, + "reward": 0.8975586295127869, + "reward_std": 0.009765625931322575, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, + "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0667604207992554, + "sampling/importance_sampling_ratio/min": 0.0008619053987786174, + "sampling/sampling_logp_difference/max": 7.056365013122559, + "sampling/sampling_logp_difference/mean": 0.11660229414701462, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29721662402153015, + "epoch": 1.8814102564102564, + "grad_norm": 0.003629385493695736, + "learning_rate": 1e-06, + "loss": -0.0099, + "step": 1174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29691576957702637, + "epoch": 1.8830128205128205, + "grad_norm": 0.0030263112857937813, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29324012994766235, + "epoch": 1.8846153846153846, + "grad_norm": 0.0032665154431015253, + "learning_rate": 1e-06, + "loss": -0.0088, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15435.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 8228.052734375, + "completions/mean_terminated_length": 8228.052734375, + "completions/min_length": 4305.0, + "completions/min_terminated_length": 4305.0, + "entropy": 0.29094359278678894, + "epoch": 1.8862179487179487, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.007459859363734722, + "learning_rate": 1e-06, + "loss": -0.0249, + "num_tokens": 1483287497.0, + "reward": 0.8914356231689453, + "reward_std": 0.023461204022169113, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9987956285476685, + "rewards/symbolic_reward_partial_score/std": 0.010277888737618923, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0661139488220215, + "sampling/importance_sampling_ratio/min": 5.513464174100591e-08, + "sampling/sampling_logp_difference/max": 16.71348762512207, + "sampling/sampling_logp_difference/mean": 0.11546041816473007, + "step": 1177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2922461926937103, + "epoch": 1.8878205128205128, + "grad_norm": 0.007165617309510708, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.28839120268821716, + "epoch": 1.8894230769230769, + "grad_norm": 0.0291578508913517, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 1179 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2906486839056015, + "epoch": 1.891025641025641, + "grad_norm": 0.0069350446574389935, + "learning_rate": 1e-06, + "loss": -0.0131, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13589.0, + "completions/max_terminated_length": 13589.0, + "completions/mean_length": 7907.95703125, + "completions/mean_terminated_length": 7907.95703125, + "completions/min_length": 3527.0, + "completions/min_terminated_length": 3527.0, + "entropy": 0.30357837677001953, + "epoch": 1.8926282051282053, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005162107292562723, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 1488209347.0, + "reward": 0.8948730826377869, + "reward_std": 0.0205078125, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.99853515625, + "rewards/symbolic_reward_partial_score/std": 0.022975130006670952, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0685532093048096, + "sampling/importance_sampling_ratio/min": 0.0008111695642583072, + "sampling/sampling_logp_difference/max": 7.1170334815979, + "sampling/sampling_logp_difference/mean": 0.11978597193956375, + "step": 1181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.3027549088001251, + "epoch": 1.8942307692307692, + "grad_norm": 0.004358244594186544, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1182 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.3017651438713074, + "epoch": 1.8958333333333335, + "grad_norm": 0.004373589064925909, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1183 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.30212725698947906, + "epoch": 1.8974358974358974, + "grad_norm": 0.004225477576255798, + "learning_rate": 1e-06, + "loss": -0.0188, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13942.0, + "completions/max_terminated_length": 13942.0, + "completions/mean_length": 7953.171875, + "completions/mean_terminated_length": 7953.171875, + "completions/min_length": 2041.0, + "completions/min_terminated_length": 2041.0, + "entropy": 0.30399172008037567, + "epoch": 1.8990384615384617, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.003516353666782379, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 1493164651.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804687462747097, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0690739154815674, + "sampling/importance_sampling_ratio/min": 0.004113218747079372, + "sampling/sampling_logp_difference/max": 5.493549346923828, + "sampling/sampling_logp_difference/mean": 0.12033951282501221, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30357593297958374, + "epoch": 1.9006410256410255, + "grad_norm": 0.0033840613905340433, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 1186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.3051948994398117, + "epoch": 1.9022435897435899, + "grad_norm": 0.0033332568127661943, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 1187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.30435769259929657, + "epoch": 1.9038461538461537, + "grad_norm": 0.003013043198734522, + "learning_rate": 1e-06, + "loss": -0.0096, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14426.0, + "completions/max_terminated_length": 14426.0, + "completions/mean_length": 8251.572265625, + "completions/mean_terminated_length": 8251.572265625, + "completions/min_length": 3841.0, + "completions/min_terminated_length": 3841.0, + "entropy": 0.2953474074602127, + "epoch": 1.905448717948718, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0061998371966183186, + "learning_rate": 1e-06, + "loss": -0.0125, + "num_tokens": 1498310864.0, + "reward": 0.8951172232627869, + "reward_std": 0.008734640665352345, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, + "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0669368505477905, + "sampling/importance_sampling_ratio/min": 0.003785366890951991, + "sampling/sampling_logp_difference/max": 5.57661247253418, + "sampling/sampling_logp_difference/mean": 0.11687429249286652, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.294635072350502, + "epoch": 1.907051282051282, + "grad_norm": 0.005789327435195446, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.29555289447307587, + "epoch": 1.9086538461538463, + "grad_norm": 0.012939794920384884, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.29336051642894745, + "epoch": 1.9102564102564101, + "grad_norm": 0.013760295696556568, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15622.0, + "completions/max_terminated_length": 15622.0, + "completions/mean_length": 8120.173828125, + "completions/mean_terminated_length": 8120.173828125, + "completions/min_length": 3226.0, + "completions/min_terminated_length": 3226.0, + "entropy": 0.2943793833255768, + "epoch": 1.9118589743589745, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.03283732756972313, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 1503395177.0, + "reward": 0.8958008289337158, + "reward_std": 0.01679687574505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, + "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0672670602798462, + "sampling/importance_sampling_ratio/min": 0.002507069380953908, + "sampling/sampling_logp_difference/max": 5.988640785217285, + "sampling/sampling_logp_difference/mean": 0.11740481853485107, + "step": 1193 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2965329438447952, + "epoch": 1.9134615384615383, + "grad_norm": 0.0039680697955191135, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2973887622356415, + "epoch": 1.9150641025641026, + "grad_norm": 0.0037179056089371443, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 1195 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2941938638687134, + "epoch": 1.9166666666666665, + "grad_norm": 0.0037544008810073137, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14247.0, + "completions/mean_length": 7902.625, + "completions/mean_terminated_length": 7886.02734375, + "completions/min_length": 3220.0, + "completions/min_terminated_length": 3220.0, + "entropy": 0.3004665970802307, + "epoch": 1.9182692307692308, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.00423093605786562, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 1508268089.0, + "reward": 0.8950684070587158, + "reward_std": 0.01596381887793541, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9959310293197632, + "rewards/symbolic_reward_partial_score/std": 0.06253714859485626, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0682998895645142, + "sampling/importance_sampling_ratio/min": 0.0023962194100022316, + "sampling/sampling_logp_difference/max": 6.033863067626953, + "sampling/sampling_logp_difference/mean": 0.11888313293457031, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2986510396003723, + "epoch": 1.9198717948717947, + "grad_norm": 0.014576694928109646, + "learning_rate": 1e-06, + "loss": 0.0284, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.3021872192621231, + "epoch": 1.921474358974359, + "grad_norm": 0.004513179417699575, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2986437827348709, + "epoch": 1.9230769230769231, + "grad_norm": 0.004633089527487755, + "learning_rate": 1e-06, + "loss": -0.0111, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15059.0, + "completions/max_terminated_length": 15059.0, + "completions/mean_length": 8173.046875, + "completions/mean_terminated_length": 8173.046875, + "completions/min_length": 3649.0, + "completions/min_terminated_length": 3649.0, + "entropy": 0.29754653573036194, + "epoch": 1.9246794871794872, + "frac_reward_zero_std": 0.96875, + "grad_norm": 0.0024362760595977306, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 1513281009.0, + "reward": 0.8987793326377869, + "reward_std": 0.0048828125, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.998046875, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, + "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0677549839019775, + "sampling/importance_sampling_ratio/min": 0.004164539277553558, + "sampling/sampling_logp_difference/max": 5.481149673461914, + "sampling/sampling_logp_difference/mean": 0.11856520175933838, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.30047936737537384, + "epoch": 1.9262820512820513, + "grad_norm": 0.002458572620525956, + "learning_rate": 1e-06, + "loss": -0.006, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2958284020423889, + "epoch": 1.9278846153846154, + "grad_norm": 0.0025684263091534376, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.30148808658123016, + "epoch": 1.9294871794871795, + "grad_norm": 0.0025260718539357185, + "learning_rate": 1e-06, + "loss": 0.0233, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14282.0, + "completions/max_terminated_length": 14282.0, + "completions/mean_length": 8364.19140625, + "completions/mean_terminated_length": 8364.19140625, + "completions/min_length": 3836.0, + "completions/min_terminated_length": 3836.0, + "entropy": 0.2892305999994278, + "epoch": 1.9310897435897436, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.019499700516462326, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 1518456547.0, + "reward": 0.8886035680770874, + "reward_std": 0.028310654684901237, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.982421875, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.9971679449081421, + "rewards/symbolic_reward_partial_score/std": 0.027863482013344765, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065371036529541, + "sampling/importance_sampling_ratio/min": 0.0013542938977479935, + "sampling/sampling_logp_difference/max": 6.604475021362305, + "sampling/sampling_logp_difference/mean": 0.11470746994018555, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.29128527641296387, + "epoch": 1.9326923076923077, + "grad_norm": 0.01882844790816307, + "learning_rate": 1e-06, + "loss": 0.0367, + "step": 1206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2861212342977524, + "epoch": 1.9342948717948718, + "grad_norm": 0.022711176425218582, + "learning_rate": 1e-06, + "loss": -0.0139, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.281126469373703, + "epoch": 1.935897435897436, + "grad_norm": 0.008646919392049313, + "learning_rate": 1e-06, + "loss": -0.0146, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 8800.673828125, + "completions/mean_terminated_length": 8785.833984375, + "completions/min_length": 3895.0, + "completions/min_terminated_length": 3895.0, + "entropy": 0.2832132428884506, + "epoch": 1.9375, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.005954642314463854, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 1523900092.0, + "reward": 0.8942675590515137, + "reward_std": 0.01977486163377762, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9971679449081421, + "rewards/symbolic_reward_partial_score/std": 0.04577268287539482, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0643864870071411, + "sampling/importance_sampling_ratio/min": 0.0024164393544197083, + "sampling/sampling_logp_difference/max": 6.025460243225098, + "sampling/sampling_logp_difference/mean": 0.11290614306926727, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.27907827496528625, + "epoch": 1.939102564102564, + "grad_norm": 0.03196370601654053, + "learning_rate": 1e-06, + "loss": 0.0323, + "step": 1210 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2788865268230438, + "epoch": 1.9407051282051282, + "grad_norm": 0.004880078136920929, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28488609194755554, + "epoch": 1.9423076923076923, + "grad_norm": 0.018956732004880905, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14333.0, + "completions/max_terminated_length": 14333.0, + "completions/mean_length": 8613.55859375, + "completions/mean_terminated_length": 8613.55859375, + "completions/min_length": 3934.0, + "completions/min_terminated_length": 3934.0, + "entropy": 0.28035739064216614, + "epoch": 1.9439102564102564, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.0046940273605287075, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 1529260330.0, + "reward": 0.8960742950439453, + "reward_std": 0.015703124925494194, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.9986327886581421, + "rewards/symbolic_reward_partial_score/std": 0.022945649921894073, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0646021366119385, + "sampling/importance_sampling_ratio/min": 0.0013317377306520939, + "sampling/sampling_logp_difference/max": 6.621270656585693, + "sampling/sampling_logp_difference/mean": 0.11306291818618774, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2839740216732025, + "epoch": 1.9455128205128205, + "grad_norm": 0.0036105273757129908, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2793775945901871, + "epoch": 1.9471153846153846, + "grad_norm": 0.025335954502224922, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.2792014926671982, + "epoch": 1.9487179487179487, + "grad_norm": 0.003647672478109598, + "learning_rate": 1e-06, + "loss": -0.0137, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15800.0, + "completions/max_terminated_length": 15800.0, + "completions/mean_length": 8183.126953125, + "completions/mean_terminated_length": 8183.126953125, + "completions/min_length": 3387.0, + "completions/min_terminated_length": 3387.0, + "entropy": 0.2911984920501709, + "epoch": 1.9503205128205128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 1534237771.0, + "reward": 0.9000000357627869, + "reward_std": 0.0, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 1.0, + "rewards/symbolic_reward_accuracy/std": 0.0, + "rewards/symbolic_reward_partial_score/mean": 1.0, + "rewards/symbolic_reward_partial_score/std": 0.0, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0660669803619385, + "sampling/importance_sampling_ratio/min": 0.00016737951955292374, + "sampling/sampling_logp_difference/max": 8.695246696472168, + "sampling/sampling_logp_difference/mean": 0.11629128456115723, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2953929454088211, + "epoch": 1.9519230769230769, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2897518128156662, + "epoch": 1.953525641025641, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.28839822113513947, + "epoch": 1.9551282051282053, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13799.0, + "completions/max_terminated_length": 13799.0, + "completions/mean_length": 8616.69921875, + "completions/mean_terminated_length": 8616.69921875, + "completions/min_length": 4475.0, + "completions/min_terminated_length": 4475.0, + "entropy": 0.2832952290773392, + "epoch": 1.9567307692307692, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0034733996726572514, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 1539519377.0, + "reward": 0.8975489139556885, + "reward_std": 0.009804688394069672, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, + "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.064860224723816, + "sampling/importance_sampling_ratio/min": 0.0020989603362977505, + "sampling/sampling_logp_difference/max": 6.166313171386719, + "sampling/sampling_logp_difference/mean": 0.1138753816485405, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28177329897880554, + "epoch": 1.9583333333333335, + "grad_norm": 0.003140608314424753, + "learning_rate": 1e-06, + "loss": -0.0095, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2827134430408478, + "epoch": 1.9599358974358974, + "grad_norm": 0.003823231440037489, + "learning_rate": 1e-06, + "loss": -0.0103, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2860657125711441, + "epoch": 1.9615384615384617, + "grad_norm": 0.003579829353839159, + "learning_rate": 1e-06, + "loss": -0.0095, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14100.0, + "completions/max_terminated_length": 14100.0, + "completions/mean_length": 8672.375, + "completions/mean_terminated_length": 8672.375, + "completions/min_length": 4399.0, + "completions/min_terminated_length": 4399.0, + "entropy": 0.2804241478443146, + "epoch": 1.9631410256410255, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0035035342443734407, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 1544860417.0, + "reward": 0.8970215320587158, + "reward_std": 0.01191406324505806, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.9978841543197632, + "rewards/symbolic_reward_partial_score/std": 0.04434017464518547, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0639734268188477, + "sampling/importance_sampling_ratio/min": 9.44249695749022e-05, + "sampling/sampling_logp_difference/max": 9.267704963684082, + "sampling/sampling_logp_difference/mean": 0.11255315691232681, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2808882147073746, + "epoch": 1.9647435897435899, + "grad_norm": 0.0034705237485468388, + "learning_rate": 1e-06, + "loss": -0.0088, + "step": 1226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2766978591680527, + "epoch": 1.9663461538461537, + "grad_norm": 0.027924956753849983, + "learning_rate": 1e-06, + "loss": 0.0309, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.2793623208999634, + "epoch": 1.967948717948718, + "grad_norm": 0.0028724130243062973, + "learning_rate": 1e-06, + "loss": -0.0095, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16123.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 8708.259765625, + "completions/mean_terminated_length": 8708.259765625, + "completions/min_length": 4282.0, + "completions/min_terminated_length": 4282.0, + "entropy": 0.2766948789358139, + "epoch": 1.969551282051282, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.005033229012042284, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 1550202678.0, + "reward": 0.8943262100219727, + "reward_std": 0.02269531413912773, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.997363269329071, + "rewards/symbolic_reward_partial_score/std": 0.045138679444789886, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0631859302520752, + "sampling/importance_sampling_ratio/min": 0.003191123716533184, + "sampling/sampling_logp_difference/max": 5.747382164001465, + "sampling/sampling_logp_difference/mean": 0.11148595809936523, + "step": 1229 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.27636095881462097, + "epoch": 1.9711538461538463, + "grad_norm": 0.004540975205600262, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.2746479660272598, + "epoch": 1.9727564102564101, + "grad_norm": 0.004312835168093443, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1231 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0390625, + "entropy": 0.2745707780122757, + "epoch": 1.9743589743589745, + "grad_norm": 0.004494936671108007, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15025.0, + "completions/mean_length": 7987.767578125, + "completions/mean_terminated_length": 7971.33642578125, + "completions/min_length": 3112.0, + "completions/min_terminated_length": 3112.0, + "entropy": 0.28349705040454865, + "epoch": 1.9759615384615383, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.00874620396643877, + "learning_rate": 1e-06, + "loss": -0.0211, + "num_tokens": 1555131423.0, + "reward": 0.8898340463638306, + "reward_std": 0.01714194566011429, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.986328125, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.9941080808639526, + "rewards/symbolic_reward_partial_score/std": 0.06613869965076447, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0643033981323242, + "sampling/importance_sampling_ratio/min": 0.0017722464399412274, + "sampling/sampling_logp_difference/max": 6.335507392883301, + "sampling/sampling_logp_difference/mean": 0.11349356174468994, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "entropy": 0.2810690402984619, + "epoch": 1.9775641025641026, + "grad_norm": 0.008442613296210766, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.28290465474128723, + "epoch": 1.9791666666666665, + "grad_norm": 0.002465788973495364, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 1235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.28290964663028717, + "epoch": 1.9807692307692308, + "grad_norm": 0.026566775515675545, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13867.0, + "completions/max_terminated_length": 13867.0, + "completions/mean_length": 8091.439453125, + "completions/mean_terminated_length": 8091.439453125, + "completions/min_length": 3847.0, + "completions/min_terminated_length": 3847.0, + "entropy": 0.2784264087677002, + "epoch": 1.9823717948717947, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.004467155318707228, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 1560154784.0, + "reward": 0.895751953125, + "reward_std": 0.01699218899011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.994140625, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.99755859375, + "rewards/symbolic_reward_partial_score/std": 0.044932443648576736, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0629961490631104, + "sampling/importance_sampling_ratio/min": 1.6737449186621234e-05, + "sampling/sampling_logp_difference/max": 10.997861862182617, + "sampling/sampling_logp_difference/mean": 0.11135035753250122, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2753297984600067, + "epoch": 1.983974358974359, + "grad_norm": 0.0024451538920402527, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1238 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.27506595849990845, + "epoch": 1.9855769230769231, + "grad_norm": 0.003434085752815008, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "entropy": 0.27488943934440613, + "epoch": 1.9871794871794872, + "grad_norm": 0.0032492363825440407, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14333.0, + "completions/max_terminated_length": 14333.0, + "completions/mean_length": 8140.427734375, + "completions/mean_terminated_length": 8140.427734375, + "completions/min_length": 3333.0, + "completions/min_terminated_length": 3333.0, + "entropy": 0.28159183263778687, + "epoch": 1.9887820512820513, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.00511184660717845, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 1565177275.0, + "reward": 0.8945459127426147, + "reward_std": 0.021816406399011612, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9921875, + "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, + "rewards/symbolic_reward_partial_score/mean": 0.9974446296691895, + "rewards/symbolic_reward_partial_score/std": 0.044879186898469925, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0642945766448975, + "sampling/importance_sampling_ratio/min": 0.0013277892721816897, + "sampling/sampling_logp_difference/max": 6.624239921569824, + "sampling/sampling_logp_difference/mean": 0.11332094669342041, + "step": 1241 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.28187714517116547, + "epoch": 1.9903846153846154, + "grad_norm": 0.004039302468299866, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 1242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.2771865576505661, + "epoch": 1.9919871794871795, + "grad_norm": 0.003922486677765846, + "learning_rate": 1e-06, + "loss": -0.0177, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.28154416382312775, + "epoch": 1.9935897435897436, + "grad_norm": 0.004652918316423893, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14042.0, + "completions/max_terminated_length": 14042.0, + "completions/mean_length": 8255.6796875, + "completions/mean_terminated_length": 8255.6796875, + "completions/min_length": 3448.0, + "completions/min_terminated_length": 3448.0, + "entropy": 0.27853433787822723, + "epoch": 1.9951923076923077, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.030363189056515694, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 1570280631.0, + "reward": 0.8975096940994263, + "reward_std": 0.009961274452507496, + "rewards/progression_diversity/mean": -8.409509973716922e-06, + "rewards/progression_diversity/std": 0.0001902854855870828, + "rewards/symbolic_reward_accuracy/mean": 0.99609375, + "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, + "rewards/symbolic_reward_partial_score/mean": 0.99951171875, + "rewards/symbolic_reward_partial_score/std": 0.008228649385273457, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.063688039779663, + "sampling/importance_sampling_ratio/min": 0.0021940346341580153, + "sampling/sampling_logp_difference/max": 6.122013092041016, + "sampling/sampling_logp_difference/mean": 0.11205926537513733, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.27532532811164856, + "epoch": 1.9967948717948718, + "grad_norm": 0.004188434686511755, + "learning_rate": 1e-06, + "loss": -0.0088, + "step": 1246 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03125, + "entropy": 0.2833784818649292, + "epoch": 1.998397435897436, + "grad_norm": 0.029332676902413368, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0078125, + "entropy": 0.27936534583568573, + "epoch": 2.0, + "grad_norm": 0.0042526316829025745, + "learning_rate": 1e-06, + "loss": -0.0109, + "step": 1248 + }, + { + "epoch": 2.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000244140625, + "eval_completions/max_length": 13595.1875, + "eval_completions/max_terminated_length": 13568.59375, + "eval_completions/mean_length": 8149.333740234375, + "eval_completions/mean_terminated_length": 8147.492645263672, + "eval_completions/min_length": 4439.53125, + "eval_completions/min_terminated_length": 4439.53125, + "eval_entropy": 0.27844913955777884, + "eval_frac_reward_zero_std": 0.8984375, + "eval_loss": 0.0007266444154083729, + "eval_num_tokens": 1570280631.0, + "eval_reward": 0.8951080553233624, + "eval_reward_std": 0.01795433840015903, + "eval_rewards/progression_diversity/mean": -1.194192577713693e-06, + "eval_rewards/progression_diversity/std": 1.3510747521650046e-05, + "eval_rewards/symbolic_reward_accuracy/mean": 0.992919921875, + "eval_rewards/symbolic_reward_accuracy/std": 0.057395454961806536, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9978536050766706, + "eval_rewards/symbolic_reward_partial_score/std": 0.018804883409757167, + "eval_rewards/tag_count_reward/mean": 0.0, + "eval_rewards/tag_count_reward/std": 0.0, + "eval_runtime": 8401.4648, + "eval_samples_per_second": 0.03, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0635985918343067, + "eval_sampling/importance_sampling_ratio/min": 0.003701142359204823, + "eval_sampling/sampling_logp_difference/max": 5.673463404178619, + "eval_sampling/sampling_logp_difference/mean": 0.11233945097774267, + "eval_steps_per_second": 0.0, + "step": 1248 + }, + { + "epoch": 2.0, + "step": 1248, "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 2.7042, - "train_samples_per_second": 1848.955, - "train_steps_per_second": 230.75 + "train_loss": 0.0006463412022966599, + "train_runtime": 115411.6286, + "train_samples_per_second": 0.087, + "train_steps_per_second": 0.011 } ], "logging_steps": 1, - "max_steps": 624, - "num_input_tokens_seen": 828826298, - "num_train_epochs": 1, + "max_steps": 1248, + "num_input_tokens_seen": 1570280631, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -11925,7 +23819,7 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} }