diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8055493398970687, + "eval_steps": 500, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 354.1875, + "completions/mean_terminated_length": 354.1875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.00044752741105392703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05193624645471573, + "learning_rate": 0.0, + "loss": -0.0477, + "num_tokens": 136396.0, + "reward": 2.5913524627685547, + "reward_std": 2.2160112857818604, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.46635231375694275, + "rewards/ngram_similarity_reward/std": 0.33532705903053284, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 553.640625, + "completions/mean_terminated_length": 553.640625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.0008950548221078541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038587283343076706, + "learning_rate": 2.2321428571428572e-08, + "loss": -0.0007, + "num_tokens": 294677.0, + "reward": 1.472002387046814, + "reward_std": 1.188935399055481, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.4876273274421692, + "rewards/ngram_similarity_reward/std": 0.3976302742958069, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 555.84375, + "completions/mean_terminated_length": 555.84375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.0013425822331617813, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03477500006556511, + "learning_rate": 4.4642857142857145e-08, + "loss": -0.0399, + "num_tokens": 432059.0, + "reward": 3.8383781909942627, + "reward_std": 2.0193111896514893, + "rewards/accuracy_reward/mean": 3.453125, + "rewards/accuracy_reward/std": 3.0728185176849365, + "rewards/ngram_similarity_reward/mean": 0.38525325059890747, + "rewards/ngram_similarity_reward/std": 0.3316202759742737, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 501.40625, + "completions/mean_terminated_length": 501.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.0017901096442157081, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04869001358747482, + "learning_rate": 6.696428571428571e-08, + "loss": -0.0012, + "num_tokens": 574293.0, + "reward": 0.7406258583068848, + "reward_std": 2.232048511505127, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 2.312781572341919, + "rewards/ngram_similarity_reward/mean": 0.25625085830688477, + "rewards/ngram_similarity_reward/std": 0.1979217827320099, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 415.828125, + "completions/mean_terminated_length": 415.828125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.0022376370552696354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048768848180770874, + "learning_rate": 8.928571428571429e-08, + "loss": -0.0033, + "num_tokens": 763018.0, + "reward": 1.6235284805297852, + "reward_std": 0.8000533580780029, + "rewards/accuracy_reward/mean": 1.25, + "rewards/accuracy_reward/std": 2.7888667583465576, + "rewards/ngram_similarity_reward/mean": 0.3735284209251404, + "rewards/ngram_similarity_reward/std": 0.3147267699241638, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 417.78125, + "completions/mean_terminated_length": 417.78125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.0026851644663235625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0469495914876461, + "learning_rate": 1.1160714285714287e-07, + "loss": 0.0593, + "num_tokens": 914380.0, + "reward": 4.5329790115356445, + "reward_std": 0.535412609577179, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6267289519309998, + "rewards/ngram_similarity_reward/std": 0.3272629976272583, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 458.0, + "completions/mean_terminated_length": 458.0, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.003132691877377489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04102824255824089, + "learning_rate": 1.3392857142857142e-07, + "loss": -0.0378, + "num_tokens": 1121724.0, + "reward": 1.5913887023925781, + "reward_std": 1.4958816766738892, + "rewards/accuracy_reward/mean": 1.203125, + "rewards/accuracy_reward/std": 3.0533857345581055, + "rewards/ngram_similarity_reward/mean": 0.3882637619972229, + "rewards/ngram_similarity_reward/std": 0.3548280894756317, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 566.21875, + "completions/mean_terminated_length": 566.21875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.0035802192884314163, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035449933260679245, + "learning_rate": 1.5625e-07, + "loss": -0.0124, + "num_tokens": 1278298.0, + "reward": 6.122851371765137, + "reward_std": 0.2047199010848999, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6228512525558472, + "rewards/ngram_similarity_reward/std": 0.23033379018306732, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 504.25, + "completions/mean_terminated_length": 504.25, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.004027746699485343, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04072337597608566, + "learning_rate": 1.7857142857142858e-07, + "loss": 0.026, + "num_tokens": 1476954.0, + "reward": 5.461928844451904, + "reward_std": 1.5869636535644531, + "rewards/accuracy_reward/mean": 4.90625, + "rewards/accuracy_reward/std": 1.8663159608840942, + "rewards/ngram_similarity_reward/mean": 0.5556788444519043, + "rewards/ngram_similarity_reward/std": 0.33379217982292175, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 373.90625, + "completions/mean_terminated_length": 373.90625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.004475274110539271, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05392281338572502, + "learning_rate": 2.0089285714285717e-07, + "loss": -0.0074, + "num_tokens": 1613524.0, + "reward": 1.703417181968689, + "reward_std": 0.7789101600646973, + "rewards/accuracy_reward/mean": 1.359375, + "rewards/accuracy_reward/std": 2.816432476043701, + "rewards/ngram_similarity_reward/mean": 0.3440423011779785, + "rewards/ngram_similarity_reward/std": 0.31959813833236694, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 379.625, + "completions/mean_terminated_length": 379.625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.0049228015215931975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050036825239658356, + "learning_rate": 2.2321428571428574e-07, + "loss": 0.0064, + "num_tokens": 1747084.0, + "reward": 3.507603168487549, + "reward_std": 1.5738906860351562, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6326034069061279, + "rewards/ngram_similarity_reward/std": 0.43446189165115356, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 574.3125, + "completions/mean_terminated_length": 574.3125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.005370328932647125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0350542776286602, + "learning_rate": 2.455357142857143e-07, + "loss": 0.0035, + "num_tokens": 1903600.0, + "reward": 2.881260871887207, + "reward_std": 0.5595024228096008, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.47501087188720703, + "rewards/ngram_similarity_reward/std": 0.3289196491241455, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 590.140625, + "completions/mean_terminated_length": 590.140625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.005817856343701052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040751758962869644, + "learning_rate": 2.6785714285714284e-07, + "loss": 0.009, + "num_tokens": 2084681.0, + "reward": 2.4814798831939697, + "reward_std": 0.8326557874679565, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.45022982358932495, + "rewards/ngram_similarity_reward/std": 0.3732970952987671, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 497.515625, + "completions/mean_terminated_length": 497.515625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.006265383754754978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04194331914186478, + "learning_rate": 2.901785714285715e-07, + "loss": 0.0094, + "num_tokens": 2263066.0, + "reward": 5.9365949630737305, + "reward_std": 0.4958219528198242, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.53034508228302, + "rewards/ngram_similarity_reward/std": 0.1774548590183258, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 525.234375, + "completions/mean_terminated_length": 525.234375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.006712911165808906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04093608260154724, + "learning_rate": 3.125e-07, + "loss": -0.0241, + "num_tokens": 2399881.0, + "reward": 4.266244888305664, + "reward_std": 0.7034568786621094, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5474951267242432, + "rewards/ngram_similarity_reward/std": 0.23368707299232483, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 459.015625, + "completions/mean_terminated_length": 459.015625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.0071604385768628325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05097994580864906, + "learning_rate": 3.3482142857142856e-07, + "loss": 0.0236, + "num_tokens": 2542938.0, + "reward": 4.596446990966797, + "reward_std": 0.706952691078186, + "rewards/accuracy_reward/mean": 4.109375, + "rewards/accuracy_reward/std": 2.354443311691284, + "rewards/ngram_similarity_reward/mean": 0.4870717227458954, + "rewards/ngram_similarity_reward/std": 0.3474036753177643, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 448.453125, + "completions/mean_terminated_length": 448.453125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.00760796598791676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04329349100589752, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.0215, + "num_tokens": 2690647.0, + "reward": 3.7567265033721924, + "reward_std": 1.6267927885055542, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.3348517119884491, + "rewards/ngram_similarity_reward/std": 0.31598660349845886, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 542.546875, + "completions/mean_terminated_length": 542.546875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.008055493398970687, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03838830068707466, + "learning_rate": 3.794642857142857e-07, + "loss": 0.0247, + "num_tokens": 2852970.0, + "reward": 2.5155956745147705, + "reward_std": 1.9023000001907349, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.1354587078094482, + "rewards/ngram_similarity_reward/mean": 0.31247082352638245, + "rewards/ngram_similarity_reward/std": 0.21052129566669464, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 558.28125, + "completions/mean_terminated_length": 558.28125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.008503020810024613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040122196078300476, + "learning_rate": 4.0178571428571434e-07, + "loss": -0.0191, + "num_tokens": 2975660.0, + "reward": 5.218594074249268, + "reward_std": 0.8617122173309326, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.3748440444469452, + "rewards/ngram_similarity_reward/std": 0.21162128448486328, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 431.421875, + "completions/mean_terminated_length": 431.421875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.008950548221078542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05076023191213608, + "learning_rate": 4.2410714285714293e-07, + "loss": 0.0282, + "num_tokens": 3114839.0, + "reward": 4.0693511962890625, + "reward_std": 1.58225417137146, + "rewards/accuracy_reward/mean": 3.3125, + "rewards/accuracy_reward/std": 2.948634386062622, + "rewards/ngram_similarity_reward/mean": 0.756851077079773, + "rewards/ngram_similarity_reward/std": 0.36218729615211487, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 430.359375, + "completions/mean_terminated_length": 430.359375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.009398075632132468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04346788302063942, + "learning_rate": 4.4642857142857147e-07, + "loss": -0.0295, + "num_tokens": 3248126.0, + "reward": 3.656329393386841, + "reward_std": 1.5899829864501953, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.5157045125961304, + "rewards/ngram_similarity_reward/std": 0.39607036113739014, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 426.421875, + "completions/mean_terminated_length": 426.421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.009845603043186395, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04499637708067894, + "learning_rate": 4.6875000000000006e-07, + "loss": -0.0182, + "num_tokens": 3437609.0, + "reward": 2.1935207843780518, + "reward_std": 2.1468324661254883, + "rewards/accuracy_reward/mean": 1.875, + "rewards/accuracy_reward/std": 3.0315799713134766, + "rewards/ngram_similarity_reward/mean": 0.3185208737850189, + "rewards/ngram_similarity_reward/std": 0.2438775599002838, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 482.25, + "completions/mean_terminated_length": 482.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.010293130454240322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0404851995408535, + "learning_rate": 4.910714285714286e-07, + "loss": 0.0027, + "num_tokens": 3609705.0, + "reward": 5.2805070877075195, + "reward_std": 1.4755187034606934, + "rewards/accuracy_reward/mean": 4.734375, + "rewards/accuracy_reward/std": 2.04506516456604, + "rewards/ngram_similarity_reward/mean": 0.5461318492889404, + "rewards/ngram_similarity_reward/std": 0.40060216188430786, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 496.84375, + "completions/mean_terminated_length": 496.84375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.01074065786529425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05109725892543793, + "learning_rate": 5.133928571428571e-07, + "loss": 0.0107, + "num_tokens": 3773583.0, + "reward": 2.40609073638916, + "reward_std": 0.0649171993136406, + "rewards/accuracy_reward/mean": 2.0, + "rewards/accuracy_reward/std": 3.5276684761047363, + "rewards/ngram_similarity_reward/mean": 0.4060908854007721, + "rewards/ngram_similarity_reward/std": 0.24911099672317505, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 437.015625, + "completions/mean_terminated_length": 437.015625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.011188185276348177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049206919968128204, + "learning_rate": 5.357142857142857e-07, + "loss": 0.0513, + "num_tokens": 3998208.0, + "reward": 3.4450340270996094, + "reward_std": 2.016695261001587, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.47628408670425415, + "rewards/ngram_similarity_reward/std": 0.4277351498603821, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 540.015625, + "completions/mean_terminated_length": 540.015625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.011635712687402103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0415828563272953, + "learning_rate": 5.580357142857143e-07, + "loss": 0.022, + "num_tokens": 4127169.0, + "reward": 2.0314695835113525, + "reward_std": 2.002382755279541, + "rewards/accuracy_reward/mean": 1.625, + "rewards/accuracy_reward/std": 2.9304099082946777, + "rewards/ngram_similarity_reward/mean": 0.40646952390670776, + "rewards/ngram_similarity_reward/std": 0.2828986942768097, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 374.234375, + "completions/mean_terminated_length": 374.234375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.01208324009845603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04801315814256668, + "learning_rate": 5.80357142857143e-07, + "loss": 0.0078, + "num_tokens": 4280480.0, + "reward": 1.0853207111358643, + "reward_std": 0.8151639699935913, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 2.465988874435425, + "rewards/ngram_similarity_reward/mean": 0.41344574093818665, + "rewards/ngram_similarity_reward/std": 0.3735671937465668, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 401.21875, + "completions/mean_terminated_length": 401.21875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.012530767509509957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05504123494029045, + "learning_rate": 6.026785714285715e-07, + "loss": 0.0103, + "num_tokens": 4488494.0, + "reward": 4.440656661987305, + "reward_std": 2.232184410095215, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6281571388244629, + "rewards/ngram_similarity_reward/std": 0.4001638889312744, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 451.03125, + "completions/mean_terminated_length": 451.03125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.012978294920563885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0563817173242569, + "learning_rate": 6.25e-07, + "loss": 0.0162, + "num_tokens": 4694608.0, + "reward": 2.5744268894195557, + "reward_std": 1.4449257850646973, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.0860671997070312, + "rewards/ngram_similarity_reward/mean": 0.3244269788265228, + "rewards/ngram_similarity_reward/std": 0.31681835651397705, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 494.5625, + "completions/mean_terminated_length": 494.5625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.013425822331617812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04479817673563957, + "learning_rate": 6.473214285714287e-07, + "loss": -0.0052, + "num_tokens": 4860052.0, + "reward": 3.1864285469055176, + "reward_std": 0.5244491100311279, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5926785469055176, + "rewards/ngram_similarity_reward/std": 0.21132132411003113, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 464.265625, + "completions/mean_terminated_length": 464.265625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.013873349742671738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04182331636548042, + "learning_rate": 6.696428571428571e-07, + "loss": 0.029, + "num_tokens": 5026789.0, + "reward": 2.2157487869262695, + "reward_std": 1.5576858520507812, + "rewards/accuracy_reward/mean": 1.453125, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.7626237869262695, + "rewards/ngram_similarity_reward/std": 0.3413919508457184, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 603.671875, + "completions/mean_terminated_length": 603.671875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.014320877153725665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03902869299054146, + "learning_rate": 6.919642857142858e-07, + "loss": 0.0338, + "num_tokens": 5196720.0, + "reward": 5.464887619018555, + "reward_std": 1.6251399517059326, + "rewards/accuracy_reward/mean": 4.984375, + "rewards/accuracy_reward/std": 1.790558934211731, + "rewards/ngram_similarity_reward/mean": 0.4805128276348114, + "rewards/ngram_similarity_reward/std": 0.18633846938610077, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 440.859375, + "completions/mean_terminated_length": 440.859375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.014768404564779593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04514501616358757, + "learning_rate": 7.142857142857143e-07, + "loss": 0.0242, + "num_tokens": 5358903.0, + "reward": 2.4213662147521973, + "reward_std": 1.0139738321304321, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.0272817611694336, + "rewards/ngram_similarity_reward/mean": 0.21824108064174652, + "rewards/ngram_similarity_reward/std": 0.17951112985610962, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 479.546875, + "completions/mean_terminated_length": 479.546875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.01521593197583352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03740592673420906, + "learning_rate": 7.36607142857143e-07, + "loss": 0.0389, + "num_tokens": 5491690.0, + "reward": 5.158116340637207, + "reward_std": 1.3710438013076782, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.4081161618232727, + "rewards/ngram_similarity_reward/std": 0.32177120447158813, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 602.734375, + "completions/mean_terminated_length": 602.734375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.015663459386887447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03585360199213028, + "learning_rate": 7.589285714285714e-07, + "loss": 0.0072, + "num_tokens": 5624489.0, + "reward": 5.171844482421875, + "reward_std": 1.2208667993545532, + "rewards/accuracy_reward/mean": 4.640625, + "rewards/accuracy_reward/std": 2.1445181369781494, + "rewards/ngram_similarity_reward/mean": 0.5312194228172302, + "rewards/ngram_similarity_reward/std": 0.28985437750816345, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 383.734375, + "completions/mean_terminated_length": 383.734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.016110986797941373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054027266800403595, + "learning_rate": 7.8125e-07, + "loss": 0.002, + "num_tokens": 5789336.0, + "reward": 3.743284225463867, + "reward_std": 1.6936142444610596, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.5245344042778015, + "rewards/ngram_similarity_reward/std": 0.34867286682128906, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 456.9375, + "completions/mean_terminated_length": 456.9375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.0165585142089953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04130946472287178, + "learning_rate": 8.035714285714287e-07, + "loss": -0.0255, + "num_tokens": 5944020.0, + "reward": 3.0221548080444336, + "reward_std": 0.6041401624679565, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.6471550464630127, + "rewards/ngram_similarity_reward/std": 0.3582594692707062, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 543.265625, + "completions/mean_terminated_length": 543.265625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.017006041620049227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039212699979543686, + "learning_rate": 8.258928571428572e-07, + "loss": -0.0365, + "num_tokens": 6109381.0, + "reward": 3.2279555797576904, + "reward_std": 1.5816094875335693, + "rewards/accuracy_reward/mean": 2.828125, + "rewards/accuracy_reward/std": 3.167567253112793, + "rewards/ngram_similarity_reward/mean": 0.3998306691646576, + "rewards/ngram_similarity_reward/std": 0.2724950611591339, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1093.0, + "completions/max_terminated_length": 1093.0, + "completions/mean_length": 486.671875, + "completions/mean_terminated_length": 486.671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.017453569031103153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04461957886815071, + "learning_rate": 8.482142857142859e-07, + "loss": -0.0291, + "num_tokens": 6312912.0, + "reward": 2.9794769287109375, + "reward_std": 1.6418548822402954, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.2353148460388184, + "rewards/ngram_similarity_reward/mean": 0.3857269585132599, + "rewards/ngram_similarity_reward/std": 0.2621348798274994, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 600.125, + "completions/mean_terminated_length": 600.125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.017901096442157084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03619004786014557, + "learning_rate": 8.705357142857143e-07, + "loss": 0.0238, + "num_tokens": 6480600.0, + "reward": 3.216808557510376, + "reward_std": 1.4437744617462158, + "rewards/accuracy_reward/mean": 2.734375, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.4824334979057312, + "rewards/ngram_similarity_reward/std": 0.3833947777748108, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 459.5, + "completions/mean_terminated_length": 459.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.01834862385321101, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04745332896709442, + "learning_rate": 8.928571428571429e-07, + "loss": 0.0291, + "num_tokens": 6705192.0, + "reward": 3.351127862930298, + "reward_std": 0.8508948087692261, + "rewards/accuracy_reward/mean": 2.859375, + "rewards/accuracy_reward/std": 3.0203921794891357, + "rewards/ngram_similarity_reward/mean": 0.4917528033256531, + "rewards/ngram_similarity_reward/std": 0.4018055200576782, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 487.71875, + "completions/mean_terminated_length": 487.71875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.018796151264264937, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03918340429663658, + "learning_rate": 9.151785714285715e-07, + "loss": -0.0052, + "num_tokens": 6857878.0, + "reward": 5.294514179229736, + "reward_std": 1.5053304433822632, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.3570142984390259, + "rewards/ngram_similarity_reward/std": 0.3143937885761261, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 504.78125, + "completions/mean_terminated_length": 504.78125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.019243678675318864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04142072796821594, + "learning_rate": 9.375000000000001e-07, + "loss": -0.0023, + "num_tokens": 7040488.0, + "reward": 2.6512510776519775, + "reward_std": 2.6500251293182373, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.5418761968612671, + "rewards/ngram_similarity_reward/std": 0.3100045323371887, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 548.046875, + "completions/mean_terminated_length": 548.046875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.01969120608637279, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038975950330495834, + "learning_rate": 9.598214285714287e-07, + "loss": -0.0069, + "num_tokens": 7197131.0, + "reward": 5.58192253112793, + "reward_std": 1.3043636083602905, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.6444226503372192, + "rewards/ngram_similarity_reward/std": 0.36534932255744934, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 483.625, + "completions/mean_terminated_length": 483.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.020138733497426717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04290686175227165, + "learning_rate": 9.821428571428572e-07, + "loss": 0.0074, + "num_tokens": 7329955.0, + "reward": 0.9148364067077637, + "reward_std": 2.1159703731536865, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 2.607795238494873, + "rewards/ngram_similarity_reward/mean": 0.25858643651008606, + "rewards/ngram_similarity_reward/std": 0.15243861079216003, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 470.265625, + "completions/mean_terminated_length": 470.265625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.020586260908480643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043191708624362946, + "learning_rate": 1.0044642857142857e-06, + "loss": 0.0283, + "num_tokens": 7490068.0, + "reward": 3.445145606994629, + "reward_std": 2.0403270721435547, + "rewards/accuracy_reward/mean": 3.03125, + "rewards/accuracy_reward/std": 3.0130341053009033, + "rewards/ngram_similarity_reward/mean": 0.4138953983783722, + "rewards/ngram_similarity_reward/std": 0.31189674139022827, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 516.578125, + "completions/mean_terminated_length": 516.578125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.02103378831953457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03917146101593971, + "learning_rate": 1.0267857142857143e-06, + "loss": 0.0058, + "num_tokens": 7677833.0, + "reward": 3.6610493659973145, + "reward_std": 0.9401412606239319, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6922991275787354, + "rewards/ngram_similarity_reward/std": 0.3831275999546051, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 425.296875, + "completions/mean_terminated_length": 425.296875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.0214813157305885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05108068510890007, + "learning_rate": 1.049107142857143e-06, + "loss": -0.0085, + "num_tokens": 7783212.0, + "reward": 3.6683921813964844, + "reward_std": 0.9822139739990234, + "rewards/accuracy_reward/mean": 3.234375, + "rewards/accuracy_reward/std": 2.950610399246216, + "rewards/ngram_similarity_reward/mean": 0.43401747941970825, + "rewards/ngram_similarity_reward/std": 0.2134656310081482, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 446.359375, + "completions/mean_terminated_length": 446.359375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.021928843141642427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04882439225912094, + "learning_rate": 1.0714285714285714e-06, + "loss": -0.0035, + "num_tokens": 7927283.0, + "reward": 0.4699714779853821, + "reward_std": 1.4633207321166992, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 1.9338643550872803, + "rewards/ngram_similarity_reward/mean": 0.39184650778770447, + "rewards/ngram_similarity_reward/std": 0.2500672936439514, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 523.703125, + "completions/mean_terminated_length": 523.703125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.022376370552696354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050660137087106705, + "learning_rate": 1.0937500000000001e-06, + "loss": -0.0242, + "num_tokens": 8075872.0, + "reward": 4.185028076171875, + "reward_std": 1.3079955577850342, + "rewards/accuracy_reward/mean": 3.875, + "rewards/accuracy_reward/std": 2.7284510135650635, + "rewards/ngram_similarity_reward/mean": 0.31002795696258545, + "rewards/ngram_similarity_reward/std": 0.29505354166030884, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 457.828125, + "completions/mean_terminated_length": 457.828125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.02282389796375028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04532083496451378, + "learning_rate": 1.1160714285714287e-06, + "loss": 0.0207, + "num_tokens": 8224117.0, + "reward": 3.2468814849853516, + "reward_std": 1.64667809009552, + "rewards/accuracy_reward/mean": 2.859375, + "rewards/accuracy_reward/std": 3.0203921794891357, + "rewards/ngram_similarity_reward/mean": 0.38750651478767395, + "rewards/ngram_similarity_reward/std": 0.2459414303302765, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 538.1875, + "completions/mean_terminated_length": 538.1875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.023271425374804207, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038736362010240555, + "learning_rate": 1.1383928571428572e-06, + "loss": -0.0134, + "num_tokens": 8356273.0, + "reward": 3.7373809814453125, + "reward_std": 0.941716194152832, + "rewards/accuracy_reward/mean": 3.3125, + "rewards/accuracy_reward/std": 2.948634386062622, + "rewards/ngram_similarity_reward/mean": 0.4248809814453125, + "rewards/ngram_similarity_reward/std": 0.2687967121601105, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 492.953125, + "completions/mean_terminated_length": 492.953125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.023718952785858134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03777806833386421, + "learning_rate": 1.160714285714286e-06, + "loss": -0.0303, + "num_tokens": 8515358.0, + "reward": 4.260929584503174, + "reward_std": 1.2759349346160889, + "rewards/accuracy_reward/mean": 3.65625, + "rewards/accuracy_reward/std": 2.868652582168579, + "rewards/ngram_similarity_reward/mean": 0.6046797037124634, + "rewards/ngram_similarity_reward/std": 0.4571002721786499, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 542.90625, + "completions/mean_terminated_length": 542.90625, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.02416648019691206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037424709647893906, + "learning_rate": 1.1830357142857143e-06, + "loss": 0.0158, + "num_tokens": 8662824.0, + "reward": 3.1465628147125244, + "reward_std": 1.4686698913574219, + "rewards/accuracy_reward/mean": 2.734375, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.41218769550323486, + "rewards/ngram_similarity_reward/std": 0.20500494539737701, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 468.71875, + "completions/mean_terminated_length": 468.71875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.024614007607965987, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041173093020915985, + "learning_rate": 1.205357142857143e-06, + "loss": 0.0014, + "num_tokens": 8831638.0, + "reward": 4.659518241882324, + "reward_std": 0.2648826539516449, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6595180034637451, + "rewards/ngram_similarity_reward/std": 0.39609453082084656, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 557.984375, + "completions/mean_terminated_length": 557.984375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.025061535019019913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04032100737094879, + "learning_rate": 1.2276785714285716e-06, + "loss": 0.0007, + "num_tokens": 8978853.0, + "reward": 3.653855323791504, + "reward_std": 1.3852059841156006, + "rewards/accuracy_reward/mean": 3.203125, + "rewards/accuracy_reward/std": 2.995656728744507, + "rewards/ngram_similarity_reward/mean": 0.4507303237915039, + "rewards/ngram_similarity_reward/std": 0.30365705490112305, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 447.5625, + "completions/mean_terminated_length": 447.5625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.025509062430073844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04351978376507759, + "learning_rate": 1.25e-06, + "loss": 0.0547, + "num_tokens": 9109401.0, + "reward": 2.6265063285827637, + "reward_std": 1.2239631414413452, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.5952565670013428, + "rewards/ngram_similarity_reward/std": 0.3527531027793884, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 413.140625, + "completions/mean_terminated_length": 413.140625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.02595658984112777, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03853273391723633, + "learning_rate": 1.2723214285714286e-06, + "loss": -0.0464, + "num_tokens": 9255714.0, + "reward": 2.893002510070801, + "reward_std": 0.7541555166244507, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7680025100708008, + "rewards/ngram_similarity_reward/std": 0.3234589397907257, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 604.34375, + "completions/mean_terminated_length": 604.34375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.026404117252181697, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03918275237083435, + "learning_rate": 1.2946428571428574e-06, + "loss": 0.0445, + "num_tokens": 9396520.0, + "reward": 2.26601505279541, + "reward_std": 1.9519855976104736, + "rewards/accuracy_reward/mean": 1.890625, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.37539005279541016, + "rewards/ngram_similarity_reward/std": 0.1937084197998047, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 444.015625, + "completions/mean_terminated_length": 444.015625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.026851644663235624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044681839644908905, + "learning_rate": 1.316964285714286e-06, + "loss": -0.0, + "num_tokens": 9521689.0, + "reward": 5.670622825622559, + "reward_std": 0.9016618132591248, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.6393731236457825, + "rewards/ngram_similarity_reward/std": 0.3744213879108429, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 503.84375, + "completions/mean_terminated_length": 503.84375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.02729917207428955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04541458934545517, + "learning_rate": 1.3392857142857143e-06, + "loss": 0.0333, + "num_tokens": 9663695.0, + "reward": 4.416263580322266, + "reward_std": 0.6310166120529175, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.5256385803222656, + "rewards/ngram_similarity_reward/std": 0.3763166069984436, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 490.609375, + "completions/mean_terminated_length": 490.609375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.027746699485343477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04283396899700165, + "learning_rate": 1.3616071428571428e-06, + "loss": 0.0117, + "num_tokens": 9825190.0, + "reward": 2.2028346061706543, + "reward_std": 1.3296781778335571, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.2809595465660095, + "rewards/ngram_similarity_reward/std": 0.18962761759757996, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1120.0, + "completions/max_terminated_length": 1120.0, + "completions/mean_length": 557.828125, + "completions/mean_terminated_length": 557.828125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.028194226896397404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03895300626754761, + "learning_rate": 1.3839285714285715e-06, + "loss": 0.0294, + "num_tokens": 9960459.0, + "reward": 4.8852128982543945, + "reward_std": 2.1436607837677, + "rewards/accuracy_reward/mean": 4.53125, + "rewards/accuracy_reward/std": 2.27455735206604, + "rewards/ngram_similarity_reward/mean": 0.3539627194404602, + "rewards/ngram_similarity_reward/std": 0.22679011523723602, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 471.390625, + "completions/mean_terminated_length": 471.390625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.02864175430745133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04387656971812248, + "learning_rate": 1.40625e-06, + "loss": 0.0381, + "num_tokens": 10086228.0, + "reward": 3.403301477432251, + "reward_std": 1.543312430381775, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.340801477432251, + "rewards/ngram_similarity_reward/std": 0.1824437826871872, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 477.84375, + "completions/mean_terminated_length": 477.84375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.029089281718505257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04593445733189583, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.0418, + "num_tokens": 10272906.0, + "reward": 2.3133389949798584, + "reward_std": 2.1918067932128906, + "rewards/accuracy_reward/mean": 1.984375, + "rewards/accuracy_reward/std": 3.03415584564209, + "rewards/ngram_similarity_reward/mean": 0.328963965177536, + "rewards/ngram_similarity_reward/std": 0.2558513283729553, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 445.28125, + "completions/mean_terminated_length": 445.28125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.029536809129559187, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061927419155836105, + "learning_rate": 1.4508928571428574e-06, + "loss": -0.0232, + "num_tokens": 10470828.0, + "reward": 1.4907304048538208, + "reward_std": 0.48672711849212646, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.39698031544685364, + "rewards/ngram_similarity_reward/std": 0.23842564225196838, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 485.640625, + "completions/mean_terminated_length": 485.640625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.029984336540613114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037593647837638855, + "learning_rate": 1.473214285714286e-06, + "loss": 0.0226, + "num_tokens": 10637333.0, + "reward": 1.6215100288391113, + "reward_std": 0.4261550009250641, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5277600288391113, + "rewards/ngram_similarity_reward/std": 0.392575740814209, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 558.578125, + "completions/mean_terminated_length": 558.578125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.03043186395166704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038532938808202744, + "learning_rate": 1.4955357142857145e-06, + "loss": 0.0055, + "num_tokens": 10768250.0, + "reward": 4.58320426940918, + "reward_std": 0.2183823436498642, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5832041501998901, + "rewards/ngram_similarity_reward/std": 0.2772037982940674, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 508.21875, + "completions/mean_terminated_length": 508.21875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.030879391362720967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041135646402835846, + "learning_rate": 1.5178571428571428e-06, + "loss": 0.0032, + "num_tokens": 10935176.0, + "reward": 2.627410888671875, + "reward_std": 2.070981979370117, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.5180359482765198, + "rewards/ngram_similarity_reward/std": 0.2647475302219391, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 444.890625, + "completions/mean_terminated_length": 444.890625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.031326918773774894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05290055274963379, + "learning_rate": 1.5401785714285715e-06, + "loss": -0.0024, + "num_tokens": 11132913.0, + "reward": 3.530862331390381, + "reward_std": 1.4581063985824585, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.31211215257644653, + "rewards/ngram_similarity_reward/std": 0.2300308793783188, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 474.890625, + "completions/mean_terminated_length": 474.890625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.031774446184828824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046519357711076736, + "learning_rate": 1.5625e-06, + "loss": -0.015, + "num_tokens": 11288058.0, + "reward": 2.81695556640625, + "reward_std": 0.12096526473760605, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.3482056260108948, + "rewards/ngram_similarity_reward/std": 0.271560937166214, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 497.9375, + "completions/mean_terminated_length": 497.9375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.03222197359588275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04361455515027046, + "learning_rate": 1.5848214285714286e-06, + "loss": -0.0223, + "num_tokens": 11440454.0, + "reward": 1.5024499893188477, + "reward_std": 2.089134454727173, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.7544608116149902, + "rewards/ngram_similarity_reward/mean": 0.5180749893188477, + "rewards/ngram_similarity_reward/std": 0.35097816586494446, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 509.234375, + "completions/mean_terminated_length": 509.234375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.03266950100693668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03942706063389778, + "learning_rate": 1.6071428571428574e-06, + "loss": 0.0095, + "num_tokens": 11607461.0, + "reward": 1.3038195371627808, + "reward_std": 0.1510372906923294, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.31944456696510315, + "rewards/ngram_similarity_reward/std": 0.29977452754974365, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 640.359375, + "completions/mean_terminated_length": 640.359375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.0331170284179906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04091178998351097, + "learning_rate": 1.629464285714286e-06, + "loss": 0.0386, + "num_tokens": 11837804.0, + "reward": 1.7923917770385742, + "reward_std": 0.8479336500167847, + "rewards/accuracy_reward/mean": 1.421875, + "rewards/accuracy_reward/std": 2.880171298980713, + "rewards/ngram_similarity_reward/mean": 0.37051689624786377, + "rewards/ngram_similarity_reward/std": 0.3080310523509979, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 566.265625, + "completions/mean_terminated_length": 566.265625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.03356455582904453, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03632889688014984, + "learning_rate": 1.6517857142857144e-06, + "loss": 0.0286, + "num_tokens": 12010413.0, + "reward": 2.0851569175720215, + "reward_std": 1.3150734901428223, + "rewards/accuracy_reward/mean": 1.640625, + "rewards/accuracy_reward/std": 3.2409443855285645, + "rewards/ngram_similarity_reward/mean": 0.44453203678131104, + "rewards/ngram_similarity_reward/std": 0.2968069314956665, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 485.34375, + "completions/mean_terminated_length": 485.34375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.034012083240098454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04526903107762337, + "learning_rate": 1.6741071428571428e-06, + "loss": 0.0471, + "num_tokens": 12245683.0, + "reward": -0.17060258984565735, + "reward_std": 0.41853880882263184, + "rewards/accuracy_reward/mean": -0.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.23564741015434265, + "rewards/ngram_similarity_reward/std": 0.21910040080547333, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 430.859375, + "completions/mean_terminated_length": 430.859375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.034459610651152384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042782995849847794, + "learning_rate": 1.6964285714285717e-06, + "loss": -0.0128, + "num_tokens": 12423866.0, + "reward": 1.6177589893341064, + "reward_std": 0.5983878374099731, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.43025898933410645, + "rewards/ngram_similarity_reward/std": 0.3256424367427826, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 480.953125, + "completions/mean_terminated_length": 480.953125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.03490713806220631, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038841940462589264, + "learning_rate": 1.71875e-06, + "loss": 0.0114, + "num_tokens": 12587143.0, + "reward": 4.244690895080566, + "reward_std": 0.8090140223503113, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.5415658950805664, + "rewards/ngram_similarity_reward/std": 0.3842836916446686, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 544.9375, + "completions/mean_terminated_length": 544.9375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.03535466547326024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03895031660795212, + "learning_rate": 1.7410714285714286e-06, + "loss": -0.0014, + "num_tokens": 12752179.0, + "reward": 4.22926664352417, + "reward_std": 1.0198653936386108, + "rewards/accuracy_reward/mean": 3.75, + "rewards/accuracy_reward/std": 2.8284270763397217, + "rewards/ngram_similarity_reward/mean": 0.47926658391952515, + "rewards/ngram_similarity_reward/std": 0.40398722887039185, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 524.953125, + "completions/mean_terminated_length": 524.953125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.03580219288431417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04183723405003548, + "learning_rate": 1.7633928571428574e-06, + "loss": 0.002, + "num_tokens": 12924576.0, + "reward": 4.149851322174072, + "reward_std": 0.7294169068336487, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.36860156059265137, + "rewards/ngram_similarity_reward/std": 0.30380597710609436, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 436.1875, + "completions/mean_terminated_length": 436.1875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.03624972029536809, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048419512808322906, + "learning_rate": 1.7857142857142859e-06, + "loss": -0.004, + "num_tokens": 13157100.0, + "reward": 1.6001965999603271, + "reward_std": 0.6172314286231995, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.4126965403556824, + "rewards/ngram_similarity_reward/std": 0.3676692247390747, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 552.828125, + "completions/mean_terminated_length": 552.828125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.03669724770642202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04128267988562584, + "learning_rate": 1.8080357142857144e-06, + "loss": -0.0078, + "num_tokens": 13299041.0, + "reward": 2.3026301860809326, + "reward_std": 1.387294888496399, + "rewards/accuracy_reward/mean": 1.890625, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.4120052754878998, + "rewards/ngram_similarity_reward/std": 0.34123870730400085, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 553.34375, + "completions/mean_terminated_length": 553.34375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.037144775117475944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03644965589046478, + "learning_rate": 1.830357142857143e-06, + "loss": 0.0274, + "num_tokens": 13436519.0, + "reward": 5.4892730712890625, + "reward_std": 0.8663454055786133, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.4580225944519043, + "rewards/ngram_similarity_reward/std": 0.2673526406288147, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 460.859375, + "completions/mean_terminated_length": 460.859375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.037592302528529874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04325474426150322, + "learning_rate": 1.8526785714285717e-06, + "loss": 0.0131, + "num_tokens": 13568558.0, + "reward": 4.265529632568359, + "reward_std": 0.8148656487464905, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6405298709869385, + "rewards/ngram_similarity_reward/std": 0.2982690930366516, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 544.1875, + "completions/mean_terminated_length": 544.1875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.0380398299395838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039485905319452286, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0067, + "num_tokens": 13701690.0, + "reward": 2.838294744491577, + "reward_std": 0.8348063230514526, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.33829477429389954, + "rewards/ngram_similarity_reward/std": 0.1729307323694229, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 467.390625, + "completions/mean_terminated_length": 442.3016052246094, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.03848735735063773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051283568143844604, + "learning_rate": 1.8973214285714286e-06, + "loss": -0.0116, + "num_tokens": 13856371.0, + "reward": 2.866548776626587, + "reward_std": 0.18996483087539673, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.3665488660335541, + "rewards/ngram_similarity_reward/std": 0.2918168306350708, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 512.171875, + "completions/mean_terminated_length": 512.171875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.03893488476169165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04058018699288368, + "learning_rate": 1.9196428571428573e-06, + "loss": -0.0175, + "num_tokens": 14019662.0, + "reward": 4.449865341186523, + "reward_std": 0.2415446937084198, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4498656094074249, + "rewards/ngram_similarity_reward/std": 0.2835099697113037, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 494.015625, + "completions/mean_terminated_length": 494.015625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.03938241217274558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04097066447138786, + "learning_rate": 1.941964285714286e-06, + "loss": -0.0135, + "num_tokens": 14143855.0, + "reward": 0.696371853351593, + "reward_std": 0.8611838221549988, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.352621853351593, + "rewards/ngram_similarity_reward/std": 0.2985036075115204, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 477.4375, + "completions/mean_terminated_length": 477.4375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.03982993958379951, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04545062407851219, + "learning_rate": 1.9642857142857144e-06, + "loss": -0.0054, + "num_tokens": 14315243.0, + "reward": 0.8684969544410706, + "reward_std": 1.7307615280151367, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 2.566380739212036, + "rewards/ngram_similarity_reward/mean": 0.14974701404571533, + "rewards/ngram_similarity_reward/std": 0.08504395186901093, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 413.359375, + "completions/mean_terminated_length": 413.359375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.040277466994853434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046176016330718994, + "learning_rate": 1.9866071428571427e-06, + "loss": -0.0133, + "num_tokens": 14486578.0, + "reward": 1.0136975049972534, + "reward_std": 1.921843409538269, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.201197549700737, + "rewards/ngram_similarity_reward/std": 0.16361328959465027, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 475.71875, + "completions/mean_terminated_length": 450.7619323730469, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.040724994405907364, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04870830848813057, + "learning_rate": 2.0089285714285715e-06, + "loss": 0.0291, + "num_tokens": 14603360.0, + "reward": 1.7914204597473145, + "reward_std": 0.9845656156539917, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.9945387840270996, + "rewards/ngram_similarity_reward/mean": 0.510170578956604, + "rewards/ngram_similarity_reward/std": 0.4287711977958679, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 446.5625, + "completions/mean_terminated_length": 446.5625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.04117252181696129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05217056721448898, + "learning_rate": 2.0312500000000002e-06, + "loss": -0.0099, + "num_tokens": 14773364.0, + "reward": 4.463947772979736, + "reward_std": 0.512155294418335, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.3701978325843811, + "rewards/ngram_similarity_reward/std": 0.3230494260787964, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 407.0, + "completions/mean_terminated_length": 407.0, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.04162004922801522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055510032922029495, + "learning_rate": 2.0535714285714286e-06, + "loss": -0.0025, + "num_tokens": 14983252.0, + "reward": 2.310368061065674, + "reward_std": 1.5189405679702759, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.27911806106567383, + "rewards/ngram_similarity_reward/std": 0.20924191176891327, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 491.546875, + "completions/mean_terminated_length": 491.546875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.04206757663906914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04173356667160988, + "learning_rate": 2.0758928571428573e-06, + "loss": 0.0024, + "num_tokens": 15147351.0, + "reward": 3.0661611557006836, + "reward_std": 0.45635727047920227, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.47241121530532837, + "rewards/ngram_similarity_reward/std": 0.2998490035533905, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 463.4375, + "completions/mean_terminated_length": 463.4375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.04251510405012307, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04815267398953438, + "learning_rate": 2.098214285714286e-06, + "loss": 0.0653, + "num_tokens": 15272083.0, + "reward": 3.49208664894104, + "reward_std": 2.500577688217163, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.3514615297317505, + "rewards/ngram_similarity_reward/std": 0.3659766614437103, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 379.0625, + "completions/mean_terminated_length": 379.0625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.042962631461177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04753585159778595, + "learning_rate": 2.1205357142857144e-06, + "loss": 0.0354, + "num_tokens": 15390535.0, + "reward": 6.03642463684082, + "reward_std": 0.17955493927001953, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5364243984222412, + "rewards/ngram_similarity_reward/std": 0.34952878952026367, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 636.890625, + "completions/mean_terminated_length": 636.890625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.043410158872230924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03281034156680107, + "learning_rate": 2.1428571428571427e-06, + "loss": -0.0114, + "num_tokens": 15560800.0, + "reward": 3.7070133686065674, + "reward_std": 1.9805774688720703, + "rewards/accuracy_reward/mean": 3.328125, + "rewards/accuracy_reward/std": 2.9252848625183105, + "rewards/ngram_similarity_reward/mean": 0.37888818979263306, + "rewards/ngram_similarity_reward/std": 0.2659132778644562, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 419.578125, + "completions/mean_terminated_length": 419.578125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.043857686283284854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04952317103743553, + "learning_rate": 2.1651785714285715e-06, + "loss": 0.0254, + "num_tokens": 15709669.0, + "reward": 4.6569719314575195, + "reward_std": 1.7644875049591064, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.2819717824459076, + "rewards/ngram_similarity_reward/std": 0.20795418322086334, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 591.234375, + "completions/mean_terminated_length": 591.234375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.04430521369433878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03588514402508736, + "learning_rate": 2.1875000000000002e-06, + "loss": 0.01, + "num_tokens": 15931524.0, + "reward": 3.773355484008789, + "reward_std": 1.285116195678711, + "rewards/accuracy_reward/mean": 3.375, + "rewards/accuracy_reward/std": 3.0783421993255615, + "rewards/ngram_similarity_reward/mean": 0.3983556032180786, + "rewards/ngram_similarity_reward/std": 0.2645743191242218, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 493.515625, + "completions/mean_terminated_length": 493.515625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.04475274110539271, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04176740348339081, + "learning_rate": 2.2098214285714286e-06, + "loss": -0.0062, + "num_tokens": 16140549.0, + "reward": 4.266956329345703, + "reward_std": 0.763481855392456, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5482061505317688, + "rewards/ngram_similarity_reward/std": 0.4332646429538727, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 518.03125, + "completions/mean_terminated_length": 518.03125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.04520026851644663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04275078326463699, + "learning_rate": 2.2321428571428573e-06, + "loss": 0.0128, + "num_tokens": 16286791.0, + "reward": 4.03216552734375, + "reward_std": 0.9512901306152344, + "rewards/accuracy_reward/mean": 3.40625, + "rewards/accuracy_reward/std": 2.920745372772217, + "rewards/ngram_similarity_reward/mean": 0.62591552734375, + "rewards/ngram_similarity_reward/std": 0.2111629694700241, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 483.484375, + "completions/mean_terminated_length": 483.484375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.04564779592750056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03929956257343292, + "learning_rate": 2.254464285714286e-06, + "loss": -0.0066, + "num_tokens": 16411382.0, + "reward": 3.690983533859253, + "reward_std": 0.8513113260269165, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5347336530685425, + "rewards/ngram_similarity_reward/std": 0.31054314970970154, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 387.15625, + "completions/mean_terminated_length": 387.15625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.046095323338554484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048255544155836105, + "learning_rate": 2.2767857142857144e-06, + "loss": 0.0101, + "num_tokens": 16553152.0, + "reward": 5.592085361480713, + "reward_std": 1.3386536836624146, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.5608352422714233, + "rewards/ngram_similarity_reward/std": 0.3552662134170532, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 431.828125, + "completions/mean_terminated_length": 431.828125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.046542850749608414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04789068549871445, + "learning_rate": 2.2991071428571427e-06, + "loss": -0.0112, + "num_tokens": 16694837.0, + "reward": 1.405816912651062, + "reward_std": 1.6296827793121338, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.31206685304641724, + "rewards/ngram_similarity_reward/std": 0.17024105787277222, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 564.84375, + "completions/mean_terminated_length": 564.84375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.046990378160662344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04270428791642189, + "learning_rate": 2.321428571428572e-06, + "loss": 0.0652, + "num_tokens": 16856475.0, + "reward": 4.657665729522705, + "reward_std": 0.23186588287353516, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.6732906103134155, + "rewards/ngram_similarity_reward/std": 0.34633952379226685, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 563.3125, + "completions/mean_terminated_length": 563.3125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.04743790557171627, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039831411093473434, + "learning_rate": 2.3437500000000002e-06, + "loss": -0.0179, + "num_tokens": 17034127.0, + "reward": 2.423933982849121, + "reward_std": 1.1356269121170044, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.31455907225608826, + "rewards/ngram_similarity_reward/std": 0.2853781580924988, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 323.40625, + "completions/mean_terminated_length": 323.40625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.0478854329827702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059055786579847336, + "learning_rate": 2.3660714285714285e-06, + "loss": -0.0032, + "num_tokens": 17159337.0, + "reward": 4.369568824768066, + "reward_std": 1.6163207292556763, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.8383191227912903, + "rewards/ngram_similarity_reward/std": 0.43380168080329895, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 609.796875, + "completions/mean_terminated_length": 609.796875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.04833296039382412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03806445747613907, + "learning_rate": 2.3883928571428573e-06, + "loss": 0.004, + "num_tokens": 17346972.0, + "reward": 0.2939775288105011, + "reward_std": 1.3833703994750977, + "rewards/accuracy_reward/mean": -0.0625, + "rewards/accuracy_reward/std": 2.006932497024536, + "rewards/ngram_similarity_reward/mean": 0.3564775288105011, + "rewards/ngram_similarity_reward/std": 0.18514876067638397, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 585.234375, + "completions/mean_terminated_length": 585.234375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.04878048780487805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03510132431983948, + "learning_rate": 2.410714285714286e-06, + "loss": 0.0063, + "num_tokens": 17471019.0, + "reward": 4.088577747344971, + "reward_std": 1.2437450885772705, + "rewards/accuracy_reward/mean": 3.578125, + "rewards/accuracy_reward/std": 2.880171298980713, + "rewards/ngram_similarity_reward/mean": 0.5104526281356812, + "rewards/ngram_similarity_reward/std": 0.3364333212375641, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 351.109375, + "completions/mean_terminated_length": 351.109375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.049228015215931974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05111013725399971, + "learning_rate": 2.4330357142857144e-06, + "loss": -0.0383, + "num_tokens": 17628610.0, + "reward": 4.171026229858398, + "reward_std": 2.411790370941162, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.46790117025375366, + "rewards/ngram_similarity_reward/std": 0.3062148094177246, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 503.578125, + "completions/mean_terminated_length": 503.578125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.049675542626985904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04645279049873352, + "learning_rate": 2.455357142857143e-06, + "loss": -0.0245, + "num_tokens": 17800039.0, + "reward": 2.558358669281006, + "reward_std": 0.6171290278434753, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.2400259971618652, + "rewards/ngram_similarity_reward/mean": 0.3552337884902954, + "rewards/ngram_similarity_reward/std": 0.31524649262428284, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 581.890625, + "completions/mean_terminated_length": 581.890625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.05012307003803983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033150479197502136, + "learning_rate": 2.477678571428572e-06, + "loss": 0.0029, + "num_tokens": 17942640.0, + "reward": 4.360316753387451, + "reward_std": 0.6434515118598938, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5478165745735168, + "rewards/ngram_similarity_reward/std": 0.34956124424934387, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 496.65625, + "completions/mean_terminated_length": 496.65625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.05057059744909376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0438249446451664, + "learning_rate": 2.5e-06, + "loss": -0.0134, + "num_tokens": 18105098.0, + "reward": 0.2818566560745239, + "reward_std": 1.0355392694473267, + "rewards/accuracy_reward/mean": -0.078125, + "rewards/accuracy_reward/std": 2.3217720985412598, + "rewards/ngram_similarity_reward/mean": 0.35998162627220154, + "rewards/ngram_similarity_reward/std": 0.3143450915813446, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 474.4375, + "completions/mean_terminated_length": 474.4375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.05101812486014769, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03859485685825348, + "learning_rate": 2.5223214285714285e-06, + "loss": 0.0201, + "num_tokens": 18257814.0, + "reward": 2.7156527042388916, + "reward_std": 0.7003411650657654, + "rewards/accuracy_reward/mean": 2.265625, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.4500276446342468, + "rewards/ngram_similarity_reward/std": 0.3689570426940918, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 458.734375, + "completions/mean_terminated_length": 458.734375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.05146565227120161, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04315647855401039, + "learning_rate": 2.5446428571428573e-06, + "loss": -0.0008, + "num_tokens": 18409813.0, + "reward": 2.2372071743011475, + "reward_std": 2.4554202556610107, + "rewards/accuracy_reward/mean": 1.734375, + "rewards/accuracy_reward/std": 2.9425299167633057, + "rewards/ngram_similarity_reward/mean": 0.5028321743011475, + "rewards/ngram_similarity_reward/std": 0.3332710266113281, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 553.0, + "completions/mean_terminated_length": 553.0, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.05191317968225554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03877819702029228, + "learning_rate": 2.5669642857142856e-06, + "loss": 0.0226, + "num_tokens": 18551605.0, + "reward": 2.798994779586792, + "reward_std": 0.607537031173706, + "rewards/accuracy_reward/mean": 2.359375, + "rewards/accuracy_reward/std": 3.075077533721924, + "rewards/ngram_similarity_reward/mean": 0.439619779586792, + "rewards/ngram_similarity_reward/std": 0.34050148725509644, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 520.25, + "completions/mean_terminated_length": 520.25, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.052360707093309464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039993252605199814, + "learning_rate": 2.5892857142857148e-06, + "loss": 0.0302, + "num_tokens": 18704229.0, + "reward": 2.8932952880859375, + "reward_std": 0.6330355405807495, + "rewards/accuracy_reward/mean": 2.4375, + "rewards/accuracy_reward/std": 3.3040380477905273, + "rewards/ngram_similarity_reward/mean": 0.4557953178882599, + "rewards/ngram_similarity_reward/std": 0.2842147946357727, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 528.09375, + "completions/mean_terminated_length": 528.09375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.052808234504363394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03796974569559097, + "learning_rate": 2.611607142857143e-06, + "loss": 0.0116, + "num_tokens": 18892827.0, + "reward": 3.090029239654541, + "reward_std": 1.1082619428634644, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5900291204452515, + "rewards/ngram_similarity_reward/std": 0.3288757801055908, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 515.0625, + "completions/mean_terminated_length": 515.0625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.05325576191541732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04664303734898567, + "learning_rate": 2.633928571428572e-06, + "loss": -0.0034, + "num_tokens": 19058383.0, + "reward": 1.6570677757263184, + "reward_std": 1.6934335231781006, + "rewards/accuracy_reward/mean": 1.21875, + "rewards/accuracy_reward/std": 2.9302406311035156, + "rewards/ngram_similarity_reward/mean": 0.43831780552864075, + "rewards/ngram_similarity_reward/std": 0.25029969215393066, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 571.40625, + "completions/mean_terminated_length": 571.40625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.05370328932647125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039782412350177765, + "learning_rate": 2.65625e-06, + "loss": 0.0124, + "num_tokens": 19215817.0, + "reward": 1.7129923105239868, + "reward_std": 1.179494023323059, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.881075382232666, + "rewards/ngram_similarity_reward/mean": 0.43174242973327637, + "rewards/ngram_similarity_reward/std": 0.22701890766620636, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 522.953125, + "completions/mean_terminated_length": 522.953125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05415081673752517, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04186190664768219, + "learning_rate": 2.6785714285714285e-06, + "loss": 0.0102, + "num_tokens": 19349942.0, + "reward": 4.323519706726074, + "reward_std": 0.27767765522003174, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.8206562995910645, + "rewards/ngram_similarity_reward/mean": 0.4328947067260742, + "rewards/ngram_similarity_reward/std": 0.38358616828918457, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 456.78125, + "completions/mean_terminated_length": 456.78125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.0545983441485791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040842872112989426, + "learning_rate": 2.7008928571428573e-06, + "loss": 0.0099, + "num_tokens": 19477352.0, + "reward": 2.548858642578125, + "reward_std": 1.7461631298065186, + "rewards/accuracy_reward/mean": 2.078125, + "rewards/accuracy_reward/std": 3.0488338470458984, + "rewards/ngram_similarity_reward/mean": 0.4707334041595459, + "rewards/ngram_similarity_reward/std": 0.36607521772384644, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 526.71875, + "completions/mean_terminated_length": 526.71875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.05504587155963303, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03907957300543785, + "learning_rate": 2.7232142857142856e-06, + "loss": 0.0137, + "num_tokens": 19622662.0, + "reward": 3.366270065307617, + "reward_std": 1.1402533054351807, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.4131450653076172, + "rewards/ngram_similarity_reward/std": 0.33299681544303894, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 539.546875, + "completions/mean_terminated_length": 539.546875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.055493398970686954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038894135504961014, + "learning_rate": 2.7455357142857148e-06, + "loss": -0.0165, + "num_tokens": 19796617.0, + "reward": 3.563744068145752, + "reward_std": 1.986689805984497, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.3137441873550415, + "rewards/ngram_similarity_reward/std": 0.23322314023971558, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 405.703125, + "completions/mean_terminated_length": 405.703125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.055940926381740884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051937028765678406, + "learning_rate": 2.767857142857143e-06, + "loss": 0.0385, + "num_tokens": 19931254.0, + "reward": 2.6729683876037598, + "reward_std": 1.4942256212234497, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.36046838760375977, + "rewards/ngram_similarity_reward/std": 0.23005659878253937, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 598.828125, + "completions/mean_terminated_length": 575.825439453125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.05638845379279481, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032587017863988876, + "learning_rate": 2.790178571428572e-06, + "loss": -0.0526, + "num_tokens": 20112907.0, + "reward": 5.738628387451172, + "reward_std": 1.0721367597579956, + "rewards/accuracy_reward/mean": 5.203125, + "rewards/accuracy_reward/std": 1.3531819581985474, + "rewards/ngram_similarity_reward/mean": 0.5355039834976196, + "rewards/ngram_similarity_reward/std": 0.25528010725975037, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 359.921875, + "completions/mean_terminated_length": 359.921875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.05683598120384874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05148518830537796, + "learning_rate": 2.8125e-06, + "loss": 0.0046, + "num_tokens": 20219878.0, + "reward": 4.365318775177002, + "reward_std": 0.6934431195259094, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5528188943862915, + "rewards/ngram_similarity_reward/std": 0.33181366324424744, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 522.25, + "completions/mean_terminated_length": 522.25, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.05728350861490266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04125802963972092, + "learning_rate": 2.834821428571429e-06, + "loss": 0.0127, + "num_tokens": 20376422.0, + "reward": 3.3187947273254395, + "reward_std": 0.7205219268798828, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.537544846534729, + "rewards/ngram_similarity_reward/std": 0.26360082626342773, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 441.53125, + "completions/mean_terminated_length": 441.53125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.05773103602595659, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04405700042843819, + "learning_rate": 2.8571428571428573e-06, + "loss": -0.0094, + "num_tokens": 20533656.0, + "reward": 4.606054782867432, + "reward_std": 0.45147010684013367, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6998050212860107, + "rewards/ngram_similarity_reward/std": 0.36088237166404724, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 519.921875, + "completions/mean_terminated_length": 519.921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.058178563437010514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05712265148758888, + "learning_rate": 2.8794642857142856e-06, + "loss": -0.0075, + "num_tokens": 20753891.0, + "reward": 2.823075532913208, + "reward_std": 0.505908727645874, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.44807544350624084, + "rewards/ngram_similarity_reward/std": 0.31414222717285156, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 547.8125, + "completions/mean_terminated_length": 547.8125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.058626090848064444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038814567029476166, + "learning_rate": 2.9017857142857148e-06, + "loss": 0.0243, + "num_tokens": 20905799.0, + "reward": 6.176115989685059, + "reward_std": 0.22915717959403992, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6761159896850586, + "rewards/ngram_similarity_reward/std": 0.29612261056900024, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 549.984375, + "completions/mean_terminated_length": 549.984375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.059073618259118374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04424475133419037, + "learning_rate": 2.924107142857143e-06, + "loss": 0.0323, + "num_tokens": 21116870.0, + "reward": 2.8654801845550537, + "reward_std": 0.6325905323028564, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.4748552441596985, + "rewards/ngram_similarity_reward/std": 0.3601526618003845, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 425.671875, + "completions/mean_terminated_length": 425.671875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.0595211456701723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048342108726501465, + "learning_rate": 2.946428571428572e-06, + "loss": -0.0011, + "num_tokens": 21273953.0, + "reward": 3.6667661666870117, + "reward_std": 1.964839220046997, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 3.0613763332366943, + "rewards/ngram_similarity_reward/mean": 0.5105161666870117, + "rewards/ngram_similarity_reward/std": 0.3453754186630249, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 495.8125, + "completions/mean_terminated_length": 495.8125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.05996867308122623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04297291487455368, + "learning_rate": 2.96875e-06, + "loss": -0.0019, + "num_tokens": 21426885.0, + "reward": 1.790950059890747, + "reward_std": 2.361938714981079, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 3.0315799713134766, + "rewards/ngram_similarity_reward/mean": 0.41595014929771423, + "rewards/ngram_similarity_reward/std": 0.2542986571788788, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 538.9375, + "completions/mean_terminated_length": 538.9375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.06041620049228015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03412027657032013, + "learning_rate": 2.991071428571429e-06, + "loss": 0.0113, + "num_tokens": 21567537.0, + "reward": 3.170560598373413, + "reward_std": 0.8621605634689331, + "rewards/accuracy_reward/mean": 2.8125, + "rewards/accuracy_reward/std": 3.080275297164917, + "rewards/ngram_similarity_reward/mean": 0.3580605685710907, + "rewards/ngram_similarity_reward/std": 0.26210764050483704, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 499.171875, + "completions/mean_terminated_length": 499.171875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.06086372790333408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04292212426662445, + "learning_rate": 3.0133928571428572e-06, + "loss": 0.0119, + "num_tokens": 21743868.0, + "reward": 0.061600834131240845, + "reward_std": 1.426113486289978, + "rewards/accuracy_reward/mean": -0.15625, + "rewards/accuracy_reward/std": 1.8790301084518433, + "rewards/ngram_similarity_reward/mean": 0.21785084903240204, + "rewards/ngram_similarity_reward/std": 0.15603578090667725, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 532.078125, + "completions/mean_terminated_length": 532.078125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.061311255314388004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038399964570999146, + "learning_rate": 3.0357142857142856e-06, + "loss": 0.0026, + "num_tokens": 21879521.0, + "reward": 4.752626419067383, + "reward_std": 1.480465292930603, + "rewards/accuracy_reward/mean": 4.359375, + "rewards/accuracy_reward/std": 2.3962087631225586, + "rewards/ngram_similarity_reward/mean": 0.3932513892650604, + "rewards/ngram_similarity_reward/std": 0.2955648601055145, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 467.15625, + "completions/mean_terminated_length": 467.15625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.061758782725441934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04397137835621834, + "learning_rate": 3.0580357142857147e-06, + "loss": -0.0153, + "num_tokens": 22029819.0, + "reward": 3.7678921222686768, + "reward_std": 1.2001043558120728, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.42414209246635437, + "rewards/ngram_similarity_reward/std": 0.2783641219139099, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 502.25, + "completions/mean_terminated_length": 502.25, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.06220631013649586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03933661803603172, + "learning_rate": 3.080357142857143e-06, + "loss": 0.0225, + "num_tokens": 22192283.0, + "reward": 1.9660686254501343, + "reward_std": 0.9389387965202332, + "rewards/accuracy_reward/mean": 1.53125, + "rewards/accuracy_reward/std": 2.900294303894043, + "rewards/ngram_similarity_reward/mean": 0.43481865525245667, + "rewards/ngram_similarity_reward/std": 0.3456280529499054, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 413.78125, + "completions/mean_terminated_length": 413.78125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.06265383754754979, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04940592870116234, + "learning_rate": 3.102678571428572e-06, + "loss": -0.0052, + "num_tokens": 22412765.0, + "reward": 2.9114465713500977, + "reward_std": 0.5219171047210693, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5051964521408081, + "rewards/ngram_similarity_reward/std": 0.3867158889770508, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 563.984375, + "completions/mean_terminated_length": 563.984375, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.06310136495860372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03664904087781906, + "learning_rate": 3.125e-06, + "loss": 0.0047, + "num_tokens": 22587036.0, + "reward": 5.278068542480469, + "reward_std": 1.061753273010254, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.1602470874786377, + "rewards/ngram_similarity_reward/mean": 0.5280686020851135, + "rewards/ngram_similarity_reward/std": 0.248811274766922, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 398.796875, + "completions/mean_terminated_length": 398.796875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06354889236965765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04452500119805336, + "learning_rate": 3.147321428571429e-06, + "loss": 0.0195, + "num_tokens": 22730191.0, + "reward": 3.75919771194458, + "reward_std": 2.232400417327881, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.5404477715492249, + "rewards/ngram_similarity_reward/std": 0.45260751247406006, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 414.890625, + "completions/mean_terminated_length": 414.890625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.06399641978071156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047404687851667404, + "learning_rate": 3.1696428571428572e-06, + "loss": -0.0106, + "num_tokens": 22862792.0, + "reward": 1.301940679550171, + "reward_std": 0.5988505482673645, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 2.5850608348846436, + "rewards/ngram_similarity_reward/mean": 0.4269407093524933, + "rewards/ngram_similarity_reward/std": 0.3728107511997223, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 516.234375, + "completions/mean_terminated_length": 516.234375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.0644439471917655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03927481546998024, + "learning_rate": 3.1919642857142856e-06, + "loss": -0.0034, + "num_tokens": 22998087.0, + "reward": 2.7710466384887695, + "reward_std": 1.9648394584655762, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.2537753582000732, + "rewards/ngram_similarity_reward/mean": 0.28667137026786804, + "rewards/ngram_similarity_reward/std": 0.20069169998168945, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 507.140625, + "completions/mean_terminated_length": 507.140625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.06489147460281942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043464187532663345, + "learning_rate": 3.2142857142857147e-06, + "loss": -0.0193, + "num_tokens": 23129872.0, + "reward": 0.631460964679718, + "reward_std": 1.3900837898254395, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 2.122255563735962, + "rewards/ngram_similarity_reward/mean": 0.3189609944820404, + "rewards/ngram_similarity_reward/std": 0.18644043803215027, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 430.859375, + "completions/mean_terminated_length": 430.859375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.06533900201387335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04917949065566063, + "learning_rate": 3.2366071428571435e-06, + "loss": 0.0218, + "num_tokens": 23257031.0, + "reward": 2.4517362117767334, + "reward_std": 0.8715531229972839, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.34236112236976624, + "rewards/ngram_similarity_reward/std": 0.3702693283557892, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 435.46875, + "completions/mean_terminated_length": 435.46875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.06578652942492727, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045783959329128265, + "learning_rate": 3.258928571428572e-06, + "loss": 0.0036, + "num_tokens": 23395637.0, + "reward": 4.754144668579102, + "reward_std": 1.722858190536499, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.4728948473930359, + "rewards/ngram_similarity_reward/std": 0.3473533093929291, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 513.96875, + "completions/mean_terminated_length": 513.96875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.0662340568359812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047589968889951706, + "learning_rate": 3.28125e-06, + "loss": 0.006, + "num_tokens": 23523539.0, + "reward": 4.243943214416504, + "reward_std": 0.95088791847229, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.4470682442188263, + "rewards/ngram_similarity_reward/std": 0.33129891753196716, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 448.765625, + "completions/mean_terminated_length": 448.765625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.06668158424703513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048214901238679886, + "learning_rate": 3.303571428571429e-06, + "loss": -0.0314, + "num_tokens": 23715156.0, + "reward": 2.7262778282165527, + "reward_std": 2.69027042388916, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.35127800703048706, + "rewards/ngram_similarity_reward/std": 0.27484095096588135, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 459.375, + "completions/mean_terminated_length": 459.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.06712911165808906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04197873920202255, + "learning_rate": 3.3258928571428572e-06, + "loss": 0.0316, + "num_tokens": 23860924.0, + "reward": 3.065863847732544, + "reward_std": 0.19159740209579468, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.565863847732544, + "rewards/ngram_similarity_reward/std": 0.30514025688171387, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 548.921875, + "completions/mean_terminated_length": 548.921875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.06757663906914299, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032930515706539154, + "learning_rate": 3.3482142857142855e-06, + "loss": 0.0249, + "num_tokens": 24041575.0, + "reward": 1.9457918405532837, + "reward_std": 2.021454095840454, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.38329190015792847, + "rewards/ngram_similarity_reward/std": 0.2622060477733612, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 391.921875, + "completions/mean_terminated_length": 391.921875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.06802416648019691, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04094477370381355, + "learning_rate": 3.3705357142857147e-06, + "loss": 0.0036, + "num_tokens": 24213394.0, + "reward": 3.6835200786590576, + "reward_std": 1.5544017553329468, + "rewards/accuracy_reward/mean": 2.828125, + "rewards/accuracy_reward/std": 3.060525417327881, + "rewards/ngram_similarity_reward/mean": 0.8553951978683472, + "rewards/ngram_similarity_reward/std": 0.3850794732570648, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 465.234375, + "completions/mean_terminated_length": 465.234375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.06847169389125084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04359624162316322, + "learning_rate": 3.3928571428571435e-06, + "loss": -0.0134, + "num_tokens": 24358353.0, + "reward": 3.7651684284210205, + "reward_std": 2.4716694355010986, + "rewards/accuracy_reward/mean": 3.203125, + "rewards/accuracy_reward/std": 3.104935646057129, + "rewards/ngram_similarity_reward/mean": 0.5620434284210205, + "rewards/ngram_similarity_reward/std": 0.42700931429862976, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 411.640625, + "completions/mean_terminated_length": 411.640625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.06891922130230477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04364282637834549, + "learning_rate": 3.415178571428572e-06, + "loss": 0.0097, + "num_tokens": 24512362.0, + "reward": 4.9647016525268555, + "reward_std": 0.9715432524681091, + "rewards/accuracy_reward/mean": 4.640625, + "rewards/accuracy_reward/std": 2.1445181369781494, + "rewards/ngram_similarity_reward/mean": 0.3240765333175659, + "rewards/ngram_similarity_reward/std": 0.24237005412578583, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 513.984375, + "completions/mean_terminated_length": 513.984375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.0693667487133587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03732859343290329, + "learning_rate": 3.4375e-06, + "loss": 0.0164, + "num_tokens": 24673833.0, + "reward": 3.5389277935028076, + "reward_std": 0.9162474274635315, + "rewards/accuracy_reward/mean": 2.90625, + "rewards/accuracy_reward/std": 3.2791731357574463, + "rewards/ngram_similarity_reward/mean": 0.6326779127120972, + "rewards/ngram_similarity_reward/std": 0.33283960819244385, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 505.875, + "completions/mean_terminated_length": 456.1290283203125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.06981427612441261, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05703406408429146, + "learning_rate": 3.459821428571429e-06, + "loss": 0.0632, + "num_tokens": 24883121.0, + "reward": 1.2255878448486328, + "reward_std": 1.6483601331710815, + "rewards/accuracy_reward/mean": 0.890625, + "rewards/accuracy_reward/std": 3.0478575229644775, + "rewards/ngram_similarity_reward/mean": 0.3349628448486328, + "rewards/ngram_similarity_reward/std": 0.29052525758743286, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 389.375, + "completions/mean_terminated_length": 389.375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.07026180353546654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05271059647202492, + "learning_rate": 3.482142857142857e-06, + "loss": -0.015, + "num_tokens": 25023321.0, + "reward": 1.4020805358886719, + "reward_std": 0.50262451171875, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.30833062529563904, + "rewards/ngram_similarity_reward/std": 0.2317408323287964, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 423.234375, + "completions/mean_terminated_length": 423.234375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.07070933094652047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05880529060959816, + "learning_rate": 3.504464285714286e-06, + "loss": -0.0119, + "num_tokens": 25222936.0, + "reward": 3.830268383026123, + "reward_std": 2.3002877235412598, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 3.039423704147339, + "rewards/ngram_similarity_reward/mean": 0.5802686214447021, + "rewards/ngram_similarity_reward/std": 0.3722204566001892, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 518.96875, + "completions/mean_terminated_length": 518.96875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.0711568583575744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040393684059381485, + "learning_rate": 3.5267857142857147e-06, + "loss": -0.0017, + "num_tokens": 25403078.0, + "reward": 6.280098915100098, + "reward_std": 0.16385754942893982, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7800989151000977, + "rewards/ngram_similarity_reward/std": 0.33270302414894104, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 472.09375, + "completions/mean_terminated_length": 472.09375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.07160438576862833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04423805698752403, + "learning_rate": 3.5491071428571435e-06, + "loss": 0.0217, + "num_tokens": 25550540.0, + "reward": 4.868206977844238, + "reward_std": 2.067688465118408, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.4932067394256592, + "rewards/ngram_similarity_reward/std": 0.2828584611415863, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 417.328125, + "completions/mean_terminated_length": 417.328125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.07205191317968225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04653473198413849, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.0195, + "num_tokens": 25798705.0, + "reward": 4.217301845550537, + "reward_std": 1.03849458694458, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.5141770243644714, + "rewards/ngram_similarity_reward/std": 0.34160828590393066, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 444.296875, + "completions/mean_terminated_length": 444.296875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.07249944059073618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04895284026861191, + "learning_rate": 3.59375e-06, + "loss": -0.028, + "num_tokens": 25952308.0, + "reward": 3.087433338165283, + "reward_std": 0.8225682973861694, + "rewards/accuracy_reward/mean": 2.640625, + "rewards/accuracy_reward/std": 3.1816298961639404, + "rewards/ngram_similarity_reward/mean": 0.44680821895599365, + "rewards/ngram_similarity_reward/std": 0.2572721242904663, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 486.390625, + "completions/mean_terminated_length": 486.390625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.07294696800179011, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03934817016124725, + "learning_rate": 3.616071428571429e-06, + "loss": 0.0129, + "num_tokens": 26090429.0, + "reward": 3.4666459560394287, + "reward_std": 1.3379069566726685, + "rewards/accuracy_reward/mean": 3.1875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.2791461646556854, + "rewards/ngram_similarity_reward/std": 0.29793688654899597, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 469.078125, + "completions/mean_terminated_length": 469.078125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.07339449541284404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041254252195358276, + "learning_rate": 3.638392857142857e-06, + "loss": 0.0119, + "num_tokens": 26243394.0, + "reward": 2.9599480628967285, + "reward_std": 0.6078701615333557, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.1671366691589355, + "rewards/ngram_similarity_reward/mean": 0.49119803309440613, + "rewards/ngram_similarity_reward/std": 0.3554515540599823, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 441.734375, + "completions/mean_terminated_length": 441.734375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.07384202282389796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04607221856713295, + "learning_rate": 3.660714285714286e-06, + "loss": 0.0285, + "num_tokens": 26401953.0, + "reward": 5.063331604003906, + "reward_std": 1.823121428489685, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.4070812463760376, + "rewards/ngram_similarity_reward/std": 0.3174854516983032, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 438.09375, + "completions/mean_terminated_length": 438.09375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.07428955023495189, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04993997514247894, + "learning_rate": 3.6830357142857147e-06, + "loss": 0.0015, + "num_tokens": 26650343.0, + "reward": 2.793623447418213, + "reward_std": 0.7833993434906006, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.29362359642982483, + "rewards/ngram_similarity_reward/std": 0.15472392737865448, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 511.984375, + "completions/mean_terminated_length": 511.984375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.07473707764600582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04063967615365982, + "learning_rate": 3.7053571428571434e-06, + "loss": 0.0231, + "num_tokens": 26812294.0, + "reward": 3.3472909927368164, + "reward_std": 0.7561337351799011, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5660411715507507, + "rewards/ngram_similarity_reward/std": 0.21464978158473969, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 569.171875, + "completions/mean_terminated_length": 569.171875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.07518460505705975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04079766198992729, + "learning_rate": 3.7276785714285718e-06, + "loss": 0.0022, + "num_tokens": 27019057.0, + "reward": 2.664179801940918, + "reward_std": 2.0348520278930664, + "rewards/accuracy_reward/mean": 2.28125, + "rewards/accuracy_reward/std": 3.0522892475128174, + "rewards/ngram_similarity_reward/mean": 0.38292986154556274, + "rewards/ngram_similarity_reward/std": 0.3396172821521759, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 523.265625, + "completions/mean_terminated_length": 523.265625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.07563213246811368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051377370953559875, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0057, + "num_tokens": 27158818.0, + "reward": 2.863034963607788, + "reward_std": 0.5506872534751892, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.4724099636077881, + "rewards/ngram_similarity_reward/std": 0.32486027479171753, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 408.765625, + "completions/mean_terminated_length": 408.765625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.0760796598791676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047743599861860275, + "learning_rate": 3.772321428571429e-06, + "loss": 0.0161, + "num_tokens": 27296803.0, + "reward": 4.988167762756348, + "reward_std": 2.043489456176758, + "rewards/accuracy_reward/mean": 4.359375, + "rewards/accuracy_reward/std": 2.3962087631225586, + "rewards/ngram_similarity_reward/mean": 0.6287930607795715, + "rewards/ngram_similarity_reward/std": 0.36715027689933777, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 661.5, + "completions/mean_terminated_length": 661.5, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.07652718729022152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03597584366798401, + "learning_rate": 3.794642857142857e-06, + "loss": 0.0039, + "num_tokens": 27466067.0, + "reward": 2.084123373031616, + "reward_std": 2.295442819595337, + "rewards/accuracy_reward/mean": 1.671875, + "rewards/accuracy_reward/std": 3.0002894401550293, + "rewards/ngram_similarity_reward/mean": 0.4122483730316162, + "rewards/ngram_similarity_reward/std": 0.2563944458961487, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 516.0, + "completions/mean_terminated_length": 516.0, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.07697471470127545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040315892547369, + "learning_rate": 3.816964285714286e-06, + "loss": -0.0344, + "num_tokens": 27624883.0, + "reward": 3.6607158184051514, + "reward_std": 1.7966370582580566, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.7075908184051514, + "rewards/ngram_similarity_reward/std": 0.3306605815887451, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 552.21875, + "completions/mean_terminated_length": 552.21875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.07742224211232938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0378226637840271, + "learning_rate": 3.839285714285715e-06, + "loss": -0.03, + "num_tokens": 27786833.0, + "reward": 4.285251140594482, + "reward_std": 0.4423610270023346, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.3790009617805481, + "rewards/ngram_similarity_reward/std": 0.2819773256778717, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 511.34375, + "completions/mean_terminated_length": 511.34375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.0778697695233833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03857111185789108, + "learning_rate": 3.8616071428571434e-06, + "loss": -0.0041, + "num_tokens": 27967703.0, + "reward": 3.4933242797851562, + "reward_std": 2.03798508644104, + "rewards/accuracy_reward/mean": 2.921875, + "rewards/accuracy_reward/std": 3.1562721729278564, + "rewards/ngram_similarity_reward/mean": 0.5714495182037354, + "rewards/ngram_similarity_reward/std": 0.2844545841217041, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 355.453125, + "completions/mean_terminated_length": 355.453125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.07831729693443723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06271856278181076, + "learning_rate": 3.883928571428572e-06, + "loss": -0.0087, + "num_tokens": 28067812.0, + "reward": 2.6634888648986816, + "reward_std": 0.49967533349990845, + "rewards/accuracy_reward/mean": 2.171875, + "rewards/accuracy_reward/std": 3.271108388900757, + "rewards/ngram_similarity_reward/mean": 0.49161386489868164, + "rewards/ngram_similarity_reward/std": 0.3125540614128113, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 601.296875, + "completions/mean_terminated_length": 601.296875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.07876482434549116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0370563268661499, + "learning_rate": 3.90625e-06, + "loss": -0.0243, + "num_tokens": 28227879.0, + "reward": 3.002413034439087, + "reward_std": 0.2257809340953827, + "rewards/accuracy_reward/mean": 2.421875, + "rewards/accuracy_reward/std": 3.113231897354126, + "rewards/ngram_similarity_reward/mean": 0.5805378556251526, + "rewards/ngram_similarity_reward/std": 0.343523770570755, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 438.6875, + "completions/mean_terminated_length": 438.6875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.07921235175654509, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04832509532570839, + "learning_rate": 3.928571428571429e-06, + "loss": 0.0243, + "num_tokens": 28378259.0, + "reward": -0.3152479827404022, + "reward_std": 0.05669836327433586, + "rewards/accuracy_reward/mean": -0.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.18475203216075897, + "rewards/ngram_similarity_reward/std": 0.07380875945091248, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 423.09375, + "completions/mean_terminated_length": 423.09375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.07965987916759902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046438515186309814, + "learning_rate": 3.950892857142858e-06, + "loss": 0.0181, + "num_tokens": 28553577.0, + "reward": 1.5509165525436401, + "reward_std": 2.002094030380249, + "rewards/accuracy_reward/mean": 1.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4259165823459625, + "rewards/ngram_similarity_reward/std": 0.3136703372001648, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 403.0625, + "completions/mean_terminated_length": 403.0625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.08010740657865294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04984445497393608, + "learning_rate": 3.9732142857142855e-06, + "loss": -0.0088, + "num_tokens": 28694173.0, + "reward": 1.3050878047943115, + "reward_std": 2.3252506256103516, + "rewards/accuracy_reward/mean": 0.953125, + "rewards/accuracy_reward/std": 2.6543147563934326, + "rewards/ngram_similarity_reward/mean": 0.35196271538734436, + "rewards/ngram_similarity_reward/std": 0.29783403873443604, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 515.40625, + "completions/mean_terminated_length": 515.40625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.08055493398970687, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04675843566656113, + "learning_rate": 3.995535714285715e-06, + "loss": 0.0441, + "num_tokens": 28901319.0, + "reward": 3.3299739360809326, + "reward_std": 1.4819881916046143, + "rewards/accuracy_reward/mean": 2.5625, + "rewards/accuracy_reward/std": 3.059593439102173, + "rewards/ngram_similarity_reward/mean": 0.7674739360809326, + "rewards/ngram_similarity_reward/std": 0.2389378547668457, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 536.984375, + "completions/mean_terminated_length": 536.984375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.0810024614007608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036061979830265045, + "learning_rate": 4.017857142857143e-06, + "loss": 0.0345, + "num_tokens": 29065846.0, + "reward": 4.682528972625732, + "reward_std": 1.3342541456222534, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.4950290620326996, + "rewards/ngram_similarity_reward/std": 0.3298114836215973, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 465.796875, + "completions/mean_terminated_length": 465.796875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.08144998881181473, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0419074110686779, + "learning_rate": 4.040178571428572e-06, + "loss": -0.0185, + "num_tokens": 29235481.0, + "reward": 4.840533256530762, + "reward_std": 1.6569175720214844, + "rewards/accuracy_reward/mean": 4.5, + "rewards/accuracy_reward/std": 2.350278615951538, + "rewards/ngram_similarity_reward/mean": 0.3405328094959259, + "rewards/ngram_similarity_reward/std": 0.31163933873176575, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 498.21875, + "completions/mean_terminated_length": 498.21875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.08189751622286866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04102945700287819, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.0178, + "num_tokens": 29347175.0, + "reward": 2.4598028659820557, + "reward_std": 0.7249634265899658, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.3348028361797333, + "rewards/ngram_similarity_reward/std": 0.3524908423423767, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 518.234375, + "completions/mean_terminated_length": 518.234375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.08234504363392257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03980467468500137, + "learning_rate": 4.084821428571429e-06, + "loss": 0.0118, + "num_tokens": 29515670.0, + "reward": 2.591090440750122, + "reward_std": 1.683458924293518, + "rewards/accuracy_reward/mean": 2.34375, + "rewards/accuracy_reward/std": 3.1983067989349365, + "rewards/ngram_similarity_reward/mean": 0.24734047055244446, + "rewards/ngram_similarity_reward/std": 0.12441064417362213, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 441.90625, + "completions/mean_terminated_length": 441.90625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.0827925710449765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046047866344451904, + "learning_rate": 4.107142857142857e-06, + "loss": -0.0203, + "num_tokens": 29651072.0, + "reward": 2.0882480144500732, + "reward_std": 1.8428391218185425, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.43199795484542847, + "rewards/ngram_similarity_reward/std": 0.2237108051776886, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 525.78125, + "completions/mean_terminated_length": 525.78125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.08324009845603043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043506283313035965, + "learning_rate": 4.129464285714286e-06, + "loss": 0.0176, + "num_tokens": 29813602.0, + "reward": 1.362823486328125, + "reward_std": 0.18018248677253723, + "rewards/accuracy_reward/mean": 0.96875, + "rewards/accuracy_reward/std": 2.6425621509552, + "rewards/ngram_similarity_reward/mean": 0.39407360553741455, + "rewards/ngram_similarity_reward/std": 0.4013862907886505, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 507.265625, + "completions/mean_terminated_length": 507.265625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.08368762586708436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03793657198548317, + "learning_rate": 4.151785714285715e-06, + "loss": -0.0095, + "num_tokens": 30014931.0, + "reward": 2.19004225730896, + "reward_std": 0.8865464925765991, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.4400422275066376, + "rewards/ngram_similarity_reward/std": 0.27053165435791016, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 503.515625, + "completions/mean_terminated_length": 503.515625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.08413515327813828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03992355987429619, + "learning_rate": 4.174107142857143e-06, + "loss": -0.01, + "num_tokens": 30162724.0, + "reward": 4.297519683837891, + "reward_std": 1.1939719915390015, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5787696838378906, + "rewards/ngram_similarity_reward/std": 0.39754706621170044, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 517.125, + "completions/mean_terminated_length": 517.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.08458268068919221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04642122983932495, + "learning_rate": 4.196428571428572e-06, + "loss": 0.0242, + "num_tokens": 30309724.0, + "reward": 1.3255119323730469, + "reward_std": 0.6579290628433228, + "rewards/accuracy_reward/mean": 0.84375, + "rewards/accuracy_reward/std": 2.607795238494873, + "rewards/ngram_similarity_reward/mean": 0.4817619323730469, + "rewards/ngram_similarity_reward/std": 0.3493627607822418, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 486.3125, + "completions/mean_terminated_length": 486.3125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.08503020810024614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042718902230262756, + "learning_rate": 4.21875e-06, + "loss": 0.012, + "num_tokens": 30467344.0, + "reward": 1.8112475872039795, + "reward_std": 0.8907710313796997, + "rewards/accuracy_reward/mean": 1.21875, + "rewards/accuracy_reward/std": 3.041870594024658, + "rewards/ngram_similarity_reward/mean": 0.5924974679946899, + "rewards/ngram_similarity_reward/std": 0.3628630042076111, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 524.953125, + "completions/mean_terminated_length": 524.953125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.08547773551130007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0406482107937336, + "learning_rate": 4.241071428571429e-06, + "loss": -0.0003, + "num_tokens": 30606797.0, + "reward": -0.15879753232002258, + "reward_std": 0.19097502529621124, + "rewards/accuracy_reward/mean": -0.53125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/ngram_similarity_reward/mean": 0.3724524974822998, + "rewards/ngram_similarity_reward/std": 0.1509428173303604, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 413.75, + "completions/mean_terminated_length": 413.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.085925262922354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04431195929646492, + "learning_rate": 4.2633928571428576e-06, + "loss": 0.0018, + "num_tokens": 30774717.0, + "reward": 3.4124279022216797, + "reward_std": 2.0548229217529297, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.44367799162864685, + "rewards/ngram_similarity_reward/std": 0.3273553252220154, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 441.890625, + "completions/mean_terminated_length": 441.890625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.08637279033340792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044728197157382965, + "learning_rate": 4.2857142857142855e-06, + "loss": -0.0185, + "num_tokens": 30937206.0, + "reward": 4.989612579345703, + "reward_std": 0.7707114815711975, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.7083626985549927, + "rewards/ngram_similarity_reward/std": 0.34404411911964417, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 453.703125, + "completions/mean_terminated_length": 453.703125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.08682031774446185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044507987797260284, + "learning_rate": 4.308035714285715e-06, + "loss": 0.0035, + "num_tokens": 31065091.0, + "reward": 2.8430004119873047, + "reward_std": 0.5047019720077515, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.2492506206035614, + "rewards/ngram_similarity_reward/std": 0.21281088888645172, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 387.84375, + "completions/mean_terminated_length": 387.84375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.08726784515551578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04783143848180771, + "learning_rate": 4.330357142857143e-06, + "loss": -0.0006, + "num_tokens": 31185113.0, + "reward": 3.92765212059021, + "reward_std": 0.851255476474762, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.4901522696018219, + "rewards/ngram_similarity_reward/std": 0.4444206655025482, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 494.5, + "completions/mean_terminated_length": 494.5, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.08771537256656971, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044673748314380646, + "learning_rate": 4.352678571428572e-06, + "loss": -0.0218, + "num_tokens": 31339833.0, + "reward": 2.257666826248169, + "reward_std": 2.731825828552246, + "rewards/accuracy_reward/mean": 1.734375, + "rewards/accuracy_reward/std": 3.264733076095581, + "rewards/ngram_similarity_reward/mean": 0.5232917666435242, + "rewards/ngram_similarity_reward/std": 0.2643654942512512, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 464.875, + "completions/mean_terminated_length": 464.875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.08816289997762362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04198407754302025, + "learning_rate": 4.3750000000000005e-06, + "loss": -0.0096, + "num_tokens": 31491393.0, + "reward": 2.045973300933838, + "reward_std": 1.1696488857269287, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.4834733009338379, + "rewards/ngram_similarity_reward/std": 0.34941643476486206, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 470.328125, + "completions/mean_terminated_length": 470.328125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.08861042738867755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04621649533510208, + "learning_rate": 4.397321428571429e-06, + "loss": 0.0005, + "num_tokens": 31621558.0, + "reward": 3.066758632659912, + "reward_std": 2.3714590072631836, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.0498504638671875, + "rewards/ngram_similarity_reward/mean": 0.3167587220668793, + "rewards/ngram_similarity_reward/std": 0.28607502579689026, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 510.5, + "completions/mean_terminated_length": 510.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.08905795479973148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036292556673288345, + "learning_rate": 4.419642857142857e-06, + "loss": -0.0053, + "num_tokens": 31739878.0, + "reward": 4.306193828582764, + "reward_std": 0.6418611407279968, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.4936937987804413, + "rewards/ngram_similarity_reward/std": 0.41456860303878784, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 559.65625, + "completions/mean_terminated_length": 559.65625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.08950548221078541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042119771242141724, + "learning_rate": 4.441964285714286e-06, + "loss": 0.0317, + "num_tokens": 31892224.0, + "reward": 5.85109806060791, + "reward_std": 1.0437448024749756, + "rewards/accuracy_reward/mean": 5.28125, + "rewards/accuracy_reward/std": 1.227576732635498, + "rewards/ngram_similarity_reward/mean": 0.569847583770752, + "rewards/ngram_similarity_reward/std": 0.36416593194007874, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1417.0, + "completions/max_terminated_length": 1417.0, + "completions/mean_length": 559.0625, + "completions/mean_terminated_length": 559.0625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.08995300962183934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042125869542360306, + "learning_rate": 4.464285714285715e-06, + "loss": 0.0102, + "num_tokens": 32027972.0, + "reward": 4.4564409255981445, + "reward_std": 1.9459412097930908, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.47206610441207886, + "rewards/ngram_similarity_reward/std": 0.2877153158187866, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 488.859375, + "completions/mean_terminated_length": 488.859375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.09040053703289326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04488145560026169, + "learning_rate": 4.486607142857143e-06, + "loss": -0.0141, + "num_tokens": 32181947.0, + "reward": 3.771817207336426, + "reward_std": 1.2006248235702515, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.9545164108276367, + "rewards/ngram_similarity_reward/mean": 0.24056729674339294, + "rewards/ngram_similarity_reward/std": 0.1451207399368286, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 529.515625, + "completions/mean_terminated_length": 529.515625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.09084806444394719, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034034159034490585, + "learning_rate": 4.508928571428572e-06, + "loss": 0.0283, + "num_tokens": 32329980.0, + "reward": 4.561273097991943, + "reward_std": 1.712855577468872, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.28002309799194336, + "rewards/ngram_similarity_reward/std": 0.19624720513820648, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 423.28125, + "completions/mean_terminated_length": 423.28125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.09129559185500112, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04006795212626457, + "learning_rate": 4.53125e-06, + "loss": -0.027, + "num_tokens": 32454158.0, + "reward": 5.31807279586792, + "reward_std": 1.2390682697296143, + "rewards/accuracy_reward/mean": 4.71875, + "rewards/accuracy_reward/std": 2.0890398025512695, + "rewards/ngram_similarity_reward/mean": 0.599323034286499, + "rewards/ngram_similarity_reward/std": 0.47235941886901855, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 513.28125, + "completions/mean_terminated_length": 513.28125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.09174311926605505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039949022233486176, + "learning_rate": 4.553571428571429e-06, + "loss": -0.0302, + "num_tokens": 32580416.0, + "reward": 5.481403827667236, + "reward_std": 1.2307987213134766, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5439037084579468, + "rewards/ngram_similarity_reward/std": 0.31527870893478394, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 464.25, + "completions/mean_terminated_length": 464.25, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.09219064667710897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05170599743723869, + "learning_rate": 4.5758928571428575e-06, + "loss": 0.0335, + "num_tokens": 32767136.0, + "reward": -0.4308336675167084, + "reward_std": 0.19196897745132446, + "rewards/accuracy_reward/mean": -0.765625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/ngram_similarity_reward/mean": 0.334791362285614, + "rewards/ngram_similarity_reward/std": 0.3124901056289673, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 457.359375, + "completions/mean_terminated_length": 457.359375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.0926381740881629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0499611496925354, + "learning_rate": 4.5982142857142854e-06, + "loss": 0.0122, + "num_tokens": 32950759.0, + "reward": 2.8177924156188965, + "reward_std": 1.109889268875122, + "rewards/accuracy_reward/mean": 2.28125, + "rewards/accuracy_reward/std": 3.0522892475128174, + "rewards/ngram_similarity_reward/mean": 0.5365424156188965, + "rewards/ngram_similarity_reward/std": 0.3115372061729431, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 446.234375, + "completions/mean_terminated_length": 446.234375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.09308570149921683, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043924376368522644, + "learning_rate": 4.620535714285715e-06, + "loss": 0.0304, + "num_tokens": 33076294.0, + "reward": 3.9390711784362793, + "reward_std": 1.7528796195983887, + "rewards/accuracy_reward/mean": 3.5, + "rewards/accuracy_reward/std": 2.8894994258880615, + "rewards/ngram_similarity_reward/mean": 0.43907126784324646, + "rewards/ngram_similarity_reward/std": 0.3475610017776489, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 469.359375, + "completions/mean_terminated_length": 469.359375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.09353322891027076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037999190390110016, + "learning_rate": 4.642857142857144e-06, + "loss": -0.0256, + "num_tokens": 33247661.0, + "reward": 3.102001905441284, + "reward_std": 1.3292903900146484, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6020020246505737, + "rewards/ngram_similarity_reward/std": 0.3597540855407715, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 459.5625, + "completions/mean_terminated_length": 459.5625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.09398075632132469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0505104660987854, + "learning_rate": 4.665178571428572e-06, + "loss": 0.0413, + "num_tokens": 33433393.0, + "reward": 1.5454846620559692, + "reward_std": 0.5603064894676208, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.4517347812652588, + "rewards/ngram_similarity_reward/std": 0.4080568552017212, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 465.703125, + "completions/mean_terminated_length": 465.703125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.0944282837323786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04667956009507179, + "learning_rate": 4.6875000000000004e-06, + "loss": -0.0065, + "num_tokens": 33562830.0, + "reward": 3.4275259971618652, + "reward_std": 1.291181206703186, + "rewards/accuracy_reward/mean": 2.84375, + "rewards/accuracy_reward/std": 3.0405657291412354, + "rewards/ngram_similarity_reward/mean": 0.5837761163711548, + "rewards/ngram_similarity_reward/std": 0.2460503727197647, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 626.828125, + "completions/mean_terminated_length": 626.828125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.09487581114343253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04065338522195816, + "learning_rate": 4.709821428571429e-06, + "loss": -0.0215, + "num_tokens": 33746627.0, + "reward": 1.388837218284607, + "reward_std": 0.7045068740844727, + "rewards/accuracy_reward/mean": 0.921875, + "rewards/accuracy_reward/std": 2.915773868560791, + "rewards/ngram_similarity_reward/mean": 0.46696218848228455, + "rewards/ngram_similarity_reward/std": 0.3326452374458313, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 505.671875, + "completions/mean_terminated_length": 505.671875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.09532333855448646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04317639395594597, + "learning_rate": 4.732142857142857e-06, + "loss": -0.0012, + "num_tokens": 33925358.0, + "reward": 1.106865406036377, + "reward_std": 0.2781585454940796, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.688710927963257, + "rewards/ngram_similarity_reward/mean": 0.2006155252456665, + "rewards/ngram_similarity_reward/std": 0.17418237030506134, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 519.125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.0957708659655404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038894299417734146, + "learning_rate": 4.754464285714286e-06, + "loss": 0.0314, + "num_tokens": 34096694.0, + "reward": 2.903021812438965, + "reward_std": 0.17010778188705444, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.40302157402038574, + "rewards/ngram_similarity_reward/std": 0.1969473510980606, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1753.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 649.09375, + "completions/mean_terminated_length": 649.09375, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.09621839337659431, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0412026047706604, + "learning_rate": 4.776785714285715e-06, + "loss": 0.0123, + "num_tokens": 34303068.0, + "reward": 0.10277429223060608, + "reward_std": 0.8195408582687378, + "rewards/accuracy_reward/mean": -0.171875, + "rewards/accuracy_reward/std": 1.491294264793396, + "rewards/ngram_similarity_reward/mean": 0.27464932203292847, + "rewards/ngram_similarity_reward/std": 0.15213480591773987, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 535.015625, + "completions/mean_terminated_length": 535.015625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.09666592078764824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03897232189774513, + "learning_rate": 4.799107142857143e-06, + "loss": 0.0136, + "num_tokens": 34470301.0, + "reward": 3.1363539695739746, + "reward_std": 1.6776666641235352, + "rewards/accuracy_reward/mean": 2.796875, + "rewards/accuracy_reward/std": 3.0998191833496094, + "rewards/ngram_similarity_reward/mean": 0.3394790291786194, + "rewards/ngram_similarity_reward/std": 0.2934320271015167, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 538.1875, + "completions/mean_terminated_length": 538.1875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.09711344819870217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03999040275812149, + "learning_rate": 4.821428571428572e-06, + "loss": 0.0359, + "num_tokens": 34629321.0, + "reward": 3.709991693496704, + "reward_std": 1.4276716709136963, + "rewards/accuracy_reward/mean": 3.375, + "rewards/accuracy_reward/std": 2.9680843353271484, + "rewards/ngram_similarity_reward/mean": 0.33499157428741455, + "rewards/ngram_similarity_reward/std": 0.27073392271995544, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 418.40625, + "completions/mean_terminated_length": 418.40625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.0975609756097561, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05568109452724457, + "learning_rate": 4.84375e-06, + "loss": 0.0013, + "num_tokens": 34798147.0, + "reward": 2.7140893936157227, + "reward_std": 1.6045554876327515, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 3.0953354835510254, + "rewards/ngram_similarity_reward/mean": 0.7922143936157227, + "rewards/ngram_similarity_reward/std": 0.2839650809764862, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 512.375, + "completions/mean_terminated_length": 512.375, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.09800850302081003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03910877928137779, + "learning_rate": 4.866071428571429e-06, + "loss": -0.0238, + "num_tokens": 34947019.0, + "reward": 4.046869277954102, + "reward_std": 0.9176902770996094, + "rewards/accuracy_reward/mean": 3.515625, + "rewards/accuracy_reward/std": 2.8646292686462402, + "rewards/ngram_similarity_reward/mean": 0.5312443971633911, + "rewards/ngram_similarity_reward/std": 0.34237968921661377, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 463.609375, + "completions/mean_terminated_length": 438.4603576660156, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.09845603043186395, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053166162222623825, + "learning_rate": 4.8883928571428575e-06, + "loss": 0.0166, + "num_tokens": 35168546.0, + "reward": 0.9042137265205383, + "reward_std": 1.8351986408233643, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 2.607795238494873, + "rewards/ngram_similarity_reward/mean": 0.2479637861251831, + "rewards/ngram_similarity_reward/std": 0.16935303807258606, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 565.671875, + "completions/mean_terminated_length": 565.671875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.09890355784291788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048766668885946274, + "learning_rate": 4.910714285714286e-06, + "loss": -0.0135, + "num_tokens": 35410541.0, + "reward": 4.157958984375, + "reward_std": 0.9653618931770325, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.3767091631889343, + "rewards/ngram_similarity_reward/std": 0.25629374384880066, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 419.78125, + "completions/mean_terminated_length": 419.78125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.09935108525397181, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04921666532754898, + "learning_rate": 4.933035714285715e-06, + "loss": -0.0085, + "num_tokens": 35553471.0, + "reward": 5.763852119445801, + "reward_std": 0.57377028465271, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.4513515830039978, + "rewards/ngram_similarity_reward/std": 0.20995618402957916, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 418.765625, + "completions/mean_terminated_length": 418.765625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.09979861266502574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05048259720206261, + "learning_rate": 4.955357142857144e-06, + "loss": 0.0148, + "num_tokens": 35726384.0, + "reward": 1.6927458047866821, + "reward_std": 0.9506723284721375, + "rewards/accuracy_reward/mean": 1.34375, + "rewards/accuracy_reward/std": 2.9450995922088623, + "rewards/ngram_similarity_reward/mean": 0.3489959239959717, + "rewards/ngram_similarity_reward/std": 0.31981024146080017, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 399.0, + "completions/mean_terminated_length": 399.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.10024614007607965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0539417527616024, + "learning_rate": 4.977678571428572e-06, + "loss": 0.0187, + "num_tokens": 35859280.0, + "reward": 4.095028877258301, + "reward_std": 1.9678847789764404, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.20440366864204407, + "rewards/ngram_similarity_reward/std": 0.10812616348266602, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 497.859375, + "completions/mean_terminated_length": 497.859375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.10069366748713358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03823324292898178, + "learning_rate": 5e-06, + "loss": 0.0313, + "num_tokens": 35986151.0, + "reward": 3.178346633911133, + "reward_std": 1.6972424983978271, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.41272154450416565, + "rewards/ngram_similarity_reward/std": 0.2593502402305603, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 430.3125, + "completions/mean_terminated_length": 430.3125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.10114119489818751, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05047139897942543, + "learning_rate": 4.999999384125436e-06, + "loss": -0.0278, + "num_tokens": 36197019.0, + "reward": 3.2247214317321777, + "reward_std": 1.6524477005004883, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.0498504638671875, + "rewards/ngram_similarity_reward/mean": 0.4747212529182434, + "rewards/ngram_similarity_reward/std": 0.32453668117523193, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 541.609375, + "completions/mean_terminated_length": 541.609375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.10158872230924144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042722173035144806, + "learning_rate": 4.999997536502079e-06, + "loss": 0.0054, + "num_tokens": 36336962.0, + "reward": 1.242129921913147, + "reward_std": 1.0416669845581055, + "rewards/accuracy_reward/mean": 0.828125, + "rewards/accuracy_reward/std": 2.6189463138580322, + "rewards/ngram_similarity_reward/mean": 0.41400498151779175, + "rewards/ngram_similarity_reward/std": 0.20894940197467804, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 487.015625, + "completions/mean_terminated_length": 487.015625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.10203624972029537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047618549317121506, + "learning_rate": 4.99999445713094e-06, + "loss": 0.0304, + "num_tokens": 36472643.0, + "reward": 4.7300238609313965, + "reward_std": 1.587036371231079, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.44877392053604126, + "rewards/ngram_similarity_reward/std": 0.2516137957572937, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 490.90625, + "completions/mean_terminated_length": 490.90625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.10248377713134929, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04048790782690048, + "learning_rate": 4.9999901460137076e-06, + "loss": -0.0135, + "num_tokens": 36616349.0, + "reward": 3.358356475830078, + "reward_std": 2.4169936180114746, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.2958567142486572, + "rewards/ngram_similarity_reward/std": 0.25693514943122864, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 431.453125, + "completions/mean_terminated_length": 431.453125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.10293130454240322, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.051024310290813446, + "learning_rate": 4.999984603152739e-06, + "loss": -0.0329, + "num_tokens": 36745498.0, + "reward": 3.4914653301239014, + "reward_std": 0.9072697758674622, + "rewards/accuracy_reward/mean": 2.796875, + "rewards/accuracy_reward/std": 3.2055492401123047, + "rewards/ngram_similarity_reward/mean": 0.6945902109146118, + "rewards/ngram_similarity_reward/std": 0.3811330497264862, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 496.0625, + "completions/mean_terminated_length": 496.0625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.10337883195345715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043420251458883286, + "learning_rate": 4.999977828551071e-06, + "loss": 0.028, + "num_tokens": 36900382.0, + "reward": 5.204824447631836, + "reward_std": 1.7110671997070312, + "rewards/accuracy_reward/mean": 4.734375, + "rewards/accuracy_reward/std": 2.04506516456604, + "rewards/ngram_similarity_reward/mean": 0.4704493582248688, + "rewards/ngram_similarity_reward/std": 0.3356776833534241, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 431.296875, + "completions/mean_terminated_length": 431.296875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.10382635936451108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046472541987895966, + "learning_rate": 4.99996982221241e-06, + "loss": 0.0075, + "num_tokens": 37060545.0, + "reward": 3.730433702468872, + "reward_std": 2.3555819988250732, + "rewards/accuracy_reward/mean": 3.359375, + "rewards/accuracy_reward/std": 3.1007792949676514, + "rewards/ngram_similarity_reward/mean": 0.37105870246887207, + "rewards/ngram_similarity_reward/std": 0.29995277523994446, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 473.1875, + "completions/mean_terminated_length": 473.1875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.104273886775565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04544816538691521, + "learning_rate": 4.999960584141141e-06, + "loss": 0.011, + "num_tokens": 37187629.0, + "reward": 2.8511152267456055, + "reward_std": 0.46699586510658264, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.44486522674560547, + "rewards/ngram_similarity_reward/std": 0.3931611478328705, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 444.375, + "completions/mean_terminated_length": 444.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.10472141418661893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038571011275053024, + "learning_rate": 4.9999501143423195e-06, + "loss": -0.007, + "num_tokens": 37348581.0, + "reward": 3.952913761138916, + "reward_std": 1.3186864852905273, + "rewards/accuracy_reward/mean": 3.328125, + "rewards/accuracy_reward/std": 2.9252848625183105, + "rewards/ngram_similarity_reward/mean": 0.6247888803482056, + "rewards/ngram_similarity_reward/std": 0.35860466957092285, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 497.359375, + "completions/mean_terminated_length": 497.359375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.10516894159767286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05110171064734459, + "learning_rate": 4.999938412821679e-06, + "loss": 0.0098, + "num_tokens": 37513564.0, + "reward": 3.3419313430786133, + "reward_std": 0.8716861009597778, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4669312834739685, + "rewards/ngram_similarity_reward/std": 0.30982959270477295, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 516.1875, + "completions/mean_terminated_length": 516.1875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.10561646900872679, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03886553272604942, + "learning_rate": 4.999925479585624e-06, + "loss": 0.0077, + "num_tokens": 37677224.0, + "reward": 5.317903518676758, + "reward_std": 1.877345323562622, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6616535782814026, + "rewards/ngram_similarity_reward/std": 0.3614196181297302, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 472.0625, + "completions/mean_terminated_length": 472.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.10606399641978072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04324449226260185, + "learning_rate": 4.999911314641236e-06, + "loss": -0.0293, + "num_tokens": 37799580.0, + "reward": 6.1421403884887695, + "reward_std": 0.5494071841239929, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.7358900308609009, + "rewards/ngram_similarity_reward/std": 0.3383578658103943, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 422.125, + "completions/mean_terminated_length": 422.125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.10651152383083463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046920593827962875, + "learning_rate": 4.999895917996267e-06, + "loss": -0.014, + "num_tokens": 37940724.0, + "reward": 4.145472526550293, + "reward_std": 0.9606025218963623, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.4267226457595825, + "rewards/ngram_similarity_reward/std": 0.318371057510376, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 455.734375, + "completions/mean_terminated_length": 455.734375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.10695905124188856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04800686240196228, + "learning_rate": 4.99987928965915e-06, + "loss": 0.0262, + "num_tokens": 38096931.0, + "reward": 2.8433384895324707, + "reward_std": 2.219339370727539, + "rewards/accuracy_reward/mean": 2.359375, + "rewards/accuracy_reward/std": 3.384686231613159, + "rewards/ngram_similarity_reward/mean": 0.4839634597301483, + "rewards/ngram_similarity_reward/std": 0.35715481638908386, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 471.59375, + "completions/mean_terminated_length": 471.59375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.1074065786529425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04236859083175659, + "learning_rate": 4.999861429638984e-06, + "loss": -0.0202, + "num_tokens": 38262169.0, + "reward": 3.3624043464660645, + "reward_std": 2.224200487136841, + "rewards/accuracy_reward/mean": 3.0, + "rewards/accuracy_reward/std": 3.1622776985168457, + "rewards/ngram_similarity_reward/mean": 0.36240440607070923, + "rewards/ngram_similarity_reward/std": 0.258233904838562, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 479.578125, + "completions/mean_terminated_length": 479.578125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.10785410606399642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045299142599105835, + "learning_rate": 4.999842337945548e-06, + "loss": 0.0048, + "num_tokens": 38404846.0, + "reward": 2.6669921875, + "reward_std": 1.0915460586547852, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.16699212789535522, + "rewards/ngram_similarity_reward/std": 0.13970988988876343, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 443.15625, + "completions/mean_terminated_length": 443.15625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.10830163347505034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05201602727174759, + "learning_rate": 4.999822014589293e-06, + "loss": 0.0129, + "num_tokens": 38569688.0, + "reward": 0.9952648878097534, + "reward_std": 0.18254108726978302, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 2.8044307231903076, + "rewards/ngram_similarity_reward/mean": 0.2608899772167206, + "rewards/ngram_similarity_reward/std": 0.2795363664627075, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 503.484375, + "completions/mean_terminated_length": 503.484375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.10874916088610427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04663022607564926, + "learning_rate": 4.9998004595813476e-06, + "loss": 0.0253, + "num_tokens": 38735095.0, + "reward": 4.354253768920898, + "reward_std": 0.46198853850364685, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.44800350069999695, + "rewards/ngram_similarity_reward/std": 0.3534550070762634, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 483.046875, + "completions/mean_terminated_length": 483.046875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.1091966882971582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046560950577259064, + "learning_rate": 4.9997776729335085e-06, + "loss": -0.0048, + "num_tokens": 38898442.0, + "reward": 3.153549909591675, + "reward_std": 1.9385340213775635, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.1449365615844727, + "rewards/ngram_similarity_reward/mean": 0.48167479038238525, + "rewards/ngram_similarity_reward/std": 0.41635817289352417, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 1286.0, + "completions/mean_length": 508.78125, + "completions/mean_terminated_length": 508.78125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.10964421570821213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0464753620326519, + "learning_rate": 4.999753654658252e-06, + "loss": -0.0335, + "num_tokens": 39044524.0, + "reward": 2.106961727142334, + "reward_std": 2.0896620750427246, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.356961727142334, + "rewards/ngram_similarity_reward/std": 0.2141096293926239, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 463.84375, + "completions/mean_terminated_length": 463.84375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.11009174311926606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045450758188962936, + "learning_rate": 4.999728404768726e-06, + "loss": 0.0302, + "num_tokens": 39231858.0, + "reward": 2.8807318210601807, + "reward_std": 1.6852710247039795, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.4744817614555359, + "rewards/ngram_similarity_reward/std": 0.29741978645324707, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 537.125, + "completions/mean_terminated_length": 537.125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.11053927053031998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043713342398405075, + "learning_rate": 4.999701923278754e-06, + "loss": 0.0225, + "num_tokens": 39396474.0, + "reward": 2.1098146438598633, + "reward_std": 0.8655683398246765, + "rewards/accuracy_reward/mean": 1.90625, + "rewards/accuracy_reward/std": 3.001157283782959, + "rewards/ngram_similarity_reward/mean": 0.20356449484825134, + "rewards/ngram_similarity_reward/std": 0.10082338750362396, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 553.703125, + "completions/mean_terminated_length": 553.703125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.11098679794137391, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04395830258727074, + "learning_rate": 4.999674210202833e-06, + "loss": 0.0229, + "num_tokens": 39523719.0, + "reward": 2.5181853771209717, + "reward_std": 0.8695477247238159, + "rewards/accuracy_reward/mean": 2.140625, + "rewards/accuracy_reward/std": 3.0930912494659424, + "rewards/ngram_similarity_reward/mean": 0.3775605261325836, + "rewards/ngram_similarity_reward/std": 0.3463709056377411, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 513.375, + "completions/mean_terminated_length": 513.375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.11143432535242784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040253348648548126, + "learning_rate": 4.999645265556134e-06, + "loss": -0.013, + "num_tokens": 39664863.0, + "reward": 3.704132080078125, + "reward_std": 1.0076619386672974, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7353817820549011, + "rewards/ngram_similarity_reward/std": 0.3593013882637024, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 511.96875, + "completions/mean_terminated_length": 511.96875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.11188185276348177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045767977833747864, + "learning_rate": 4.999615089354502e-06, + "loss": 0.0125, + "num_tokens": 39805069.0, + "reward": 3.003373384475708, + "reward_std": 0.2290632426738739, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.1305904388427734, + "rewards/ngram_similarity_reward/mean": 0.5971232652664185, + "rewards/ngram_similarity_reward/std": 0.33951181173324585, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 429.40625, + "completions/mean_terminated_length": 429.40625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.11232938017453568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04780712351202965, + "learning_rate": 4.999583681614458e-06, + "loss": 0.0247, + "num_tokens": 39941287.0, + "reward": 2.729985237121582, + "reward_std": 0.19259922206401825, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.24561025202274323, + "rewards/ngram_similarity_reward/std": 0.21396705508232117, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 445.53125, + "completions/mean_terminated_length": 445.53125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.11277690758558961, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04843899607658386, + "learning_rate": 4.999551042353196e-06, + "loss": -0.0417, + "num_tokens": 40124777.0, + "reward": 5.458471775054932, + "reward_std": 1.1269618272781372, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5209718942642212, + "rewards/ngram_similarity_reward/std": 0.35723310708999634, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 527.28125, + "completions/mean_terminated_length": 527.28125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.11322443499664354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04037504270672798, + "learning_rate": 4.999517171588584e-06, + "loss": 0.0072, + "num_tokens": 40279451.0, + "reward": 4.945751190185547, + "reward_std": 1.7158468961715698, + "rewards/accuracy_reward/mean": 4.546875, + "rewards/accuracy_reward/std": 2.2355687618255615, + "rewards/ngram_similarity_reward/mean": 0.3988759219646454, + "rewards/ngram_similarity_reward/std": 0.3014982044696808, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 462.125, + "completions/mean_terminated_length": 462.125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.11367196240769747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04890478029847145, + "learning_rate": 4.999482069339163e-06, + "loss": -0.0018, + "num_tokens": 40435987.0, + "reward": 3.4920666217803955, + "reward_std": 0.8878836035728455, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.42956671118736267, + "rewards/ngram_similarity_reward/std": 0.2548423409461975, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 542.578125, + "completions/mean_terminated_length": 542.578125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.1141194898187514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03973216935992241, + "learning_rate": 4.99944573562415e-06, + "loss": 0.0244, + "num_tokens": 40635496.0, + "reward": 4.6910176277160645, + "reward_std": 1.8697986602783203, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5035178065299988, + "rewards/ngram_similarity_reward/std": 0.29467612504959106, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 562.03125, + "completions/mean_terminated_length": 562.03125, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.11456701722980532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04116257652640343, + "learning_rate": 4.999408170463438e-06, + "loss": 0.0027, + "num_tokens": 40805418.0, + "reward": 2.6413564682006836, + "reward_std": 1.014211654663086, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.1922526359558105, + "rewards/ngram_similarity_reward/mean": 0.39135655760765076, + "rewards/ngram_similarity_reward/std": 0.23213790357112885, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 513.046875, + "completions/mean_terminated_length": 513.046875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.11501454464085925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05036421865224838, + "learning_rate": 4.999369373877589e-06, + "loss": 0.0415, + "num_tokens": 40960749.0, + "reward": 1.2556395530700684, + "reward_std": 0.6643199920654297, + "rewards/accuracy_reward/mean": 0.96875, + "rewards/accuracy_reward/std": 2.7658276557922363, + "rewards/ngram_similarity_reward/mean": 0.28688937425613403, + "rewards/ngram_similarity_reward/std": 0.24688217043876648, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1882.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 529.859375, + "completions/mean_terminated_length": 529.859375, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.11546207205191318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040541063994169235, + "learning_rate": 4.999329345887844e-06, + "loss": 0.0607, + "num_tokens": 41230180.0, + "reward": -0.11133871972560883, + "reward_std": 1.5756843090057373, + "rewards/accuracy_reward/mean": -0.3125, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.20116126537322998, + "rewards/ngram_similarity_reward/std": 0.22335268557071686, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 568.6875, + "completions/mean_terminated_length": 568.6875, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.11590959946296711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0399027056992054, + "learning_rate": 4.999288086516114e-06, + "loss": 0.0382, + "num_tokens": 41373776.0, + "reward": 3.2704312801361084, + "reward_std": 0.8012506365776062, + "rewards/accuracy_reward/mean": 2.859375, + "rewards/accuracy_reward/std": 3.0203921794891357, + "rewards/ngram_similarity_reward/mean": 0.4110559821128845, + "rewards/ngram_similarity_reward/std": 0.3374841809272766, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 494.078125, + "completions/mean_terminated_length": 494.078125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.11635712687402103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04895956814289093, + "learning_rate": 4.999245595784988e-06, + "loss": 0.018, + "num_tokens": 41518421.0, + "reward": 1.0387669801712036, + "reward_std": 1.3193118572235107, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 2.6931350231170654, + "rewards/ngram_similarity_reward/mean": 0.3200168013572693, + "rewards/ngram_similarity_reward/std": 0.2850259840488434, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 423.140625, + "completions/mean_terminated_length": 423.140625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.11680465428507496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04695745185017586, + "learning_rate": 4.999201873717725e-06, + "loss": 0.0225, + "num_tokens": 41688078.0, + "reward": 3.881317615509033, + "reward_std": 1.5000773668289185, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6313177943229675, + "rewards/ngram_similarity_reward/std": 0.30789679288864136, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 525.03125, + "completions/mean_terminated_length": 525.03125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.11725218169612889, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041961099952459335, + "learning_rate": 4.999156920338263e-06, + "loss": 0.0157, + "num_tokens": 41825072.0, + "reward": 3.311887741088867, + "reward_std": 1.407604694366455, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.7337629795074463, + "rewards/ngram_similarity_reward/std": 0.28070977330207825, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 465.703125, + "completions/mean_terminated_length": 465.703125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.11769970910718282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05207325145602226, + "learning_rate": 4.9991107356712116e-06, + "loss": -0.0008, + "num_tokens": 41965293.0, + "reward": 3.4179024696350098, + "reward_std": 1.8766613006591797, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.44915229082107544, + "rewards/ngram_similarity_reward/std": 0.32722440361976624, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 438.234375, + "completions/mean_terminated_length": 438.234375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.11814723651823675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058121174573898315, + "learning_rate": 4.9990633197418515e-06, + "loss": 0.0237, + "num_tokens": 42121772.0, + "reward": 1.065765380859375, + "reward_std": 1.3512228727340698, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 2.7945525646209717, + "rewards/ngram_similarity_reward/mean": 0.3157654106616974, + "rewards/ngram_similarity_reward/std": 0.28690239787101746, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 587.21875, + "completions/mean_terminated_length": 587.21875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.11859476392929066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03617122396826744, + "learning_rate": 4.999014672576143e-06, + "loss": -0.0155, + "num_tokens": 42274778.0, + "reward": 3.6269023418426514, + "reward_std": 2.9205880165100098, + "rewards/accuracy_reward/mean": 3.125, + "rewards/accuracy_reward/std": 2.994704246520996, + "rewards/ngram_similarity_reward/mean": 0.5019023418426514, + "rewards/ngram_similarity_reward/std": 0.27822884917259216, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 478.28125, + "completions/mean_terminated_length": 478.28125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.1190422913403446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04338669404387474, + "learning_rate": 4.998964794200716e-06, + "loss": 0.0158, + "num_tokens": 42423340.0, + "reward": 2.7534642219543457, + "reward_std": 0.5386490821838379, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.37846437096595764, + "rewards/ngram_similarity_reward/std": 0.32297518849372864, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 499.78125, + "completions/mean_terminated_length": 499.78125, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.11948981875139852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042514264583587646, + "learning_rate": 4.9989136846428775e-06, + "loss": 0.0057, + "num_tokens": 42586094.0, + "reward": 2.722477674484253, + "reward_std": 0.7117666602134705, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.1305904388427734, + "rewards/ngram_similarity_reward/mean": 0.12872779369354248, + "rewards/ngram_similarity_reward/std": 0.17331068217754364, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 454.4375, + "completions/mean_terminated_length": 454.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.11993734616245245, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04440493881702423, + "learning_rate": 4.998861343930605e-06, + "loss": -0.0051, + "num_tokens": 42709482.0, + "reward": 5.638522148132324, + "reward_std": 0.634451150894165, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.24789699912071228, + "rewards/ngram_similarity_reward/std": 0.2870684862136841, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 443.53125, + "completions/mean_terminated_length": 443.53125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.12038487357350637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04513474926352501, + "learning_rate": 4.998807772092555e-06, + "loss": 0.0183, + "num_tokens": 42844220.0, + "reward": 5.522170066833496, + "reward_std": 0.832690954208374, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.49091994762420654, + "rewards/ngram_similarity_reward/std": 0.31039613485336304, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 390.71875, + "completions/mean_terminated_length": 390.71875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.1208324009845603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05287130922079086, + "learning_rate": 4.998752969158052e-06, + "loss": -0.0091, + "num_tokens": 42986714.0, + "reward": 1.4076976776123047, + "reward_std": 1.0002360343933105, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.5014476776123047, + "rewards/ngram_similarity_reward/std": 0.31911924481391907, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 351.875, + "completions/mean_terminated_length": 351.875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.12127992839561423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06454234570264816, + "learning_rate": 4.9986969351571006e-06, + "loss": -0.0137, + "num_tokens": 43177970.0, + "reward": 4.2275071144104, + "reward_std": 0.9109856486320496, + "rewards/accuracy_reward/mean": 3.546875, + "rewards/accuracy_reward/std": 2.9300289154052734, + "rewards/ngram_similarity_reward/mean": 0.6806321740150452, + "rewards/ngram_similarity_reward/std": 0.34795522689819336, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 544.921875, + "completions/mean_terminated_length": 544.921875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.12172745580666816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050481028854846954, + "learning_rate": 4.998639670120374e-06, + "loss": 0.0122, + "num_tokens": 43335037.0, + "reward": 2.832256555557251, + "reward_std": 0.6346484422683716, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.4572564959526062, + "rewards/ngram_similarity_reward/std": 0.2750431001186371, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 528.875, + "completions/mean_terminated_length": 528.875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.12217498321772209, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03463369607925415, + "learning_rate": 4.9985811740792226e-06, + "loss": 0.0546, + "num_tokens": 43462565.0, + "reward": 0.013282734900712967, + "reward_std": 0.1335204541683197, + "rewards/accuracy_reward/mean": -0.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5132827758789062, + "rewards/ngram_similarity_reward/std": 0.32260528206825256, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 508.9375, + "completions/mean_terminated_length": 508.9375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.12262251062877601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04936639964580536, + "learning_rate": 4.9985214470656705e-06, + "loss": 0.0042, + "num_tokens": 43670353.0, + "reward": 2.0254318714141846, + "reward_std": 1.2559350728988647, + "rewards/accuracy_reward/mean": 1.53125, + "rewards/accuracy_reward/std": 2.900294303894043, + "rewards/ngram_similarity_reward/mean": 0.49418196082115173, + "rewards/ngram_similarity_reward/std": 0.33127668499946594, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 397.828125, + "completions/mean_terminated_length": 397.828125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.12307003803982994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04740218073129654, + "learning_rate": 4.998460489112412e-06, + "loss": -0.0289, + "num_tokens": 43845318.0, + "reward": 5.616533279418945, + "reward_std": 1.3415679931640625, + "rewards/accuracy_reward/mean": 5.0, + "rewards/accuracy_reward/std": 1.7366269826889038, + "rewards/ngram_similarity_reward/mean": 0.6165330410003662, + "rewards/ngram_similarity_reward/std": 0.46372175216674805, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 382.734375, + "completions/mean_terminated_length": 382.734375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.12351756545088387, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.045659687370061874, + "learning_rate": 4.998398300252821e-06, + "loss": 0.0129, + "num_tokens": 43985925.0, + "reward": 2.289724826812744, + "reward_std": 0.761822521686554, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.2584747076034546, + "rewards/ngram_similarity_reward/std": 0.21390005946159363, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 402.25, + "completions/mean_terminated_length": 402.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.1239650928619378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0604717992246151, + "learning_rate": 4.998334880520942e-06, + "loss": 0.0173, + "num_tokens": 44192549.0, + "reward": 1.8768291473388672, + "reward_std": 2.8309593200683594, + "rewards/accuracy_reward/mean": 1.671875, + "rewards/accuracy_reward/std": 3.0002894401550293, + "rewards/ngram_similarity_reward/mean": 0.20495399832725525, + "rewards/ngram_similarity_reward/std": 0.11042793095111847, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 515.6875, + "completions/mean_terminated_length": 515.6875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.12441262027299171, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048352211713790894, + "learning_rate": 4.998270229951493e-06, + "loss": -0.0155, + "num_tokens": 44332465.0, + "reward": 4.659803867340088, + "reward_std": 1.2321932315826416, + "rewards/accuracy_reward/mean": 4.109375, + "rewards/accuracy_reward/std": 2.6584229469299316, + "rewards/ngram_similarity_reward/mean": 0.550429105758667, + "rewards/ngram_similarity_reward/std": 0.4145485460758209, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 482.296875, + "completions/mean_terminated_length": 482.296875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.12486014768404564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04150489717721939, + "learning_rate": 4.998204348579867e-06, + "loss": 0.0014, + "num_tokens": 44462228.0, + "reward": 5.0958943367004395, + "reward_std": 0.978097677230835, + "rewards/accuracy_reward/mean": 4.453125, + "rewards/accuracy_reward/std": 2.4587368965148926, + "rewards/ngram_similarity_reward/mean": 0.6427693963050842, + "rewards/ngram_similarity_reward/std": 0.24158473312854767, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 514.75, + "completions/mean_terminated_length": 514.75, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.12530767509509957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0430060438811779, + "learning_rate": 4.99813723644213e-06, + "loss": 0.0225, + "num_tokens": 44624964.0, + "reward": 4.896323204040527, + "reward_std": 1.478863000869751, + "rewards/accuracy_reward/mean": 4.34375, + "rewards/accuracy_reward/std": 2.4314002990722656, + "rewards/ngram_similarity_reward/mean": 0.5525734424591064, + "rewards/ngram_similarity_reward/std": 0.38259801268577576, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 471.265625, + "completions/mean_terminated_length": 471.265625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.1257552025061535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049225497990846634, + "learning_rate": 4.998068893575021e-06, + "loss": 0.0518, + "num_tokens": 44814821.0, + "reward": 2.4747767448425293, + "reward_std": 0.9369537830352783, + "rewards/accuracy_reward/mean": 2.078125, + "rewards/accuracy_reward/std": 3.0488338470458984, + "rewards/ngram_similarity_reward/mean": 0.39665159583091736, + "rewards/ngram_similarity_reward/std": 0.2629046142101288, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1187.0, + "completions/max_terminated_length": 1187.0, + "completions/mean_length": 514.40625, + "completions/mean_terminated_length": 514.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.12620272991720743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04439893737435341, + "learning_rate": 4.997999320015956e-06, + "loss": 0.0095, + "num_tokens": 44957471.0, + "reward": 2.500685691833496, + "reward_std": 1.8301470279693604, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.00260329246521, + "rewards/ngram_similarity_reward/mean": 0.48506051301956177, + "rewards/ngram_similarity_reward/std": 0.2847696840763092, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 450.1875, + "completions/mean_terminated_length": 450.1875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.12665025732826135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051984407007694244, + "learning_rate": 4.997928515803023e-06, + "loss": 0.0022, + "num_tokens": 45095595.0, + "reward": 5.386553764343262, + "reward_std": 1.2933610677719116, + "rewards/accuracy_reward/mean": 4.890625, + "rewards/accuracy_reward/std": 1.915825366973877, + "rewards/ngram_similarity_reward/mean": 0.4959290623664856, + "rewards/ngram_similarity_reward/std": 0.31944161653518677, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 503.84375, + "completions/mean_terminated_length": 503.84375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.1270977847393153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04425594583153725, + "learning_rate": 4.99785648097498e-06, + "loss": -0.0495, + "num_tokens": 45318497.0, + "reward": 5.2758564949035645, + "reward_std": 1.8868292570114136, + "rewards/accuracy_reward/mean": 4.78125, + "rewards/accuracy_reward/std": 2.0737876892089844, + "rewards/ngram_similarity_reward/mean": 0.4946065843105316, + "rewards/ngram_similarity_reward/std": 0.3254554569721222, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 477.578125, + "completions/mean_terminated_length": 477.578125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.1275453121503692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04923122376203537, + "learning_rate": 4.9977832155712666e-06, + "loss": 0.0074, + "num_tokens": 45497046.0, + "reward": 1.7728796005249023, + "reward_std": 1.6681783199310303, + "rewards/accuracy_reward/mean": 1.21875, + "rewards/accuracy_reward/std": 3.1495466232299805, + "rewards/ngram_similarity_reward/mean": 0.5541296005249023, + "rewards/ngram_similarity_reward/std": 0.48000162839889526, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 487.578125, + "completions/mean_terminated_length": 487.578125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.12799283956142313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04564898833632469, + "learning_rate": 4.997708719631989e-06, + "loss": 0.0231, + "num_tokens": 45648443.0, + "reward": 3.957265615463257, + "reward_std": 0.9509031176567078, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.5353904962539673, + "rewards/ngram_similarity_reward/std": 0.38713234663009644, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 374.453125, + "completions/mean_terminated_length": 374.453125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.12844036697247707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05063045024871826, + "learning_rate": 4.997632993197929e-06, + "loss": 0.0039, + "num_tokens": 45787080.0, + "reward": 4.43035888671875, + "reward_std": 0.6818827390670776, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6178590655326843, + "rewards/ngram_similarity_reward/std": 0.33551886677742004, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 425.578125, + "completions/mean_terminated_length": 425.578125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.128887894383531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04895696043968201, + "learning_rate": 4.997556036310543e-06, + "loss": 0.017, + "num_tokens": 45971293.0, + "reward": 4.363377571105957, + "reward_std": 1.7207295894622803, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.4727526605129242, + "rewards/ngram_similarity_reward/std": 0.45147505402565, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 483.375, + "completions/mean_terminated_length": 483.375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.12933542179458493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04794878885149956, + "learning_rate": 4.9974778490119605e-06, + "loss": -0.0003, + "num_tokens": 46140261.0, + "reward": 3.348125457763672, + "reward_std": 2.0464982986450195, + "rewards/accuracy_reward/mean": 2.796875, + "rewards/accuracy_reward/std": 3.0998191833496094, + "rewards/ngram_similarity_reward/mean": 0.5512505769729614, + "rewards/ngram_similarity_reward/std": 0.33140721917152405, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 506.90625, + "completions/mean_terminated_length": 506.90625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.12978294920563885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039918627589941025, + "learning_rate": 4.997398431344985e-06, + "loss": -0.0138, + "num_tokens": 46271311.0, + "reward": 5.304322242736816, + "reward_std": 1.2509558200836182, + "rewards/accuracy_reward/mean": 4.921875, + "rewards/accuracy_reward/std": 1.8153201341629028, + "rewards/ngram_similarity_reward/mean": 0.3824467360973358, + "rewards/ngram_similarity_reward/std": 0.2415844053030014, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 454.546875, + "completions/mean_terminated_length": 454.546875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.13023047661669276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05253060534596443, + "learning_rate": 4.997317783353095e-06, + "loss": 0.0051, + "num_tokens": 46435554.0, + "reward": 3.7336955070495605, + "reward_std": 1.3570480346679688, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6711956262588501, + "rewards/ngram_similarity_reward/std": 0.31571659445762634, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 361.5, + "completions/mean_terminated_length": 361.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.1306780040277467, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06143470108509064, + "learning_rate": 4.997235905080438e-06, + "loss": 0.0071, + "num_tokens": 46609250.0, + "reward": 2.271378517150879, + "reward_std": 0.8853912353515625, + "rewards/accuracy_reward/mean": 2.0, + "rewards/accuracy_reward/std": 3.3333334922790527, + "rewards/ngram_similarity_reward/mean": 0.2713784873485565, + "rewards/ngram_similarity_reward/std": 0.23282523453235626, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 466.484375, + "completions/mean_terminated_length": 466.484375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.13112553143880062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04411265626549721, + "learning_rate": 4.997152796571838e-06, + "loss": -0.0272, + "num_tokens": 46754321.0, + "reward": 3.6616954803466797, + "reward_std": 1.9821219444274902, + "rewards/accuracy_reward/mean": 3.09375, + "rewards/accuracy_reward/std": 3.037954330444336, + "rewards/ngram_similarity_reward/mean": 0.5679454207420349, + "rewards/ngram_similarity_reward/std": 0.3640801012516022, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 647.34375, + "completions/mean_terminated_length": 647.34375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.13157305884985454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03811763972043991, + "learning_rate": 4.997068457872794e-06, + "loss": 0.0037, + "num_tokens": 46910871.0, + "reward": 2.387169599533081, + "reward_std": 1.858168125152588, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.4652945399284363, + "rewards/ngram_similarity_reward/std": 0.28880149126052856, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 447.625, + "completions/mean_terminated_length": 447.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.13202058626090848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04353957623243332, + "learning_rate": 4.996982889029477e-06, + "loss": 0.0123, + "num_tokens": 47060175.0, + "reward": 4.953624725341797, + "reward_std": 1.593971610069275, + "rewards/accuracy_reward/mean": 4.578125, + "rewards/accuracy_reward/std": 2.304617166519165, + "rewards/ngram_similarity_reward/mean": 0.37549978494644165, + "rewards/ngram_similarity_reward/std": 0.2559112310409546, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 590.28125, + "completions/mean_terminated_length": 590.28125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.1324681136719624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04188549146056175, + "learning_rate": 4.996896090088728e-06, + "loss": 0.0031, + "num_tokens": 47221473.0, + "reward": 3.032853841781616, + "reward_std": 1.2330999374389648, + "rewards/accuracy_reward/mean": 2.796875, + "rewards/accuracy_reward/std": 3.0998191833496094, + "rewards/ngram_similarity_reward/mean": 0.23597882688045502, + "rewards/ngram_similarity_reward/std": 0.1108793169260025, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 517.171875, + "completions/mean_terminated_length": 517.171875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.13291564108301634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05190104618668556, + "learning_rate": 4.996808061098067e-06, + "loss": -0.0203, + "num_tokens": 47398780.0, + "reward": 3.1222596168518066, + "reward_std": 1.8057923316955566, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.126309394836426, + "rewards/ngram_similarity_reward/mean": 0.4347594976425171, + "rewards/ngram_similarity_reward/std": 0.27208247780799866, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 545.484375, + "completions/mean_terminated_length": 545.484375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.13336316849407026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04385865479707718, + "learning_rate": 4.9967188021056845e-06, + "loss": 0.0138, + "num_tokens": 47560507.0, + "reward": 4.006990432739258, + "reward_std": 0.9415175914764404, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.2882406711578369, + "rewards/ngram_similarity_reward/std": 0.23498745262622833, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 405.171875, + "completions/mean_terminated_length": 405.171875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.13381069590512418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05112559720873833, + "learning_rate": 4.996628313160445e-06, + "loss": 0.0161, + "num_tokens": 47743270.0, + "reward": 2.28450870513916, + "reward_std": 2.212578773498535, + "rewards/accuracy_reward/mean": 1.765625, + "rewards/accuracy_reward/std": 3.1332433223724365, + "rewards/ngram_similarity_reward/mean": 0.5188837051391602, + "rewards/ngram_similarity_reward/std": 0.3469817042350769, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 632.78125, + "completions/mean_terminated_length": 632.78125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.13425822331617812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038688741624355316, + "learning_rate": 4.996536594311886e-06, + "loss": -0.0127, + "num_tokens": 47918376.0, + "reward": 2.733279228210449, + "reward_std": 2.08182430267334, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.231172561645508, + "rewards/ngram_similarity_reward/mean": 0.42077934741973877, + "rewards/ngram_similarity_reward/std": 0.22763550281524658, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 481.1875, + "completions/mean_terminated_length": 481.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.13470575072723204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0463065467774868, + "learning_rate": 4.996443645610218e-06, + "loss": 0.0123, + "num_tokens": 48076724.0, + "reward": 3.142300844192505, + "reward_std": 2.0055160522460938, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5485509634017944, + "rewards/ngram_similarity_reward/std": 0.22397854924201965, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 417.640625, + "completions/mean_terminated_length": 417.640625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.13515327813828598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05372007563710213, + "learning_rate": 4.996349467106325e-06, + "loss": 0.0212, + "num_tokens": 48215021.0, + "reward": 2.7351503372192383, + "reward_std": 0.22711166739463806, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.295017957687378, + "rewards/ngram_similarity_reward/mean": 0.48515036702156067, + "rewards/ngram_similarity_reward/std": 0.35704848170280457, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 537.46875, + "completions/mean_terminated_length": 537.46875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.1356008055493399, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04387521743774414, + "learning_rate": 4.996254058851767e-06, + "loss": 0.003, + "num_tokens": 48369211.0, + "reward": 4.248077392578125, + "reward_std": 0.7423344850540161, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.4668273627758026, + "rewards/ngram_similarity_reward/std": 0.244145005941391, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 439.765625, + "completions/mean_terminated_length": 439.765625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.13604833296039381, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04732627049088478, + "learning_rate": 4.996157420898771e-06, + "loss": 0.049, + "num_tokens": 48509756.0, + "reward": 3.4477286338806152, + "reward_std": 1.2740354537963867, + "rewards/accuracy_reward/mean": 2.921875, + "rewards/accuracy_reward/std": 3.0488338470458984, + "rewards/ngram_similarity_reward/mean": 0.5258538722991943, + "rewards/ngram_similarity_reward/std": 0.37970679998397827, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 508.96875, + "completions/mean_terminated_length": 508.96875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.13649586037144776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03984339162707329, + "learning_rate": 4.996059553300243e-06, + "loss": 0.0369, + "num_tokens": 48651274.0, + "reward": 4.591022968292236, + "reward_std": 0.9147764444351196, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5910229682922363, + "rewards/ngram_similarity_reward/std": 0.23684833943843842, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 455.03125, + "completions/mean_terminated_length": 455.03125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.13694338778250167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046683188527822495, + "learning_rate": 4.99596045610976e-06, + "loss": 0.0198, + "num_tokens": 48823820.0, + "reward": 3.555692195892334, + "reward_std": 1.0254743099212646, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 3.082810640335083, + "rewards/ngram_similarity_reward/mean": 0.4150674343109131, + "rewards/ngram_similarity_reward/std": 0.2890641987323761, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 430.6875, + "completions/mean_terminated_length": 430.6875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.13739091519355562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051073748618364334, + "learning_rate": 4.995860129381572e-06, + "loss": 0.016, + "num_tokens": 48977368.0, + "reward": 3.4029407501220703, + "reward_std": 0.8974111080169678, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.3404407501220703, + "rewards/ngram_similarity_reward/std": 0.2654664218425751, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 414.984375, + "completions/mean_terminated_length": 414.984375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.13783844260460953, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041511911898851395, + "learning_rate": 4.995758573170601e-06, + "loss": -0.0017, + "num_tokens": 49123271.0, + "reward": 1.4354006052017212, + "reward_std": 0.1506062150001526, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.45102566480636597, + "rewards/ngram_similarity_reward/std": 0.2433464378118515, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 435.53125, + "completions/mean_terminated_length": 435.53125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.13828597001566345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05282020941376686, + "learning_rate": 4.995655787532445e-06, + "loss": 0.0198, + "num_tokens": 49279321.0, + "reward": 3.551271915435791, + "reward_std": 1.4317071437835693, + "rewards/accuracy_reward/mean": 3.015625, + "rewards/accuracy_reward/std": 3.2464497089385986, + "rewards/ngram_similarity_reward/mean": 0.5356469750404358, + "rewards/ngram_similarity_reward/std": 0.31283071637153625, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 486.984375, + "completions/mean_terminated_length": 486.984375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.1387334974267174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04417431727051735, + "learning_rate": 4.995551772523372e-06, + "loss": -0.0243, + "num_tokens": 49497016.0, + "reward": 5.310014724731445, + "reward_std": 0.978313684463501, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6537647247314453, + "rewards/ngram_similarity_reward/std": 0.39969301223754883, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 396.265625, + "completions/mean_terminated_length": 396.265625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.1391810248377713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04919736087322235, + "learning_rate": 4.9954465282003265e-06, + "loss": 0.0251, + "num_tokens": 49650041.0, + "reward": 4.4382853507995605, + "reward_std": 1.0855292081832886, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.8132855892181396, + "rewards/ngram_similarity_reward/std": 0.44261741638183594, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 504.578125, + "completions/mean_terminated_length": 504.578125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.13962855224882523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0423838347196579, + "learning_rate": 4.995340054620922e-06, + "loss": 0.0222, + "num_tokens": 49805038.0, + "reward": 2.7991769313812256, + "reward_std": 1.8164341449737549, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.0860671997070312, + "rewards/ngram_similarity_reward/mean": 0.5491769909858704, + "rewards/ngram_similarity_reward/std": 0.3414195477962494, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 409.4375, + "completions/mean_terminated_length": 409.4375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.14007607965987917, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05049237236380577, + "learning_rate": 4.995232351843448e-06, + "loss": 0.0289, + "num_tokens": 50079466.0, + "reward": 4.114768028259277, + "reward_std": 1.654599666595459, + "rewards/accuracy_reward/mean": 3.40625, + "rewards/accuracy_reward/std": 3.0327250957489014, + "rewards/ngram_similarity_reward/mean": 0.7085182666778564, + "rewards/ngram_similarity_reward/std": 0.29894617199897766, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 313.671875, + "completions/mean_terminated_length": 313.671875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.1405236070709331, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0518927238881588, + "learning_rate": 4.995123419926864e-06, + "loss": -0.0175, + "num_tokens": 50258693.0, + "reward": 3.5665626525878906, + "reward_std": 1.0320234298706055, + "rewards/accuracy_reward/mean": 2.890625, + "rewards/accuracy_reward/std": 3.195319890975952, + "rewards/ngram_similarity_reward/mean": 0.6759374737739563, + "rewards/ngram_similarity_reward/std": 0.2605065703392029, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 434.40625, + "completions/mean_terminated_length": 434.40625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.14097113448198703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051625993102788925, + "learning_rate": 4.995013258930806e-06, + "loss": -0.0135, + "num_tokens": 50402687.0, + "reward": 4.174743175506592, + "reward_std": 0.5071485042572021, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.2684934139251709, + "rewards/ngram_similarity_reward/std": 0.19142936170101166, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 417.5625, + "completions/mean_terminated_length": 417.5625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.14141866189304095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04863320663571358, + "learning_rate": 4.994901868915581e-06, + "loss": 0.0242, + "num_tokens": 50554563.0, + "reward": 1.499826192855835, + "reward_std": 0.551353931427002, + "rewards/accuracy_reward/mean": 1.0625, + "rewards/accuracy_reward/std": 2.695528507232666, + "rewards/ngram_similarity_reward/mean": 0.4373261630535126, + "rewards/ngram_similarity_reward/std": 0.35376015305519104, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 403.828125, + "completions/mean_terminated_length": 403.828125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.14186618930409486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04763497784733772, + "learning_rate": 4.994789249942166e-06, + "loss": 0.0255, + "num_tokens": 50678952.0, + "reward": 4.207784652709961, + "reward_std": 0.6976755857467651, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.4265344738960266, + "rewards/ngram_similarity_reward/std": 0.2707797884941101, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 519.578125, + "completions/mean_terminated_length": 519.578125, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.1423137167151488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04119405895471573, + "learning_rate": 4.994675402072217e-06, + "loss": 0.0025, + "num_tokens": 50848589.0, + "reward": 5.666694164276123, + "reward_std": 0.502032995223999, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.2760693430900574, + "rewards/ngram_similarity_reward/std": 0.15548977255821228, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 476.90625, + "completions/mean_terminated_length": 476.90625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.14276124412620272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05252527445554733, + "learning_rate": 4.994560325368057e-06, + "loss": 0.0324, + "num_tokens": 50998887.0, + "reward": 3.12988018989563, + "reward_std": 1.5426826477050781, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.364255428314209, + "rewards/ngram_similarity_reward/std": 0.2845660150051117, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 428.46875, + "completions/mean_terminated_length": 428.46875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.14320877153725667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05109865590929985, + "learning_rate": 4.994444019892687e-06, + "loss": 0.0118, + "num_tokens": 51160677.0, + "reward": 4.334773063659668, + "reward_std": 0.6890419721603394, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5222730040550232, + "rewards/ngram_similarity_reward/std": 0.28128302097320557, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 499.296875, + "completions/mean_terminated_length": 499.296875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.14365629894831058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04184623435139656, + "learning_rate": 4.994326485709774e-06, + "loss": 0.0489, + "num_tokens": 51305976.0, + "reward": 2.736367702484131, + "reward_std": 0.8544173836708069, + "rewards/accuracy_reward/mean": 2.1875, + "rewards/accuracy_reward/std": 3.043989896774292, + "rewards/ngram_similarity_reward/mean": 0.5488678216934204, + "rewards/ngram_similarity_reward/std": 0.3216118812561035, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 544.1875, + "completions/mean_terminated_length": 544.1875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.1441038263593645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04549555107951164, + "learning_rate": 4.994207722883664e-06, + "loss": -0.0169, + "num_tokens": 51472868.0, + "reward": 2.5141773223876953, + "reward_std": 2.492871046066284, + "rewards/accuracy_reward/mean": 1.96875, + "rewards/accuracy_reward/std": 3.0496878623962402, + "rewards/ngram_similarity_reward/mean": 0.5454275012016296, + "rewards/ngram_similarity_reward/std": 0.35773760080337524, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 510.359375, + "completions/mean_terminated_length": 510.359375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.14455135377041844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04742863029241562, + "learning_rate": 4.994087731479371e-06, + "loss": -0.0141, + "num_tokens": 51641211.0, + "reward": 1.274566650390625, + "reward_std": 0.5307009220123291, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 2.8312318325042725, + "rewards/ngram_similarity_reward/mean": 0.399566650390625, + "rewards/ngram_similarity_reward/std": 0.23947805166244507, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 551.15625, + "completions/mean_terminated_length": 551.15625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.14499888118147236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042440809309482574, + "learning_rate": 4.993966511562586e-06, + "loss": -0.0165, + "num_tokens": 51799509.0, + "reward": 2.732563018798828, + "reward_std": 0.9093632698059082, + "rewards/accuracy_reward/mean": 2.28125, + "rewards/accuracy_reward/std": 3.2634034156799316, + "rewards/ngram_similarity_reward/mean": 0.4513130187988281, + "rewards/ngram_similarity_reward/std": 0.3170211613178253, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 478.03125, + "completions/mean_terminated_length": 478.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.1454464085925263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04281028360128403, + "learning_rate": 4.993844063199668e-06, + "loss": -0.0039, + "num_tokens": 51955303.0, + "reward": 3.9197871685028076, + "reward_std": 0.9051376581192017, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.38853707909584045, + "rewards/ngram_similarity_reward/std": 0.26418963074684143, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 367.3125, + "completions/mean_terminated_length": 367.3125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.14589393600358022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05799470096826553, + "learning_rate": 4.993720386457653e-06, + "loss": 0.0324, + "num_tokens": 52132203.0, + "reward": 3.663940906524658, + "reward_std": 1.499124526977539, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.4139409363269806, + "rewards/ngram_similarity_reward/std": 0.3726455271244049, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 389.421875, + "completions/mean_terminated_length": 389.421875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.14634146341463414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05426111817359924, + "learning_rate": 4.993595481404245e-06, + "loss": 0.0035, + "num_tokens": 52287798.0, + "reward": 3.8550524711608887, + "reward_std": 1.761357069015503, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6050525307655334, + "rewards/ngram_similarity_reward/std": 0.3760647475719452, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 574.796875, + "completions/mean_terminated_length": 574.796875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.14678899082568808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043712202459573746, + "learning_rate": 4.993469348107822e-06, + "loss": -0.0342, + "num_tokens": 52441769.0, + "reward": 3.295194625854492, + "reward_std": 1.5132980346679688, + "rewards/accuracy_reward/mean": 2.921875, + "rewards/accuracy_reward/std": 3.0488338470458984, + "rewards/ngram_similarity_reward/mean": 0.3733198344707489, + "rewards/ngram_similarity_reward/std": 0.3137679696083069, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 481.90625, + "completions/mean_terminated_length": 481.90625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.147236518236742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05058812350034714, + "learning_rate": 4.993341986637437e-06, + "loss": -0.0285, + "num_tokens": 52588659.0, + "reward": 2.0073139667510986, + "reward_std": 0.9403772950172424, + "rewards/accuracy_reward/mean": 1.828125, + "rewards/accuracy_reward/std": 3.076045274734497, + "rewards/ngram_similarity_reward/mean": 0.17918887734413147, + "rewards/ngram_similarity_reward/std": 0.08094964176416397, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 505.15625, + "completions/mean_terminated_length": 505.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.14768404564779591, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04791771620512009, + "learning_rate": 4.993213397062812e-06, + "loss": 0.04, + "num_tokens": 52768349.0, + "reward": 2.805192232131958, + "reward_std": 0.7437160015106201, + "rewards/accuracy_reward/mean": 2.265625, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.5395671725273132, + "rewards/ngram_similarity_reward/std": 0.3949489891529083, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 518.828125, + "completions/mean_terminated_length": 518.828125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.14813157305884986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042132019996643066, + "learning_rate": 4.993083579454345e-06, + "loss": 0.0154, + "num_tokens": 52953938.0, + "reward": 2.460780620574951, + "reward_std": 0.8049179911613464, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.42953070998191833, + "rewards/ngram_similarity_reward/std": 0.24472883343696594, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 499.40625, + "completions/mean_terminated_length": 499.40625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.14857910046990377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04983328655362129, + "learning_rate": 4.992952533883099e-06, + "loss": -0.01, + "num_tokens": 53159516.0, + "reward": 1.8761520385742188, + "reward_std": 1.512256145477295, + "rewards/accuracy_reward/mean": 1.4375, + "rewards/accuracy_reward/std": 2.9807584285736084, + "rewards/ngram_similarity_reward/mean": 0.43865206837654114, + "rewards/ngram_similarity_reward/std": 0.384187251329422, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 473.125, + "completions/mean_terminated_length": 473.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.14902662788095772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04624255374073982, + "learning_rate": 4.992820260420817e-06, + "loss": -0.0091, + "num_tokens": 53309988.0, + "reward": 4.088277816772461, + "reward_std": 0.7323791980743408, + "rewards/accuracy_reward/mean": 3.640625, + "rewards/accuracy_reward/std": 3.007225275039673, + "rewards/ngram_similarity_reward/mean": 0.44765257835388184, + "rewards/ngram_similarity_reward/std": 0.2967788875102997, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 470.296875, + "completions/mean_terminated_length": 470.296875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.14947415529201163, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05054234340786934, + "learning_rate": 4.9926867591399125e-06, + "loss": 0.0328, + "num_tokens": 53433335.0, + "reward": 1.209140419960022, + "reward_std": 0.9032200574874878, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 2.6659226417541504, + "rewards/ngram_similarity_reward/mean": 0.2716403603553772, + "rewards/ngram_similarity_reward/std": 0.20133136212825775, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 428.984375, + "completions/mean_terminated_length": 428.984375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.14992168270306555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04859912022948265, + "learning_rate": 4.992552030113469e-06, + "loss": 0.0081, + "num_tokens": 53569542.0, + "reward": 1.492279291152954, + "reward_std": 0.8460592031478882, + "rewards/accuracy_reward/mean": 1.234375, + "rewards/accuracy_reward/std": 2.8015992641448975, + "rewards/ngram_similarity_reward/mean": 0.25790441036224365, + "rewards/ngram_similarity_reward/std": 0.18795253336429596, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 531.578125, + "completions/mean_terminated_length": 531.578125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.1503692101141195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04315594211220741, + "learning_rate": 4.992416073415242e-06, + "loss": 0.0014, + "num_tokens": 53731147.0, + "reward": 3.3873965740203857, + "reward_std": 1.5658332109451294, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.261122703552246, + "rewards/ngram_similarity_reward/mean": 0.6373966932296753, + "rewards/ngram_similarity_reward/std": 0.3304082155227661, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 417.234375, + "completions/mean_terminated_length": 417.234375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.1508167375251734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055264078080654144, + "learning_rate": 4.992278889119661e-06, + "loss": 0.0051, + "num_tokens": 53894890.0, + "reward": 1.4579107761383057, + "reward_std": 0.48005035519599915, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.3641607165336609, + "rewards/ngram_similarity_reward/std": 0.346956342458725, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 614.21875, + "completions/mean_terminated_length": 614.21875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.15126426493622736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03559010848402977, + "learning_rate": 4.992140477301827e-06, + "loss": -0.0071, + "num_tokens": 54065272.0, + "reward": 2.9556291103363037, + "reward_std": 1.1817878484725952, + "rewards/accuracy_reward/mean": 2.453125, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.5025041103363037, + "rewards/ngram_similarity_reward/std": 0.2608015537261963, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 503.125, + "completions/mean_terminated_length": 503.125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.15171179234728127, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04555033519864082, + "learning_rate": 4.992000838037512e-06, + "loss": 0.0097, + "num_tokens": 54187600.0, + "reward": 3.294914722442627, + "reward_std": 0.7349349856376648, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5136648416519165, + "rewards/ngram_similarity_reward/std": 0.34142234921455383, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 474.40625, + "completions/mean_terminated_length": 474.40625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.1521593197583352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04681862145662308, + "learning_rate": 4.9918599714031625e-06, + "loss": -0.0015, + "num_tokens": 54336842.0, + "reward": 1.9993470907211304, + "reward_std": 1.2931156158447266, + "rewards/accuracy_reward/mean": 1.5, + "rewards/accuracy_reward/std": 3.039423704147339, + "rewards/ngram_similarity_reward/mean": 0.4993470013141632, + "rewards/ngram_similarity_reward/std": 0.2642381191253662, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 439.796875, + "completions/mean_terminated_length": 439.796875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.15260684716938913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045354828238487244, + "learning_rate": 4.991717877475893e-06, + "loss": -0.0309, + "num_tokens": 54471741.0, + "reward": 4.207108497619629, + "reward_std": 1.2334591150283813, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.20710866153240204, + "rewards/ngram_similarity_reward/std": 0.11993768811225891, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 477.921875, + "completions/mean_terminated_length": 477.921875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.15305437458044305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04923330619931221, + "learning_rate": 4.991574556333492e-06, + "loss": 0.0041, + "num_tokens": 54613128.0, + "reward": 3.2489047050476074, + "reward_std": 1.4373531341552734, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.20202943682670593, + "rewards/ngram_similarity_reward/std": 0.15467119216918945, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 476.03125, + "completions/mean_terminated_length": 476.03125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.153501901991497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04791298136115074, + "learning_rate": 4.991430008054422e-06, + "loss": 0.0334, + "num_tokens": 54802138.0, + "reward": 5.949857711791992, + "reward_std": 0.7111374139785767, + "rewards/accuracy_reward/mean": 5.296875, + "rewards/accuracy_reward/std": 1.1433686017990112, + "rewards/ngram_similarity_reward/mean": 0.6529824733734131, + "rewards/ngram_similarity_reward/std": 0.31718015670776367, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 564.125, + "completions/mean_terminated_length": 564.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.1539494294025509, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04487808421254158, + "learning_rate": 4.9912842327178125e-06, + "loss": 0.0061, + "num_tokens": 54978162.0, + "reward": 3.2296833992004395, + "reward_std": 1.9352624416351318, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.0498504638671875, + "rewards/ngram_similarity_reward/mean": 0.47968345880508423, + "rewards/ngram_similarity_reward/std": 0.36490052938461304, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 469.765625, + "completions/mean_terminated_length": 469.765625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.15439695681360482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050543803721666336, + "learning_rate": 4.991137230403469e-06, + "loss": 0.0272, + "num_tokens": 55135347.0, + "reward": 4.8638715744018555, + "reward_std": 1.9317225217819214, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.5826215744018555, + "rewards/ngram_similarity_reward/std": 0.31985583901405334, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 469.375, + "completions/mean_terminated_length": 469.375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.15484448422465877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04796312749385834, + "learning_rate": 4.990989001191866e-06, + "loss": -0.0286, + "num_tokens": 55284347.0, + "reward": 0.8435247540473938, + "reward_std": 1.1935805082321167, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 2.56652569770813, + "rewards/ngram_similarity_reward/mean": 0.32789987325668335, + "rewards/ngram_similarity_reward/std": 0.3246069550514221, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 428.953125, + "completions/mean_terminated_length": 428.953125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.15529201163571268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05468254163861275, + "learning_rate": 4.990839545164152e-06, + "loss": -0.014, + "num_tokens": 55443336.0, + "reward": 2.6985256671905518, + "reward_std": 0.7273306250572205, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.0860671997070312, + "rewards/ngram_similarity_reward/mean": 0.44852563738822937, + "rewards/ngram_similarity_reward/std": 0.2961733043193817, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 429.59375, + "completions/mean_terminated_length": 429.59375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.1557395390467666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052619464695453644, + "learning_rate": 4.990688862402145e-06, + "loss": -0.0551, + "num_tokens": 55608798.0, + "reward": 2.735952138900757, + "reward_std": 1.4438188076019287, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.119161367416382, + "rewards/ngram_similarity_reward/mean": 0.5172022581100464, + "rewards/ngram_similarity_reward/std": 0.3857964873313904, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 465.421875, + "completions/mean_terminated_length": 465.421875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.15618706645782054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05453364923596382, + "learning_rate": 4.990536952988335e-06, + "loss": -0.0107, + "num_tokens": 55755113.0, + "reward": 2.8205485343933105, + "reward_std": 0.9628164768218994, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.35179853439331055, + "rewards/ngram_similarity_reward/std": 0.2584626376628876, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 432.296875, + "completions/mean_terminated_length": 432.296875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.15663459386887446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06002373620867729, + "learning_rate": 4.990383817005885e-06, + "loss": 0.0612, + "num_tokens": 55985580.0, + "reward": 4.117739677429199, + "reward_std": 0.9441288709640503, + "rewards/accuracy_reward/mean": 3.578125, + "rewards/accuracy_reward/std": 2.6083180904388428, + "rewards/ngram_similarity_reward/mean": 0.5396143794059753, + "rewards/ngram_similarity_reward/std": 0.40891844034194946, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 469.953125, + "completions/mean_terminated_length": 469.953125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.1570821212799284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04758965224027634, + "learning_rate": 4.990229454538626e-06, + "loss": -0.0109, + "num_tokens": 56161241.0, + "reward": 4.118064880371094, + "reward_std": 1.6833152770996094, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.4930647909641266, + "rewards/ngram_similarity_reward/std": 0.3592028319835663, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 475.71875, + "completions/mean_terminated_length": 475.71875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.15752964869098232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05025568604469299, + "learning_rate": 4.990073865671067e-06, + "loss": 0.0062, + "num_tokens": 56406215.0, + "reward": 4.09308385848999, + "reward_std": 1.8486790657043457, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.3118337392807007, + "rewards/ngram_similarity_reward/std": 0.18081018328666687, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 634.390625, + "completions/mean_terminated_length": 634.390625, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.15797717610203624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03872627764940262, + "learning_rate": 4.989917050488381e-06, + "loss": 0.0052, + "num_tokens": 56531936.0, + "reward": 2.959627628326416, + "reward_std": 0.1740277260541916, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.4908774793148041, + "rewards/ngram_similarity_reward/std": 0.3493154048919678, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 453.09375, + "completions/mean_terminated_length": 453.09375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.15842470351309018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05340864509344101, + "learning_rate": 4.989759009076415e-06, + "loss": 0.0086, + "num_tokens": 56691830.0, + "reward": 4.392290115356445, + "reward_std": 0.4906888008117676, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.4860401153564453, + "rewards/ngram_similarity_reward/std": 0.38415634632110596, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 537.015625, + "completions/mean_terminated_length": 537.015625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.1588722309241441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04268348962068558, + "learning_rate": 4.98959974152169e-06, + "loss": 0.0337, + "num_tokens": 56853303.0, + "reward": 4.8049211502075195, + "reward_std": 1.7453646659851074, + "rewards/accuracy_reward/mean": 4.15625, + "rewards/accuracy_reward/std": 2.564833879470825, + "rewards/ngram_similarity_reward/mean": 0.6486713886260986, + "rewards/ngram_similarity_reward/std": 0.3948211073875427, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 466.03125, + "completions/mean_terminated_length": 466.03125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.15931975833519804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043552812188863754, + "learning_rate": 4.9894392479113945e-06, + "loss": 0.0055, + "num_tokens": 57025721.0, + "reward": 3.4975011348724365, + "reward_std": 2.0453078746795654, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.5443758964538574, + "rewards/ngram_similarity_reward/std": 0.3401655852794647, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 406.34375, + "completions/mean_terminated_length": 406.34375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.15976728574625196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0548367016017437, + "learning_rate": 4.989277528333392e-06, + "loss": 0.0039, + "num_tokens": 57154383.0, + "reward": 3.869058609008789, + "reward_std": 2.3097217082977295, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.4471837282180786, + "rewards/ngram_similarity_reward/std": 0.3531920909881592, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 499.203125, + "completions/mean_terminated_length": 474.61907958984375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.16021481315730587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060244787484407425, + "learning_rate": 4.989114582876212e-06, + "loss": 0.0195, + "num_tokens": 57300668.0, + "reward": 3.5350584983825684, + "reward_std": 1.4591851234436035, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 3.080275297164917, + "rewards/ngram_similarity_reward/mean": 0.47255846858024597, + "rewards/ngram_similarity_reward/std": 0.33570200204849243, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 500.859375, + "completions/mean_terminated_length": 500.859375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.16066234056835982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05069715902209282, + "learning_rate": 4.98895041162906e-06, + "loss": 0.0158, + "num_tokens": 57512851.0, + "reward": 4.281793594360352, + "reward_std": 0.7293245792388916, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5630437731742859, + "rewards/ngram_similarity_reward/std": 0.38598915934562683, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 431.0, + "completions/mean_terminated_length": 431.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.16110986797941373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04762047156691551, + "learning_rate": 4.9887850146818095e-06, + "loss": 0.0561, + "num_tokens": 57691923.0, + "reward": 4.7552032470703125, + "reward_std": 2.4422922134399414, + "rewards/accuracy_reward/mean": 3.96875, + "rewards/accuracy_reward/std": 2.6783599853515625, + "rewards/ngram_similarity_reward/mean": 0.7864532470703125, + "rewards/ngram_similarity_reward/std": 0.36044836044311523, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 414.375, + "completions/mean_terminated_length": 414.375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.16155739539046768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057258009910583496, + "learning_rate": 4.988618392125007e-06, + "loss": -0.0112, + "num_tokens": 57855067.0, + "reward": 2.3432412147521973, + "reward_std": 1.5988247394561768, + "rewards/accuracy_reward/mean": 1.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5619911551475525, + "rewards/ngram_similarity_reward/std": 0.34821024537086487, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 477.6875, + "completions/mean_terminated_length": 477.6875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.1620049228015216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04392610117793083, + "learning_rate": 4.988450544049869e-06, + "loss": 0.0179, + "num_tokens": 57988775.0, + "reward": 2.244896650314331, + "reward_std": 1.298945426940918, + "rewards/accuracy_reward/mean": 1.71875, + "rewards/accuracy_reward/std": 2.9572014808654785, + "rewards/ngram_similarity_reward/mean": 0.5261467695236206, + "rewards/ngram_similarity_reward/std": 0.33598366379737854, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 495.40625, + "completions/mean_terminated_length": 495.40625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.1624524502125755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048559024930000305, + "learning_rate": 4.988281470548282e-06, + "loss": 0.0189, + "num_tokens": 58112769.0, + "reward": 4.195694923400879, + "reward_std": 0.6702969670295715, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.38319528102874756, + "rewards/ngram_similarity_reward/std": 0.34498733282089233, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 375.625, + "completions/mean_terminated_length": 375.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.16289997762362945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06379535794258118, + "learning_rate": 4.988111171712804e-06, + "loss": -0.0161, + "num_tokens": 58332233.0, + "reward": 2.30958890914917, + "reward_std": 2.4288597106933594, + "rewards/accuracy_reward/mean": 2.0, + "rewards/accuracy_reward/std": 3.0184617042541504, + "rewards/ngram_similarity_reward/mean": 0.30958884954452515, + "rewards/ngram_similarity_reward/std": 0.25692975521087646, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 418.890625, + "completions/mean_terminated_length": 418.890625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.16334750503468337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054858047515153885, + "learning_rate": 4.987939647636666e-06, + "loss": -0.0046, + "num_tokens": 58467634.0, + "reward": 4.41879415512085, + "reward_std": 0.28737539052963257, + "rewards/accuracy_reward/mean": 3.9375, + "rewards/accuracy_reward/std": 2.736438512802124, + "rewards/ngram_similarity_reward/mean": 0.48129430413246155, + "rewards/ngram_similarity_reward/std": 0.3488374948501587, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 507.265625, + "completions/mean_terminated_length": 507.265625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.16379503244573732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05520766228437424, + "learning_rate": 4.987766898413766e-06, + "loss": 0.0402, + "num_tokens": 58667411.0, + "reward": 3.7660741806030273, + "reward_std": 2.487922430038452, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6098242402076721, + "rewards/ngram_similarity_reward/std": 0.3422655463218689, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 491.25, + "completions/mean_terminated_length": 491.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.16424255985679123, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04897642135620117, + "learning_rate": 4.987592924138676e-06, + "loss": 0.0341, + "num_tokens": 58819587.0, + "reward": 1.3902175426483154, + "reward_std": 0.9953027963638306, + "rewards/accuracy_reward/mean": 0.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.4995926022529602, + "rewards/ngram_similarity_reward/std": 0.3933001160621643, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 458.4375, + "completions/mean_terminated_length": 458.4375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.16469008726784515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047864362597465515, + "learning_rate": 4.987417724906636e-06, + "loss": 0.0007, + "num_tokens": 58973983.0, + "reward": 2.1851534843444824, + "reward_std": 1.8688929080963135, + "rewards/accuracy_reward/mean": 1.640625, + "rewards/accuracy_reward/std": 2.91611385345459, + "rewards/ngram_similarity_reward/mean": 0.5445283055305481, + "rewards/ngram_similarity_reward/std": 0.33005306124687195, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 529.015625, + "completions/mean_terminated_length": 504.90478515625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.1651376146788991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05091036856174469, + "learning_rate": 4.987241300813559e-06, + "loss": -0.0226, + "num_tokens": 59123152.0, + "reward": 1.76096510887146, + "reward_std": 1.6029728651046753, + "rewards/accuracy_reward/mean": 1.40625, + "rewards/accuracy_reward/std": 3.006441593170166, + "rewards/ngram_similarity_reward/mean": 0.3547152280807495, + "rewards/ngram_similarity_reward/std": 0.21333430707454681, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 485.15625, + "completions/mean_terminated_length": 485.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.165585142089953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046371713280677795, + "learning_rate": 4.987063651956025e-06, + "loss": -0.0032, + "num_tokens": 59267834.0, + "reward": 2.320300579071045, + "reward_std": 1.143520474433899, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.2890505790710449, + "rewards/ngram_similarity_reward/std": 0.3301846385002136, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 385.5, + "completions/mean_terminated_length": 385.5, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.16603266950100692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059546373784542084, + "learning_rate": 4.98688477843129e-06, + "loss": 0.016, + "num_tokens": 59411946.0, + "reward": 3.2313122749328613, + "reward_std": 1.873366355895996, + "rewards/accuracy_reward/mean": 2.734375, + "rewards/accuracy_reward/std": 3.1760122776031494, + "rewards/ngram_similarity_reward/mean": 0.4969370663166046, + "rewards/ngram_similarity_reward/std": 0.2913515865802765, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 505.6875, + "completions/mean_terminated_length": 505.6875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.16648019691206087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045408546924591064, + "learning_rate": 4.986704680337274e-06, + "loss": -0.0131, + "num_tokens": 59589190.0, + "reward": 1.599818229675293, + "reward_std": 0.12945838272571564, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.6154431104660034, + "rewards/ngram_similarity_reward/std": 0.3580954074859619, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 529.296875, + "completions/mean_terminated_length": 529.296875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.16692772432311478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04404851794242859, + "learning_rate": 4.986523357772573e-06, + "loss": -0.0016, + "num_tokens": 59736329.0, + "reward": 5.319042205810547, + "reward_std": 0.9707709550857544, + "rewards/accuracy_reward/mean": 4.953125, + "rewards/accuracy_reward/std": 1.8934279680252075, + "rewards/ngram_similarity_reward/mean": 0.36591705679893494, + "rewards/ngram_similarity_reward/std": 0.2848031520843506, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 450.40625, + "completions/mean_terminated_length": 450.40625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.16737525173416873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05363212525844574, + "learning_rate": 4.9863408108364506e-06, + "loss": 0.0066, + "num_tokens": 59854755.0, + "reward": 2.6478450298309326, + "reward_std": 0.7685288190841675, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5228448510169983, + "rewards/ngram_similarity_reward/std": 0.3450181782245636, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 462.640625, + "completions/mean_terminated_length": 462.640625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.16782277914522264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04987025633454323, + "learning_rate": 4.986157039628841e-06, + "loss": 0.0063, + "num_tokens": 60053868.0, + "reward": 2.8956449031829834, + "reward_std": 1.9078601598739624, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.4112697243690491, + "rewards/ngram_similarity_reward/std": 0.206552654504776, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 444.359375, + "completions/mean_terminated_length": 444.359375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.16827030655627656, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04339223727583885, + "learning_rate": 4.9859720442503465e-06, + "loss": -0.0199, + "num_tokens": 60211587.0, + "reward": 2.1971282958984375, + "reward_std": 1.369988203048706, + "rewards/accuracy_reward/mean": 1.625, + "rewards/accuracy_reward/std": 3.0420336723327637, + "rewards/ngram_similarity_reward/mean": 0.572128415107727, + "rewards/ngram_similarity_reward/std": 0.443522572517395, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 451.453125, + "completions/mean_terminated_length": 451.453125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.1687178339673305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049212515354156494, + "learning_rate": 4.985785824802244e-06, + "loss": -0.0071, + "num_tokens": 60375920.0, + "reward": 2.760453701019287, + "reward_std": 1.8617186546325684, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.26045358180999756, + "rewards/ngram_similarity_reward/std": 0.17604276537895203, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 588.421875, + "completions/mean_terminated_length": 588.421875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.16916536137838442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04697156697511673, + "learning_rate": 4.985598381386479e-06, + "loss": 0.0357, + "num_tokens": 60516331.0, + "reward": 4.39042854309082, + "reward_std": 0.49993008375167847, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.49980348348617554, + "rewards/ngram_similarity_reward/std": 0.2500842213630676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 433.890625, + "completions/mean_terminated_length": 433.890625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.16961288878943837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05765402689576149, + "learning_rate": 4.985409714105665e-06, + "loss": -0.0063, + "num_tokens": 60666036.0, + "reward": 4.322384834289551, + "reward_std": 1.326669454574585, + "rewards/accuracy_reward/mean": 3.859375, + "rewards/accuracy_reward/std": 2.7566208839416504, + "rewards/ngram_similarity_reward/mean": 0.46300965547561646, + "rewards/ngram_similarity_reward/std": 0.18290044367313385, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 538.5, + "completions/mean_terminated_length": 538.5, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.17006041620049228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04762609675526619, + "learning_rate": 4.985219823063086e-06, + "loss": -0.0093, + "num_tokens": 60784148.0, + "reward": 1.2124340534210205, + "reward_std": 1.3166241645812988, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.3999340534210205, + "rewards/ngram_similarity_reward/std": 0.2689370810985565, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 446.0625, + "completions/mean_terminated_length": 446.0625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.1705079436115462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05093666911125183, + "learning_rate": 4.985028708362697e-06, + "loss": -0.005, + "num_tokens": 60923352.0, + "reward": 4.361514091491699, + "reward_std": 0.5552492737770081, + "rewards/accuracy_reward/mean": 3.859375, + "rewards/accuracy_reward/std": 2.7566208839416504, + "rewards/ngram_similarity_reward/mean": 0.5021390318870544, + "rewards/ngram_similarity_reward/std": 0.35762351751327515, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 460.96875, + "completions/mean_terminated_length": 460.96875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.17095547102260014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050654876977205276, + "learning_rate": 4.984836370109124e-06, + "loss": 0.0272, + "num_tokens": 61069670.0, + "reward": 3.0200891494750977, + "reward_std": 0.8383454084396362, + "rewards/accuracy_reward/mean": 2.421875, + "rewards/accuracy_reward/std": 3.113231897354126, + "rewards/ngram_similarity_reward/mean": 0.5982141494750977, + "rewards/ngram_similarity_reward/std": 0.4427201449871063, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 472.90625, + "completions/mean_terminated_length": 472.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.17140299843365406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05232694745063782, + "learning_rate": 4.98464280840766e-06, + "loss": -0.0054, + "num_tokens": 61196592.0, + "reward": 4.779727935791016, + "reward_std": 0.6334294080734253, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5922282934188843, + "rewards/ngram_similarity_reward/std": 0.34544986486434937, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 408.15625, + "completions/mean_terminated_length": 408.15625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.171850525844708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06515532732009888, + "learning_rate": 4.98444802336427e-06, + "loss": 0.0095, + "num_tokens": 61394442.0, + "reward": 3.603017807006836, + "reward_std": 1.3876564502716064, + "rewards/accuracy_reward/mean": 3.328125, + "rewards/accuracy_reward/std": 2.9252848625183105, + "rewards/ngram_similarity_reward/mean": 0.274892657995224, + "rewards/ngram_similarity_reward/std": 0.20359322428703308, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 446.3125, + "completions/mean_terminated_length": 446.3125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.17229805325576192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05600956082344055, + "learning_rate": 4.984252015085588e-06, + "loss": 0.0006, + "num_tokens": 61581278.0, + "reward": 1.1497323513031006, + "reward_std": 2.092327356338501, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.688710927963257, + "rewards/ngram_similarity_reward/mean": 0.24348226189613342, + "rewards/ngram_similarity_reward/std": 0.1237434595823288, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 478.921875, + "completions/mean_terminated_length": 478.921875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.17274558066681583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04640120267868042, + "learning_rate": 4.9840547836789175e-06, + "loss": -0.0179, + "num_tokens": 61737433.0, + "reward": 6.133259296417236, + "reward_std": 0.5081021189689636, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.7270093560218811, + "rewards/ngram_similarity_reward/std": 0.27775880694389343, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 521.71875, + "completions/mean_terminated_length": 521.71875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.17319310807786978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044407736510038376, + "learning_rate": 4.9838563292522304e-06, + "loss": 0.0146, + "num_tokens": 61899751.0, + "reward": 3.686185359954834, + "reward_std": 2.0083577632904053, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.6393104791641235, + "rewards/ngram_similarity_reward/std": 0.3743637204170227, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 469.390625, + "completions/mean_terminated_length": 469.390625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.1736406354889237, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045903291553258896, + "learning_rate": 4.983656651914172e-06, + "loss": -0.0037, + "num_tokens": 62064528.0, + "reward": 3.4533281326293945, + "reward_std": 2.146829843521118, + "rewards/accuracy_reward/mean": 2.921875, + "rewards/accuracy_reward/std": 3.0488338470458984, + "rewards/ngram_similarity_reward/mean": 0.5314529538154602, + "rewards/ngram_similarity_reward/std": 0.30242854356765747, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 518.015625, + "completions/mean_terminated_length": 518.015625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.1740881628999776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046720489859580994, + "learning_rate": 4.983455751774051e-06, + "loss": -0.008, + "num_tokens": 62232193.0, + "reward": 2.3623266220092773, + "reward_std": 1.4893290996551514, + "rewards/accuracy_reward/mean": 1.890625, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.47170156240463257, + "rewards/ngram_similarity_reward/std": 0.19742360711097717, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 616.84375, + "completions/mean_terminated_length": 594.1270141601562, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.17453569031103155, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041487254202365875, + "learning_rate": 4.983253628941852e-06, + "loss": 0.0107, + "num_tokens": 62382647.0, + "reward": 0.36744025349617004, + "reward_std": 1.2747513055801392, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 1.790558934211731, + "rewards/ngram_similarity_reward/mean": 0.35181528329849243, + "rewards/ngram_similarity_reward/std": 0.24288035929203033, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 403.078125, + "completions/mean_terminated_length": 403.078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.17498321772208547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05197535827755928, + "learning_rate": 4.983050283528224e-06, + "loss": -0.0025, + "num_tokens": 62521292.0, + "reward": 5.143540382385254, + "reward_std": 1.4719245433807373, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.5810403823852539, + "rewards/ngram_similarity_reward/std": 0.38260239362716675, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 339.984375, + "completions/mean_terminated_length": 339.984375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.17543074513313942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07508906722068787, + "learning_rate": 4.982845715644489e-06, + "loss": -0.008, + "num_tokens": 62662379.0, + "reward": 2.900442600250244, + "reward_std": 0.5704430341720581, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.32231757044792175, + "rewards/ngram_similarity_reward/std": 0.35339492559432983, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 451.125, + "completions/mean_terminated_length": 451.125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.17587827254419333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057737164199352264, + "learning_rate": 4.982639925402636e-06, + "loss": -0.0154, + "num_tokens": 62806643.0, + "reward": 4.112510681152344, + "reward_std": 0.8752514123916626, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.3312610387802124, + "rewards/ngram_similarity_reward/std": 0.19397112727165222, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 377.046875, + "completions/mean_terminated_length": 377.046875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.17632579995524725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0579199455678463, + "learning_rate": 4.982432912915321e-06, + "loss": 0.0082, + "num_tokens": 62972886.0, + "reward": 3.8911335468292236, + "reward_std": 1.3057481050491333, + "rewards/accuracy_reward/mean": 3.40625, + "rewards/accuracy_reward/std": 2.920745372772217, + "rewards/ngram_similarity_reward/mean": 0.4848836660385132, + "rewards/ngram_similarity_reward/std": 0.46673062443733215, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 411.46875, + "completions/mean_terminated_length": 411.46875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.1767733273663012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06127110496163368, + "learning_rate": 4.982224678295876e-06, + "loss": 0.0187, + "num_tokens": 63124420.0, + "reward": 1.1061592102050781, + "reward_std": 0.47718849778175354, + "rewards/accuracy_reward/mean": 0.859375, + "rewards/accuracy_reward/std": 2.8416807651519775, + "rewards/ngram_similarity_reward/mean": 0.24678421020507812, + "rewards/ngram_similarity_reward/std": 0.08838558197021484, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 373.90625, + "completions/mean_terminated_length": 373.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.1772208547773551, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05738307535648346, + "learning_rate": 4.982015221658294e-06, + "loss": 0.0137, + "num_tokens": 63244862.0, + "reward": 4.085636138916016, + "reward_std": 0.895679235458374, + "rewards/accuracy_reward/mean": 3.671875, + "rewards/accuracy_reward/std": 2.8427278995513916, + "rewards/ngram_similarity_reward/mean": 0.4137610197067261, + "rewards/ngram_similarity_reward/std": 0.353694349527359, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 462.734375, + "completions/mean_terminated_length": 462.734375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.17766838218840905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05412351340055466, + "learning_rate": 4.981804543117243e-06, + "loss": 0.0322, + "num_tokens": 63466141.0, + "reward": 2.1925134658813477, + "reward_std": 1.0302202701568604, + "rewards/accuracy_reward/mean": 1.671875, + "rewards/accuracy_reward/std": 3.0002894401550293, + "rewards/ngram_similarity_reward/mean": 0.5206387042999268, + "rewards/ngram_similarity_reward/std": 0.35803091526031494, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 448.421875, + "completions/mean_terminated_length": 448.421875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.17811590959946297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05046705901622772, + "learning_rate": 4.9815926427880575e-06, + "loss": 0.0316, + "num_tokens": 63606904.0, + "reward": 1.6938735246658325, + "reward_std": 1.4424854516983032, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.41262343525886536, + "rewards/ngram_similarity_reward/std": 0.31365764141082764, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 449.6875, + "completions/mean_terminated_length": 424.3174743652344, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.17856343701051688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0631108209490776, + "learning_rate": 4.981379520786742e-06, + "loss": -0.0484, + "num_tokens": 63857620.0, + "reward": 1.294722080230713, + "reward_std": 1.6329095363616943, + "rewards/accuracy_reward/mean": 0.921875, + "rewards/accuracy_reward/std": 2.6773874759674072, + "rewards/ngram_similarity_reward/mean": 0.3728471100330353, + "rewards/ngram_similarity_reward/std": 0.2743929624557495, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 319.984375, + "completions/mean_terminated_length": 319.984375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.17901096442157083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07036460191011429, + "learning_rate": 4.981165177229967e-06, + "loss": -0.0157, + "num_tokens": 64122227.0, + "reward": 4.307033538818359, + "reward_std": 0.4799138009548187, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.4007837772369385, + "rewards/ngram_similarity_reward/std": 0.2723061740398407, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 531.5625, + "completions/mean_terminated_length": 531.5625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.17945849183262474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04598090052604675, + "learning_rate": 4.980949612235073e-06, + "loss": -0.0366, + "num_tokens": 64286343.0, + "reward": 6.062546730041504, + "reward_std": 0.4906230568885803, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6562970876693726, + "rewards/ngram_similarity_reward/std": 0.25769153237342834, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 446.140625, + "completions/mean_terminated_length": 446.140625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1799060192436787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05102456733584404, + "learning_rate": 4.980732825920072e-06, + "loss": 0.0096, + "num_tokens": 64425040.0, + "reward": 4.750601768493652, + "reward_std": 1.248483419418335, + "rewards/accuracy_reward/mean": 4.359375, + "rewards/accuracy_reward/std": 2.3962087631225586, + "rewards/ngram_similarity_reward/mean": 0.39122653007507324, + "rewards/ngram_similarity_reward/std": 0.261292427778244, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 522.59375, + "completions/mean_terminated_length": 522.59375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.1803535466547326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04561065137386322, + "learning_rate": 4.980514818403642e-06, + "loss": -0.0013, + "num_tokens": 64586262.0, + "reward": 3.251201629638672, + "reward_std": 1.5547279119491577, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.579326868057251, + "rewards/ngram_similarity_reward/std": 0.26619914174079895, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 531.46875, + "completions/mean_terminated_length": 531.46875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.18080107406578652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04496293514966965, + "learning_rate": 4.980295589805129e-06, + "loss": 0.0011, + "num_tokens": 64718356.0, + "reward": 4.318756580352783, + "reward_std": 0.5536075830459595, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.42813175916671753, + "rewards/ngram_similarity_reward/std": 0.35951486229896545, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 465.296875, + "completions/mean_terminated_length": 465.296875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.18124860147684047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051899295300245285, + "learning_rate": 4.980075140244548e-06, + "loss": 0.0078, + "num_tokens": 64855735.0, + "reward": 3.4069631099700928, + "reward_std": 0.9889413118362427, + "rewards/accuracy_reward/mean": 3.03125, + "rewards/accuracy_reward/std": 3.0130341053009033, + "rewards/ngram_similarity_reward/mean": 0.37571316957473755, + "rewards/ngram_similarity_reward/std": 0.3223586082458496, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 495.046875, + "completions/mean_terminated_length": 495.046875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.18169612888789438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0473325289785862, + "learning_rate": 4.979853469842584e-06, + "loss": 0.0247, + "num_tokens": 65002106.0, + "reward": 3.3111226558685303, + "reward_std": 0.7830137610435486, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5298726558685303, + "rewards/ngram_similarity_reward/std": 0.29405301809310913, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 481.140625, + "completions/mean_terminated_length": 481.140625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.1821436562989483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046513911336660385, + "learning_rate": 4.97963057872059e-06, + "loss": 0.0184, + "num_tokens": 65127523.0, + "reward": 2.0190467834472656, + "reward_std": 0.8510860800743103, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.2690466642379761, + "rewards/ngram_similarity_reward/std": 0.2904336154460907, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 418.40625, + "completions/mean_terminated_length": 418.40625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.18259118371000224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05490051954984665, + "learning_rate": 4.979406467000583e-06, + "loss": -0.0138, + "num_tokens": 65250397.0, + "reward": 2.7550859451293945, + "reward_std": 0.2528682351112366, + "rewards/accuracy_reward/mean": 2.34375, + "rewards/accuracy_reward/std": 3.1983067989349365, + "rewards/ngram_similarity_reward/mean": 0.41133588552474976, + "rewards/ngram_similarity_reward/std": 0.3694441020488739, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 513.296875, + "completions/mean_terminated_length": 513.296875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.18303871112105616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0506632924079895, + "learning_rate": 4.979181134805255e-06, + "loss": -0.0235, + "num_tokens": 65420032.0, + "reward": 3.2668845653533936, + "reward_std": 1.1849749088287354, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4856344759464264, + "rewards/ngram_similarity_reward/std": 0.29462864995002747, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 484.734375, + "completions/mean_terminated_length": 484.734375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.1834862385321101, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04931401088833809, + "learning_rate": 4.978954582257961e-06, + "loss": -0.0443, + "num_tokens": 65605263.0, + "reward": 2.908878803253174, + "reward_std": 1.4477934837341309, + "rewards/accuracy_reward/mean": 2.4375, + "rewards/accuracy_reward/std": 3.095695972442627, + "rewards/ngram_similarity_reward/mean": 0.47137901186943054, + "rewards/ngram_similarity_reward/std": 0.2872864902019501, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 544.03125, + "completions/mean_terminated_length": 544.03125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.18393376594316402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0414864607155323, + "learning_rate": 4.978726809482727e-06, + "loss": 0.0662, + "num_tokens": 65764945.0, + "reward": 3.0413129329681396, + "reward_std": 1.360729694366455, + "rewards/accuracy_reward/mean": 2.453125, + "rewards/accuracy_reward/std": 3.287444591522217, + "rewards/ngram_similarity_reward/mean": 0.5881880521774292, + "rewards/ngram_similarity_reward/std": 0.3917367160320282, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 467.8125, + "completions/mean_terminated_length": 467.8125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.18438129335421793, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049397651106119156, + "learning_rate": 4.978497816604244e-06, + "loss": -0.0061, + "num_tokens": 65935605.0, + "reward": 4.930202484130859, + "reward_std": 0.7822836637496948, + "rewards/accuracy_reward/mean": 4.265625, + "rewards/accuracy_reward/std": 2.467195510864258, + "rewards/ngram_similarity_reward/mean": 0.6645776033401489, + "rewards/ngram_similarity_reward/std": 0.3582763373851776, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 499.21875, + "completions/mean_terminated_length": 499.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.18482882076527188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05851361155509949, + "learning_rate": 4.978267603747875e-06, + "loss": 0.0516, + "num_tokens": 66133683.0, + "reward": 2.65552020072937, + "reward_std": 1.2815545797348022, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4367702901363373, + "rewards/ngram_similarity_reward/std": 0.2824799120426178, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 353.453125, + "completions/mean_terminated_length": 353.453125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.1852763481763258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06710066646337509, + "learning_rate": 4.9780361710396475e-06, + "loss": -0.0202, + "num_tokens": 66249888.0, + "reward": 2.8284106254577637, + "reward_std": 1.3892254829406738, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.35966062545776367, + "rewards/ngram_similarity_reward/std": 0.2577352225780487, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 420.3125, + "completions/mean_terminated_length": 420.3125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.18572387558737974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05144178494811058, + "learning_rate": 4.977803518606258e-06, + "loss": -0.0241, + "num_tokens": 66384324.0, + "reward": 4.493481636047363, + "reward_std": 1.0047578811645508, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5872312188148499, + "rewards/ngram_similarity_reward/std": 0.29360121488571167, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 418.375, + "completions/mean_terminated_length": 418.375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.18617140299843365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06043994054198265, + "learning_rate": 4.977569646575071e-06, + "loss": -0.0143, + "num_tokens": 66561676.0, + "reward": 4.20809268951416, + "reward_std": 0.9526137709617615, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.4112175703048706, + "rewards/ngram_similarity_reward/std": 0.2842460870742798, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 450.71875, + "completions/mean_terminated_length": 450.71875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.18661893040948757, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04853370413184166, + "learning_rate": 4.977334555074119e-06, + "loss": -0.0126, + "num_tokens": 66737594.0, + "reward": 0.9017431735992432, + "reward_std": 1.474316954612732, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 2.312781572341919, + "rewards/ngram_similarity_reward/mean": 0.41736823320388794, + "rewards/ngram_similarity_reward/std": 0.3200174868106842, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 335.046875, + "completions/mean_terminated_length": 335.046875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.18706645782054152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06876197457313538, + "learning_rate": 4.977098244232099e-06, + "loss": -0.013, + "num_tokens": 66936701.0, + "reward": 4.07749080657959, + "reward_std": 0.8316932916641235, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.45249098539352417, + "rewards/ngram_similarity_reward/std": 0.2823086380958557, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 461.6875, + "completions/mean_terminated_length": 461.6875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.18751398523159543, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05493154749274254, + "learning_rate": 4.97686071417838e-06, + "loss": 0.0447, + "num_tokens": 67153945.0, + "reward": 5.43075704574585, + "reward_std": 1.6792662143707275, + "rewards/accuracy_reward/mean": 4.921875, + "rewards/accuracy_reward/std": 1.8153201341629028, + "rewards/ngram_similarity_reward/mean": 0.5088820457458496, + "rewards/ngram_similarity_reward/std": 0.3658638596534729, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 549.71875, + "completions/mean_terminated_length": 549.71875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.18796151264264938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04870909824967384, + "learning_rate": 4.976621965042996e-06, + "loss": -0.0049, + "num_tokens": 67330359.0, + "reward": 3.679755210876465, + "reward_std": 1.5922231674194336, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.7266303300857544, + "rewards/ngram_similarity_reward/std": 0.3412902057170868, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 462.234375, + "completions/mean_terminated_length": 462.234375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.1884090400537033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056523796170949936, + "learning_rate": 4.97638199695665e-06, + "loss": -0.0166, + "num_tokens": 67511190.0, + "reward": 2.990081787109375, + "reward_std": 2.6330084800720215, + "rewards/accuracy_reward/mean": 2.296875, + "rewards/accuracy_reward/std": 3.0351366996765137, + "rewards/ngram_similarity_reward/mean": 0.693206787109375, + "rewards/ngram_similarity_reward/std": 0.2941289246082306, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 462.171875, + "completions/mean_terminated_length": 462.171875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.1888565674647572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05080768093466759, + "learning_rate": 4.9761408100507094e-06, + "loss": 0.0207, + "num_tokens": 67662577.0, + "reward": 4.520578861236572, + "reward_std": 0.07399225234985352, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5205788016319275, + "rewards/ngram_similarity_reward/std": 0.4589429497718811, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 475.40625, + "completions/mean_terminated_length": 475.40625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.18930409487581115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05515008419752121, + "learning_rate": 4.97589840445721e-06, + "loss": -0.0312, + "num_tokens": 67851435.0, + "reward": 0.9724314212799072, + "reward_std": 1.1153570413589478, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 2.6751632690429688, + "rewards/ngram_similarity_reward/mean": 0.425556480884552, + "rewards/ngram_similarity_reward/std": 0.25544473528862, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 475.71875, + "completions/mean_terminated_length": 475.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.18975162228686507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048770152032375336, + "learning_rate": 4.975654780308857e-06, + "loss": -0.0423, + "num_tokens": 67989177.0, + "reward": 3.2835075855255127, + "reward_std": 1.7055535316467285, + "rewards/accuracy_reward/mean": 2.90625, + "rewards/accuracy_reward/std": 3.069143772125244, + "rewards/ngram_similarity_reward/mean": 0.37725764513015747, + "rewards/ngram_similarity_reward/std": 0.29064175486564636, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 491.15625, + "completions/mean_terminated_length": 491.15625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.19019914969791898, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04984172061085701, + "learning_rate": 4.975409937739021e-06, + "loss": 0.0371, + "num_tokens": 68125651.0, + "reward": 5.302239418029785, + "reward_std": 0.8608360290527344, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6459895372390747, + "rewards/ngram_similarity_reward/std": 0.2360706925392151, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 480.828125, + "completions/mean_terminated_length": 480.828125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.19064667710897293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04941911622881889, + "learning_rate": 4.9751638768817385e-06, + "loss": 0.0341, + "num_tokens": 68249832.0, + "reward": 2.1362762451171875, + "reward_std": 1.6013157367706299, + "rewards/accuracy_reward/mean": 1.640625, + "rewards/accuracy_reward/std": 2.91611385345459, + "rewards/ngram_similarity_reward/mean": 0.4956514239311218, + "rewards/ngram_similarity_reward/std": 0.2930045425891876, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 375.46875, + "completions/mean_terminated_length": 375.46875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.19109420452002684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0572495236992836, + "learning_rate": 4.974916597871714e-06, + "loss": 0.0218, + "num_tokens": 68406470.0, + "reward": 0.5266842246055603, + "reward_std": 1.5081195831298828, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 2.0653390884399414, + "rewards/ngram_similarity_reward/mean": 0.3860591948032379, + "rewards/ngram_similarity_reward/std": 0.25203803181648254, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 577.796875, + "completions/mean_terminated_length": 577.796875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.1915417319310808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04633355140686035, + "learning_rate": 4.97466810084432e-06, + "loss": 0.0083, + "num_tokens": 68562169.0, + "reward": 3.6001086235046387, + "reward_std": 1.9819128513336182, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6313588619232178, + "rewards/ngram_similarity_reward/std": 0.24967895448207855, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 432.90625, + "completions/mean_terminated_length": 432.90625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.1919892593421347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04926011711359024, + "learning_rate": 4.974418385935594e-06, + "loss": 0.0302, + "num_tokens": 68738451.0, + "reward": 4.414224624633789, + "reward_std": 0.6799229383468628, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6017246246337891, + "rewards/ngram_similarity_reward/std": 0.3715902268886566, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 490.4375, + "completions/mean_terminated_length": 490.4375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.19243678675318862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051539380103349686, + "learning_rate": 4.97416745328224e-06, + "loss": 0.0126, + "num_tokens": 68862127.0, + "reward": 3.573207378387451, + "reward_std": 0.9432869553565979, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.43258219957351685, + "rewards/ngram_similarity_reward/std": 0.22983446717262268, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 526.6875, + "completions/mean_terminated_length": 526.6875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.19288431416424257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04874693974852562, + "learning_rate": 4.973915303021632e-06, + "loss": 0.0028, + "num_tokens": 68999963.0, + "reward": 2.221597194671631, + "reward_std": 1.8298258781433105, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.37784695625305176, + "rewards/ngram_similarity_reward/std": 0.1993177980184555, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 319.421875, + "completions/mean_terminated_length": 319.421875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.19333184157529648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08644500374794006, + "learning_rate": 4.973661935291807e-06, + "loss": -0.025, + "num_tokens": 69236182.0, + "reward": 3.3783063888549805, + "reward_std": 1.6269056797027588, + "rewards/accuracy_reward/mean": 2.703125, + "rewards/accuracy_reward/std": 3.315091609954834, + "rewards/ngram_similarity_reward/mean": 0.6751815676689148, + "rewards/ngram_similarity_reward/std": 0.40489012002944946, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 518.09375, + "completions/mean_terminated_length": 518.09375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.19377936898635043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05006977915763855, + "learning_rate": 4.973407350231469e-06, + "loss": 0.006, + "num_tokens": 69429164.0, + "reward": 4.490677833557129, + "reward_std": 0.5806868076324463, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.6000529527664185, + "rewards/ngram_similarity_reward/std": 0.3240531384944916, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 512.03125, + "completions/mean_terminated_length": 512.03125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.19422689639740434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04619980603456497, + "learning_rate": 4.97315154797999e-06, + "loss": 0.0044, + "num_tokens": 69583438.0, + "reward": 4.255029678344727, + "reward_std": 0.7939695119857788, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.4737798273563385, + "rewards/ngram_similarity_reward/std": 0.2946584224700928, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 482.640625, + "completions/mean_terminated_length": 482.640625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.19467442380845826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05399477854371071, + "learning_rate": 4.972894528677406e-06, + "loss": 0.0359, + "num_tokens": 69739415.0, + "reward": 4.601756572723389, + "reward_std": 2.62172532081604, + "rewards/accuracy_reward/mean": 3.953125, + "rewards/accuracy_reward/std": 2.7076005935668945, + "rewards/ngram_similarity_reward/mean": 0.6486316919326782, + "rewards/ngram_similarity_reward/std": 0.34401828050613403, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 365.109375, + "completions/mean_terminated_length": 365.109375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.1951219512195122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07542918622493744, + "learning_rate": 4.972636292464423e-06, + "loss": 0.0384, + "num_tokens": 69959870.0, + "reward": 2.8980531692504883, + "reward_std": 1.627511978149414, + "rewards/accuracy_reward/mean": 2.296875, + "rewards/accuracy_reward/std": 3.143043279647827, + "rewards/ngram_similarity_reward/mean": 0.6011780500411987, + "rewards/ngram_similarity_reward/std": 0.3964519798755646, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 479.40625, + "completions/mean_terminated_length": 479.40625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.19556947863056612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052132755517959595, + "learning_rate": 4.9723768394824085e-06, + "loss": 0.0189, + "num_tokens": 70115896.0, + "reward": 4.600559234619141, + "reward_std": 1.1112843751907349, + "rewards/accuracy_reward/mean": 4.171875, + "rewards/accuracy_reward/std": 2.5326733589172363, + "rewards/ngram_similarity_reward/mean": 0.42868444323539734, + "rewards/ngram_similarity_reward/std": 0.40038877725601196, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 509.609375, + "completions/mean_terminated_length": 509.609375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.19601700604162006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05371229350566864, + "learning_rate": 4.9721161698734e-06, + "loss": -0.0293, + "num_tokens": 70343679.0, + "reward": 2.7435035705566406, + "reward_std": 0.9512712359428406, + "rewards/accuracy_reward/mean": 2.171875, + "rewards/accuracy_reward/std": 3.060525417327881, + "rewards/ngram_similarity_reward/mean": 0.5716284513473511, + "rewards/ngram_similarity_reward/std": 0.4092555046081543, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 508.78125, + "completions/mean_terminated_length": 508.78125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.19646453345267398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053464487195014954, + "learning_rate": 4.971854283780099e-06, + "loss": -0.0217, + "num_tokens": 70481521.0, + "reward": 4.153450012207031, + "reward_std": 1.3068652153015137, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.7159500122070312, + "rewards/ngram_similarity_reward/std": 0.3677367568016052, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 403.71875, + "completions/mean_terminated_length": 403.71875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.1969120608637279, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05618702620267868, + "learning_rate": 4.971591181345874e-06, + "loss": -0.0447, + "num_tokens": 70617183.0, + "reward": 2.9440038204193115, + "reward_std": 1.3587820529937744, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.4440038800239563, + "rewards/ngram_similarity_reward/std": 0.38334041833877563, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 487.578125, + "completions/mean_terminated_length": 487.578125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.19735958827478184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049524884670972824, + "learning_rate": 4.971326862714757e-06, + "loss": 0.0157, + "num_tokens": 70767364.0, + "reward": 4.090793609619141, + "reward_std": 1.2506130933761597, + "rewards/accuracy_reward/mean": 3.5, + "rewards/accuracy_reward/std": 2.8894994258880615, + "rewards/ngram_similarity_reward/mean": 0.5907935500144958, + "rewards/ngram_similarity_reward/std": 0.29597073793411255, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 392.5, + "completions/mean_terminated_length": 392.5, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.19780711568583575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0690118670463562, + "learning_rate": 4.97106132803145e-06, + "loss": -0.0388, + "num_tokens": 70945348.0, + "reward": 2.892568588256836, + "reward_std": 2.564103126525879, + "rewards/accuracy_reward/mean": 2.53125, + "rewards/accuracy_reward/std": 3.0961766242980957, + "rewards/ngram_similarity_reward/mean": 0.36131858825683594, + "rewards/ngram_similarity_reward/std": 0.3403457701206207, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 441.4375, + "completions/mean_terminated_length": 441.4375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.19825464309688967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05034182220697403, + "learning_rate": 4.9707945774413194e-06, + "loss": -0.019, + "num_tokens": 71070976.0, + "reward": 3.9274301528930664, + "reward_std": 0.9560231566429138, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6774301528930664, + "rewards/ngram_similarity_reward/std": 0.3557387590408325, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 499.6875, + "completions/mean_terminated_length": 499.6875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.19870217050794362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04697749763727188, + "learning_rate": 4.970526611090391e-06, + "loss": 0.004, + "num_tokens": 71228108.0, + "reward": 3.823631763458252, + "reward_std": 0.9873033165931702, + "rewards/accuracy_reward/mean": 3.390625, + "rewards/accuracy_reward/std": 2.944552183151245, + "rewards/ngram_similarity_reward/mean": 0.4330069422721863, + "rewards/ngram_similarity_reward/std": 0.2955899238586426, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 389.9375, + "completions/mean_terminated_length": 389.9375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.19914969791899753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06391636282205582, + "learning_rate": 4.970257429125368e-06, + "loss": -0.0304, + "num_tokens": 71359784.0, + "reward": 1.6688861846923828, + "reward_std": 0.9762160181999207, + "rewards/accuracy_reward/mean": 1.359375, + "rewards/accuracy_reward/std": 3.1515538692474365, + "rewards/ngram_similarity_reward/mean": 0.3095111846923828, + "rewards/ngram_similarity_reward/std": 0.3359506130218506, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 442.3125, + "completions/mean_terminated_length": 442.3125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.19959722533005148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0558396577835083, + "learning_rate": 4.969987031693606e-06, + "loss": 0.0272, + "num_tokens": 71522284.0, + "reward": 4.019309997558594, + "reward_std": 0.8522672653198242, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5818102359771729, + "rewards/ngram_similarity_reward/std": 0.2885514795780182, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 549.1875, + "completions/mean_terminated_length": 549.1875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.2000447527411054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04167667776346207, + "learning_rate": 4.969715418943137e-06, + "loss": -0.0106, + "num_tokens": 71676392.0, + "reward": 4.276381492614746, + "reward_std": 0.9113213419914246, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.8388814330101013, + "rewards/ngram_similarity_reward/std": 0.19755546748638153, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 443.671875, + "completions/mean_terminated_length": 443.671875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.2004922801521593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06569624692201614, + "learning_rate": 4.969442591022653e-06, + "loss": -0.0035, + "num_tokens": 71835811.0, + "reward": 2.307880163192749, + "reward_std": 1.4398939609527588, + "rewards/accuracy_reward/mean": 1.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5266302824020386, + "rewards/ngram_similarity_reward/std": 0.37980908155441284, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 511.53125, + "completions/mean_terminated_length": 511.53125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.20093980756321325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05349518358707428, + "learning_rate": 4.969168548081511e-06, + "loss": -0.0157, + "num_tokens": 72012293.0, + "reward": 4.331980228424072, + "reward_std": 0.8488610982894897, + "rewards/accuracy_reward/mean": 3.6875, + "rewards/accuracy_reward/std": 2.816476583480835, + "rewards/ngram_similarity_reward/mean": 0.6444799304008484, + "rewards/ngram_similarity_reward/std": 0.289334774017334, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 461.234375, + "completions/mean_terminated_length": 461.234375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.20138733497426717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0612126961350441, + "learning_rate": 4.968893290269734e-06, + "loss": 0.029, + "num_tokens": 72167716.0, + "reward": 3.29874587059021, + "reward_std": 1.4539015293121338, + "rewards/accuracy_reward/mean": 2.9375, + "rewards/accuracy_reward/std": 3.028305768966675, + "rewards/ngram_similarity_reward/mean": 0.36124569177627563, + "rewards/ngram_similarity_reward/std": 0.24575692415237427, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 406.046875, + "completions/mean_terminated_length": 406.046875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.2018348623853211, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.062243226915597916, + "learning_rate": 4.968616817738013e-06, + "loss": -0.0154, + "num_tokens": 72343751.0, + "reward": 3.4030308723449707, + "reward_std": 1.9362056255340576, + "rewards/accuracy_reward/mean": 2.859375, + "rewards/accuracy_reward/std": 3.0203921794891357, + "rewards/ngram_similarity_reward/mean": 0.5436556339263916, + "rewards/ngram_similarity_reward/std": 0.3432004451751709, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 403.109375, + "completions/mean_terminated_length": 403.109375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.20228238979637503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05695686489343643, + "learning_rate": 4.968339130637696e-06, + "loss": 0.0208, + "num_tokens": 72479902.0, + "reward": 3.9481887817382812, + "reward_std": 0.9490264654159546, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5106890201568604, + "rewards/ngram_similarity_reward/std": 0.2993254065513611, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 416.84375, + "completions/mean_terminated_length": 416.84375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.20272991720742894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06031221151351929, + "learning_rate": 4.968060229120806e-06, + "loss": -0.0202, + "num_tokens": 72703972.0, + "reward": 4.6372809410095215, + "reward_std": 1.6896450519561768, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.44978126883506775, + "rewards/ngram_similarity_reward/std": 0.31349703669548035, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 401.71875, + "completions/mean_terminated_length": 401.71875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.2031774446184829, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06598486751317978, + "learning_rate": 4.967780113340025e-06, + "loss": 0.0328, + "num_tokens": 72925394.0, + "reward": 2.2886803150177, + "reward_std": 1.0122326612472534, + "rewards/accuracy_reward/mean": 1.859375, + "rewards/accuracy_reward/std": 3.046555280685425, + "rewards/ngram_similarity_reward/mean": 0.4293053448200226, + "rewards/ngram_similarity_reward/std": 0.3749549984931946, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 529.28125, + "completions/mean_terminated_length": 529.28125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.2036249720295368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047418881207704544, + "learning_rate": 4.9674987834486986e-06, + "loss": 0.0254, + "num_tokens": 73091668.0, + "reward": 4.394710063934326, + "reward_std": 0.22299596667289734, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.3947100043296814, + "rewards/ngram_similarity_reward/std": 0.3227953016757965, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 487.546875, + "completions/mean_terminated_length": 487.546875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.20407249944059075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05645826458930969, + "learning_rate": 4.967216239600842e-06, + "loss": -0.0032, + "num_tokens": 73235847.0, + "reward": 2.1269445419311523, + "reward_std": 2.6225547790527344, + "rewards/accuracy_reward/mean": 1.71875, + "rewards/accuracy_reward/std": 2.9572014808654785, + "rewards/ngram_similarity_reward/mean": 0.40819472074508667, + "rewards/ngram_similarity_reward/std": 0.2562521994113922, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 499.65625, + "completions/mean_terminated_length": 499.65625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.20452002685164467, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04859715700149536, + "learning_rate": 4.966932481951129e-06, + "loss": 0.0128, + "num_tokens": 73428353.0, + "reward": 2.2688302993774414, + "reward_std": 0.929410457611084, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5188302993774414, + "rewards/ngram_similarity_reward/std": 0.27700766921043396, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 362.0, + "completions/mean_terminated_length": 362.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.20496755426269858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05721988156437874, + "learning_rate": 4.966647510654904e-06, + "loss": 0.0061, + "num_tokens": 73564145.0, + "reward": 5.685698509216309, + "reward_std": 1.1184368133544922, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.5606982707977295, + "rewards/ngram_similarity_reward/std": 0.4015916883945465, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 557.5625, + "completions/mean_terminated_length": 557.5625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.20541508167375253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05014842748641968, + "learning_rate": 4.966361325868171e-06, + "loss": -0.0044, + "num_tokens": 73720213.0, + "reward": 3.8704371452331543, + "reward_std": 1.5883183479309082, + "rewards/accuracy_reward/mean": 3.40625, + "rewards/accuracy_reward/std": 2.920745372772217, + "rewards/ngram_similarity_reward/mean": 0.4641871154308319, + "rewards/ngram_similarity_reward/std": 0.29948803782463074, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 427.953125, + "completions/mean_terminated_length": 427.953125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.20586260908480644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059216953814029694, + "learning_rate": 4.9660739277476e-06, + "loss": -0.0108, + "num_tokens": 73874130.0, + "reward": 1.846233606338501, + "reward_std": 2.161407947540283, + "rewards/accuracy_reward/mean": 1.296875, + "rewards/accuracy_reward/std": 2.8406331539154053, + "rewards/ngram_similarity_reward/mean": 0.5493584275245667, + "rewards/ngram_similarity_reward/std": 0.36027437448501587, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 456.734375, + "completions/mean_terminated_length": 456.734375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.20631013649586036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06060659512877464, + "learning_rate": 4.965785316450528e-06, + "loss": 0.0245, + "num_tokens": 74051041.0, + "reward": 2.9211111068725586, + "reward_std": 0.16564224660396576, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.42111140489578247, + "rewards/ngram_similarity_reward/std": 0.2438206523656845, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 487.203125, + "completions/mean_terminated_length": 487.203125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2067576639069143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05391167104244232, + "learning_rate": 4.9654954921349504e-06, + "loss": -0.0394, + "num_tokens": 74239342.0, + "reward": 3.002830982208252, + "reward_std": 1.39364492893219, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.502831220626831, + "rewards/ngram_similarity_reward/std": 0.4091090261936188, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 490.59375, + "completions/mean_terminated_length": 490.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.20720519131796822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05286828801035881, + "learning_rate": 4.965204454959531e-06, + "loss": 0.0267, + "num_tokens": 74439316.0, + "reward": 2.826718807220459, + "reward_std": 1.2495957612991333, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5142186284065247, + "rewards/ngram_similarity_reward/std": 0.43164652585983276, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 429.765625, + "completions/mean_terminated_length": 429.765625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.20765271872902216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06051198020577431, + "learning_rate": 4.964912205083597e-06, + "loss": 0.0362, + "num_tokens": 74562565.0, + "reward": 4.667201042175293, + "reward_std": 2.926253318786621, + "rewards/accuracy_reward/mean": 4.015625, + "rewards/accuracy_reward/std": 2.713822364807129, + "rewards/ngram_similarity_reward/mean": 0.651576042175293, + "rewards/ngram_similarity_reward/std": 0.3751135468482971, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 315.484375, + "completions/mean_terminated_length": 315.484375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.20810024614007608, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07512948662042618, + "learning_rate": 4.964618742667139e-06, + "loss": -0.0049, + "num_tokens": 74709732.0, + "reward": 3.7250890731811523, + "reward_std": 0.9738008975982666, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.5063392519950867, + "rewards/ngram_similarity_reward/std": 0.46273863315582275, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 527.6875, + "completions/mean_terminated_length": 527.6875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.20854777355113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04736018553376198, + "learning_rate": 4.9643240678708085e-06, + "loss": -0.0261, + "num_tokens": 74866064.0, + "reward": 4.556824207305908, + "reward_std": 0.5301584005355835, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6505742073059082, + "rewards/ngram_similarity_reward/std": 0.28748565912246704, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 550.75, + "completions/mean_terminated_length": 550.75, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.20899530096218394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04929441213607788, + "learning_rate": 4.964028180855927e-06, + "loss": 0.0112, + "num_tokens": 74993584.0, + "reward": 1.8805651664733887, + "reward_std": 1.1419997215270996, + "rewards/accuracy_reward/mean": 1.34375, + "rewards/accuracy_reward/std": 2.8296544551849365, + "rewards/ngram_similarity_reward/mean": 0.5368151068687439, + "rewards/ngram_similarity_reward/std": 0.3858891427516937, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 442.03125, + "completions/mean_terminated_length": 442.03125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.20944282837323785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058800678700208664, + "learning_rate": 4.9637310817844745e-06, + "loss": 0.0347, + "num_tokens": 75155474.0, + "reward": 3.7268218994140625, + "reward_std": 0.9965977072715759, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5705718398094177, + "rewards/ngram_similarity_reward/std": 0.36925530433654785, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 479.84375, + "completions/mean_terminated_length": 479.84375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.2098903557842918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05136004462838173, + "learning_rate": 4.963432770819096e-06, + "loss": 0.0027, + "num_tokens": 75310568.0, + "reward": 4.841269493103027, + "reward_std": 0.8033795356750488, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.5600191354751587, + "rewards/ngram_similarity_reward/std": 0.28958660364151, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 430.953125, + "completions/mean_terminated_length": 430.953125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.21033788319534572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05915706232190132, + "learning_rate": 4.9631332481231004e-06, + "loss": -0.0117, + "num_tokens": 75459349.0, + "reward": 2.92802357673645, + "reward_std": 0.11898425221443176, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.42802348732948303, + "rewards/ngram_similarity_reward/std": 0.261121928691864, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 447.828125, + "completions/mean_terminated_length": 447.828125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.21078541060639963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06052910536527634, + "learning_rate": 4.962832513860459e-06, + "loss": 0.0336, + "num_tokens": 75594586.0, + "reward": 4.519056797027588, + "reward_std": 1.371473789215088, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5190566778182983, + "rewards/ngram_similarity_reward/std": 0.3726147711277008, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 455.3125, + "completions/mean_terminated_length": 455.3125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.21123293801745358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05712325870990753, + "learning_rate": 4.962530568195808e-06, + "loss": -0.0486, + "num_tokens": 75734366.0, + "reward": 1.448999047279358, + "reward_std": 0.5560168027877808, + "rewards/accuracy_reward/mean": 0.828125, + "rewards/accuracy_reward/std": 2.8622043132781982, + "rewards/ngram_similarity_reward/mean": 0.6208740472793579, + "rewards/ngram_similarity_reward/std": 0.4527631103992462, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 496.703125, + "completions/mean_terminated_length": 496.703125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.2116804654285075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04869071766734123, + "learning_rate": 4.962227411294446e-06, + "loss": -0.0011, + "num_tokens": 75903963.0, + "reward": 1.5254019498825073, + "reward_std": 0.2591969966888428, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.5410269498825073, + "rewards/ngram_similarity_reward/std": 0.337963342666626, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 463.625, + "completions/mean_terminated_length": 463.625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.21212799283956144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056896839290857315, + "learning_rate": 4.961923043322333e-06, + "loss": 0.0054, + "num_tokens": 76043411.0, + "reward": 3.734736680984497, + "reward_std": 1.8387919664382935, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.5941117405891418, + "rewards/ngram_similarity_reward/std": 0.4166860282421112, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 438.796875, + "completions/mean_terminated_length": 438.796875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.21257552025061535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05332396924495697, + "learning_rate": 4.961617464446094e-06, + "loss": 0.0008, + "num_tokens": 76195686.0, + "reward": 4.474213123321533, + "reward_std": 2.7889294624328613, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7554631233215332, + "rewards/ngram_similarity_reward/std": 0.3857472538948059, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 474.046875, + "completions/mean_terminated_length": 474.046875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.21302304766166927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0532119981944561, + "learning_rate": 4.961310674833016e-06, + "loss": -0.0179, + "num_tokens": 76357321.0, + "reward": 4.594073295593262, + "reward_std": 1.177838921546936, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5940733551979065, + "rewards/ngram_similarity_reward/std": 0.22110331058502197, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 519.40625, + "completions/mean_terminated_length": 519.40625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.2134705750727232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047116197645664215, + "learning_rate": 4.961002674651051e-06, + "loss": -0.011, + "num_tokens": 76511683.0, + "reward": 5.643961429595947, + "reward_std": 1.319187879562378, + "rewards/accuracy_reward/mean": 4.828125, + "rewards/accuracy_reward/std": 1.9359153509140015, + "rewards/ngram_similarity_reward/mean": 0.8158363699913025, + "rewards/ngram_similarity_reward/std": 0.24353596568107605, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 417.078125, + "completions/mean_terminated_length": 417.078125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.21391810248377713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05638702213764191, + "learning_rate": 4.960693464068809e-06, + "loss": -0.0166, + "num_tokens": 76684968.0, + "reward": 4.017838954925537, + "reward_std": 1.2605984210968018, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5803389549255371, + "rewards/ngram_similarity_reward/std": 0.3501337468624115, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 395.125, + "completions/mean_terminated_length": 395.125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.21436562989483104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07186482101678848, + "learning_rate": 4.960383043255568e-06, + "loss": 0.0101, + "num_tokens": 76812608.0, + "reward": 2.7974987030029297, + "reward_std": 1.5725497007369995, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6724984645843506, + "rewards/ngram_similarity_reward/std": 0.44111230969429016, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 525.671875, + "completions/mean_terminated_length": 525.671875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.214813157305885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04860546812415123, + "learning_rate": 4.960071412381265e-06, + "loss": 0.0029, + "num_tokens": 76971259.0, + "reward": 1.3983885049819946, + "reward_std": 1.2901124954223633, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.3983883261680603, + "rewards/ngram_similarity_reward/std": 0.23067422211170197, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 502.609375, + "completions/mean_terminated_length": 502.609375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.2152606847169389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05025094002485275, + "learning_rate": 4.9597585716165e-06, + "loss": -0.0012, + "num_tokens": 77119058.0, + "reward": 3.136701822280884, + "reward_std": 2.031083106994629, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.37107694149017334, + "rewards/ngram_similarity_reward/std": 0.2309703826904297, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 393.34375, + "completions/mean_terminated_length": 393.34375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.21570821212799285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06302059441804886, + "learning_rate": 4.959444521132537e-06, + "loss": 0.0032, + "num_tokens": 77271688.0, + "reward": 4.503539085388184, + "reward_std": 0.938154935836792, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5035389065742493, + "rewards/ngram_similarity_reward/std": 0.4234048128128052, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 487.625, + "completions/mean_terminated_length": 487.625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.21615573953904677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051766663789749146, + "learning_rate": 4.959129261101301e-06, + "loss": 0.0325, + "num_tokens": 77417744.0, + "reward": 5.006524085998535, + "reward_std": 0.9188006520271301, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.5377740859985352, + "rewards/ngram_similarity_reward/std": 0.2830451428890228, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 564.5625, + "completions/mean_terminated_length": 564.5625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.21660326695010068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04267222806811333, + "learning_rate": 4.958812791695377e-06, + "loss": -0.0045, + "num_tokens": 77571620.0, + "reward": 5.613181114196777, + "reward_std": 1.2169044017791748, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.863180935382843, + "rewards/ngram_similarity_reward/std": 0.2767801284790039, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 388.328125, + "completions/mean_terminated_length": 388.328125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.21705079436115463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06155702844262123, + "learning_rate": 4.958495113088016e-06, + "loss": -0.0267, + "num_tokens": 77788457.0, + "reward": 1.7356253862380981, + "reward_std": 2.3263394832611084, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.45437535643577576, + "rewards/ngram_similarity_reward/std": 0.22731998562812805, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 417.609375, + "completions/mean_terminated_length": 417.609375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.21749832177220854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05934767797589302, + "learning_rate": 4.95817622545313e-06, + "loss": 0.0046, + "num_tokens": 77942288.0, + "reward": 4.513749122619629, + "reward_std": 1.5365564823150635, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5137491822242737, + "rewards/ngram_similarity_reward/std": 0.2162044197320938, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1549.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 427.296875, + "completions/mean_terminated_length": 427.296875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.21794584918326249, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06957036256790161, + "learning_rate": 4.957856128965292e-06, + "loss": 0.0439, + "num_tokens": 78101459.0, + "reward": 3.2877097129821777, + "reward_std": 0.931866466999054, + "rewards/accuracy_reward/mean": 2.703125, + "rewards/accuracy_reward/std": 3.1074907779693604, + "rewards/ngram_similarity_reward/mean": 0.5845849514007568, + "rewards/ngram_similarity_reward/std": 0.360519677400589, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 507.796875, + "completions/mean_terminated_length": 507.796875, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.2183933765943164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06689263880252838, + "learning_rate": 4.957534823799735e-06, + "loss": 0.0176, + "num_tokens": 78233910.0, + "reward": 4.001543998718262, + "reward_std": 0.8976078629493713, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.3765438199043274, + "rewards/ngram_similarity_reward/std": 0.32272958755493164, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 434.65625, + "completions/mean_terminated_length": 434.65625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.21884090400537032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06068659573793411, + "learning_rate": 4.957212310132357e-06, + "loss": 0.0258, + "num_tokens": 78419840.0, + "reward": 3.849094867706299, + "reward_std": 1.5910649299621582, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.4115948975086212, + "rewards/ngram_similarity_reward/std": 0.29876673221588135, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 453.84375, + "completions/mean_terminated_length": 453.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.21928843141642426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0636509358882904, + "learning_rate": 4.956888588139716e-06, + "loss": 0.031, + "num_tokens": 78581206.0, + "reward": 5.014695167541504, + "reward_std": 1.7021368741989136, + "rewards/accuracy_reward/mean": 4.453125, + "rewards/accuracy_reward/std": 2.319206953048706, + "rewards/ngram_similarity_reward/mean": 0.56156986951828, + "rewards/ngram_similarity_reward/std": 0.30171915888786316, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 574.078125, + "completions/mean_terminated_length": 574.078125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.21973595882747818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05884205177426338, + "learning_rate": 4.956563657999032e-06, + "loss": -0.0081, + "num_tokens": 78731227.0, + "reward": 1.8951448202133179, + "reward_std": 1.9604424238204956, + "rewards/accuracy_reward/mean": 1.359375, + "rewards/accuracy_reward/std": 2.816432476043701, + "rewards/ngram_similarity_reward/mean": 0.5357697606086731, + "rewards/ngram_similarity_reward/std": 0.2690957188606262, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 466.390625, + "completions/mean_terminated_length": 466.390625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.22018348623853212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05558258667588234, + "learning_rate": 4.956237519888186e-06, + "loss": -0.0245, + "num_tokens": 78910468.0, + "reward": 2.3418540954589844, + "reward_std": 1.3856549263000488, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6856042146682739, + "rewards/ngram_similarity_reward/std": 0.31204110383987427, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 434.578125, + "completions/mean_terminated_length": 434.578125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.22063101364958604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05283565819263458, + "learning_rate": 4.95591017398572e-06, + "loss": 0.0229, + "num_tokens": 79060265.0, + "reward": 3.7463274002075195, + "reward_std": 1.7352664470672607, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5900774598121643, + "rewards/ngram_similarity_reward/std": 0.32223933935165405, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 542.015625, + "completions/mean_terminated_length": 542.015625, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.22107854106063995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04560478776693344, + "learning_rate": 4.955581620470838e-06, + "loss": -0.0261, + "num_tokens": 79234922.0, + "reward": 2.386859893798828, + "reward_std": 0.9993495941162109, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.46498507261276245, + "rewards/ngram_similarity_reward/std": 0.38773688673973083, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 417.0625, + "completions/mean_terminated_length": 417.0625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.2215260684716939, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06720297783613205, + "learning_rate": 4.955251859523404e-06, + "loss": 0.0337, + "num_tokens": 79384830.0, + "reward": 5.404541015625, + "reward_std": 1.311445713043213, + "rewards/accuracy_reward/mean": 4.890625, + "rewards/accuracy_reward/std": 1.915825366973877, + "rewards/ngram_similarity_reward/mean": 0.5139156579971313, + "rewards/ngram_similarity_reward/std": 0.22248059511184692, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 423.796875, + "completions/mean_terminated_length": 423.796875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.22197359588274782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056853897869586945, + "learning_rate": 4.954920891323944e-06, + "loss": 0.028, + "num_tokens": 79552049.0, + "reward": 3.310576915740967, + "reward_std": 1.6470513343811035, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5293266773223877, + "rewards/ngram_similarity_reward/std": 0.30274975299835205, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 406.03125, + "completions/mean_terminated_length": 406.03125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.22242112329380176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05910646170377731, + "learning_rate": 4.954588716053645e-06, + "loss": 0.0401, + "num_tokens": 79714627.0, + "reward": 0.7363100051879883, + "reward_std": 1.615598440170288, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 2.2457523345947266, + "rewards/ngram_similarity_reward/mean": 0.3769349455833435, + "rewards/ngram_similarity_reward/std": 0.23151440918445587, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 462.703125, + "completions/mean_terminated_length": 462.703125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.22286865070485568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05820932984352112, + "learning_rate": 4.954255333894354e-06, + "loss": -0.0194, + "num_tokens": 79846448.0, + "reward": 0.9518995881080627, + "reward_std": 1.9144651889801025, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.42064958810806274, + "rewards/ngram_similarity_reward/std": 0.2036563754081726, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 381.515625, + "completions/mean_terminated_length": 381.515625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.2233161781159096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06320500373840332, + "learning_rate": 4.953920745028579e-06, + "loss": -0.0074, + "num_tokens": 80019761.0, + "reward": 3.8864622116088867, + "reward_std": 0.8679347038269043, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6364620327949524, + "rewards/ngram_similarity_reward/std": 0.3301069736480713, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 415.25, + "completions/mean_terminated_length": 415.25, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.22376370552696354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06903304904699326, + "learning_rate": 4.9535849496394885e-06, + "loss": 0.0473, + "num_tokens": 80138369.0, + "reward": 4.351874828338623, + "reward_std": 1.3796770572662354, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.5550000667572021, + "rewards/ngram_similarity_reward/std": 0.3303433656692505, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 513.875, + "completions/mean_terminated_length": 513.875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.22421123293801745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05276532471179962, + "learning_rate": 4.953247947910913e-06, + "loss": -0.0046, + "num_tokens": 80348649.0, + "reward": 2.3902032375335693, + "reward_std": 2.070019006729126, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.00260329246521, + "rewards/ngram_similarity_reward/mean": 0.3745781481266022, + "rewards/ngram_similarity_reward/std": 0.29120922088623047, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 505.25, + "completions/mean_terminated_length": 505.25, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.22465876034907137, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05461040511727333, + "learning_rate": 4.9529097400273395e-06, + "loss": -0.0483, + "num_tokens": 80511481.0, + "reward": 3.9073472023010254, + "reward_std": 1.3411670923233032, + "rewards/accuracy_reward/mean": 3.328125, + "rewards/accuracy_reward/std": 2.9252848625183105, + "rewards/ngram_similarity_reward/mean": 0.5792225003242493, + "rewards/ngram_similarity_reward/std": 0.3714323043823242, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 455.578125, + "completions/mean_terminated_length": 455.578125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.2251062877601253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05344484746456146, + "learning_rate": 4.95257032617392e-06, + "loss": -0.0038, + "num_tokens": 80701806.0, + "reward": 4.105986595153809, + "reward_std": 1.2655099630355835, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.6684862375259399, + "rewards/ngram_similarity_reward/std": 0.33517885208129883, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 469.953125, + "completions/mean_terminated_length": 469.953125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.22555381517117923, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05020499974489212, + "learning_rate": 4.952229706536465e-06, + "loss": 0.0122, + "num_tokens": 80860155.0, + "reward": 1.7860900163650513, + "reward_std": 0.6525092124938965, + "rewards/accuracy_reward/mean": 1.171875, + "rewards/accuracy_reward/std": 2.7316761016845703, + "rewards/ngram_similarity_reward/mean": 0.6142149567604065, + "rewards/ngram_similarity_reward/std": 0.31398242712020874, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 437.609375, + "completions/mean_terminated_length": 437.609375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.22600134258223317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056204263120889664, + "learning_rate": 4.951887881301443e-06, + "loss": 0.0174, + "num_tokens": 80996610.0, + "reward": 5.491084098815918, + "reward_std": 0.8083322048187256, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5535838603973389, + "rewards/ngram_similarity_reward/std": 0.37020254135131836, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 504.28125, + "completions/mean_terminated_length": 504.28125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.2264488699932871, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0578279085457325, + "learning_rate": 4.951544850655985e-06, + "loss": 0.0115, + "num_tokens": 81168100.0, + "reward": 2.248349189758301, + "reward_std": 1.4716249704360962, + "rewards/accuracy_reward/mean": 1.828125, + "rewards/accuracy_reward/std": 2.9657018184661865, + "rewards/ngram_similarity_reward/mean": 0.42022407054901123, + "rewards/ngram_similarity_reward/std": 0.18899768590927124, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 451.484375, + "completions/mean_terminated_length": 451.484375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.226896397404341, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0604545883834362, + "learning_rate": 4.951200614787881e-06, + "loss": -0.0065, + "num_tokens": 81325603.0, + "reward": 3.941903591156006, + "reward_std": 0.9505756497383118, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.5200284719467163, + "rewards/ngram_similarity_reward/std": 0.2735101282596588, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 442.75, + "completions/mean_terminated_length": 442.75, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.22734392481539495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05478325113654137, + "learning_rate": 4.950855173885582e-06, + "loss": 0.0086, + "num_tokens": 81451043.0, + "reward": 4.029051780700684, + "reward_std": 1.3405178785324097, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5915517210960388, + "rewards/ngram_similarity_reward/std": 0.3374991714954376, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 424.6875, + "completions/mean_terminated_length": 424.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.22779145222644887, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07086840271949768, + "learning_rate": 4.950508528138195e-06, + "loss": 0.0058, + "num_tokens": 81679215.0, + "reward": 3.891526222229004, + "reward_std": 1.43088698387146, + "rewards/accuracy_reward/mean": 3.3125, + "rewards/accuracy_reward/std": 2.948634386062622, + "rewards/ngram_similarity_reward/mean": 0.5790262222290039, + "rewards/ngram_similarity_reward/std": 0.4064956307411194, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 451.796875, + "completions/mean_terminated_length": 451.796875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.2282389796375028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05182816833257675, + "learning_rate": 4.9501606777354914e-06, + "loss": 0.0048, + "num_tokens": 81817906.0, + "reward": 5.248233318328857, + "reward_std": 1.3395018577575684, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.7794830799102783, + "rewards/ngram_similarity_reward/std": 0.3089151382446289, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 455.703125, + "completions/mean_terminated_length": 455.703125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.22868650704855673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06050952896475792, + "learning_rate": 4.949811622867899e-06, + "loss": 0.0038, + "num_tokens": 81981167.0, + "reward": 3.9882125854492188, + "reward_std": 2.2624154090881348, + "rewards/accuracy_reward/mean": 3.515625, + "rewards/accuracy_reward/std": 2.8646292686462402, + "rewards/ngram_similarity_reward/mean": 0.4725874662399292, + "rewards/ngram_similarity_reward/std": 0.2975623905658722, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 471.0, + "completions/mean_terminated_length": 471.0, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.22913403445961064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048352234065532684, + "learning_rate": 4.949461363726506e-06, + "loss": -0.0004, + "num_tokens": 82172159.0, + "reward": 4.106328964233398, + "reward_std": 0.9110741019248962, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5750784873962402, + "rewards/ngram_similarity_reward/std": 0.3322790563106537, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 435.515625, + "completions/mean_terminated_length": 435.515625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.22958156187066459, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05683273822069168, + "learning_rate": 4.94910990050306e-06, + "loss": -0.0037, + "num_tokens": 82315248.0, + "reward": 3.042149543762207, + "reward_std": 0.1302480250597, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5421494841575623, + "rewards/ngram_similarity_reward/std": 0.35772615671157837, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 413.5625, + "completions/mean_terminated_length": 413.5625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.2300290892817185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07653037458658218, + "learning_rate": 4.9487572333899665e-06, + "loss": 0.0415, + "num_tokens": 82441892.0, + "reward": 5.274172782897949, + "reward_std": 0.9859016537666321, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.7116726040840149, + "rewards/ngram_similarity_reward/std": 0.2897319495677948, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 544.125, + "completions/mean_terminated_length": 544.125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.23047661669277245, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05028749629855156, + "learning_rate": 4.948403362580291e-06, + "loss": 0.0515, + "num_tokens": 82599324.0, + "reward": 2.9024624824523926, + "reward_std": 0.1806321144104004, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.4024624228477478, + "rewards/ngram_similarity_reward/std": 0.238870769739151, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 547.34375, + "completions/mean_terminated_length": 547.34375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.23092414410382636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04469163715839386, + "learning_rate": 4.9480482882677595e-06, + "loss": 0.0052, + "num_tokens": 82760306.0, + "reward": 4.694647312164307, + "reward_std": 0.9846379160881042, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.7102725505828857, + "rewards/ngram_similarity_reward/std": 0.2618943154811859, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 384.46875, + "completions/mean_terminated_length": 384.46875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.23137167151488028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06376522779464722, + "learning_rate": 4.947692010646754e-06, + "loss": -0.022, + "num_tokens": 82960528.0, + "reward": 2.4959452152252197, + "reward_std": 1.714483380317688, + "rewards/accuracy_reward/mean": 2.046875, + "rewards/accuracy_reward/std": 3.080557107925415, + "rewards/ngram_similarity_reward/mean": 0.44907036423683167, + "rewards/ngram_similarity_reward/std": 0.3995765149593353, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 468.25, + "completions/mean_terminated_length": 468.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.23181919892593422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05506211891770363, + "learning_rate": 4.9473345299123174e-06, + "loss": 0.0672, + "num_tokens": 83088208.0, + "reward": 4.447715759277344, + "reward_std": 0.7623867392539978, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7289658188819885, + "rewards/ngram_similarity_reward/std": 0.27043387293815613, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 496.59375, + "completions/mean_terminated_length": 496.59375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.23226672633698814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06617296487092972, + "learning_rate": 4.946975846260149e-06, + "loss": -0.0103, + "num_tokens": 83245510.0, + "reward": 2.8303518295288086, + "reward_std": 0.1702744960784912, + "rewards/accuracy_reward/mean": 2.265625, + "rewards/accuracy_reward/std": 3.2792866230010986, + "rewards/ngram_similarity_reward/mean": 0.5647268295288086, + "rewards/ngram_similarity_reward/std": 0.31829366087913513, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 531.671875, + "completions/mean_terminated_length": 531.671875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.23271425374804205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055209528654813766, + "learning_rate": 4.94661595988661e-06, + "loss": 0.0148, + "num_tokens": 83383393.0, + "reward": 2.1428885459899902, + "reward_std": 1.666260004043579, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.22101356089115143, + "rewards/ngram_similarity_reward/std": 0.13001449406147003, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 466.421875, + "completions/mean_terminated_length": 466.421875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.233161781159096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061757054179906845, + "learning_rate": 4.9462548709887165e-06, + "loss": 0.0202, + "num_tokens": 83538620.0, + "reward": 2.524232864379883, + "reward_std": 1.1955227851867676, + "rewards/accuracy_reward/mean": 2.15625, + "rewards/accuracy_reward/std": 3.0768916606903076, + "rewards/ngram_similarity_reward/mean": 0.36798280477523804, + "rewards/ngram_similarity_reward/std": 0.37950998544692993, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 401.828125, + "completions/mean_terminated_length": 401.828125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.23360930857014992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.064455047249794, + "learning_rate": 4.945892579764145e-06, + "loss": 0.0213, + "num_tokens": 83677825.0, + "reward": 4.467645168304443, + "reward_std": 1.6417489051818848, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.6707702279090881, + "rewards/ngram_similarity_reward/std": 0.3119213581085205, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.0, + "completions/max_terminated_length": 1616.0, + "completions/mean_length": 559.234375, + "completions/mean_terminated_length": 559.234375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.23405683598120386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05815133824944496, + "learning_rate": 4.94552908641123e-06, + "loss": 0.0387, + "num_tokens": 83805856.0, + "reward": 1.0161261558532715, + "reward_std": 0.9117844104766846, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 2.4028754234313965, + "rewards/ngram_similarity_reward/mean": 0.45362603664398193, + "rewards/ngram_similarity_reward/std": 0.24171987175941467, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 481.328125, + "completions/mean_terminated_length": 481.328125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.23450436339225778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05317988619208336, + "learning_rate": 4.945164391128962e-06, + "loss": -0.0071, + "num_tokens": 83971189.0, + "reward": 4.062438011169434, + "reward_std": 1.3103262186050415, + "rewards/accuracy_reward/mean": 3.578125, + "rewards/accuracy_reward/std": 2.880171298980713, + "rewards/ngram_similarity_reward/mean": 0.4843129515647888, + "rewards/ngram_similarity_reward/std": 0.27475109696388245, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 336.828125, + "completions/mean_terminated_length": 336.828125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2349518908033117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07858358323574066, + "learning_rate": 4.944798494116994e-06, + "loss": 0.0186, + "num_tokens": 84112474.0, + "reward": 5.912669658660889, + "reward_std": 0.9706466197967529, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.60016930103302, + "rewards/ngram_similarity_reward/std": 0.3875333070755005, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 545.75, + "completions/mean_terminated_length": 545.75, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.23539941821436564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04891042038798332, + "learning_rate": 4.944431395575633e-06, + "loss": -0.0229, + "num_tokens": 84290170.0, + "reward": 4.428959846496582, + "reward_std": 1.7386280298233032, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.616459846496582, + "rewards/ngram_similarity_reward/std": 0.30519163608551025, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 430.734375, + "completions/mean_terminated_length": 430.734375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.23584694562541955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06413199752569199, + "learning_rate": 4.944063095705845e-06, + "loss": 0.0077, + "num_tokens": 84453289.0, + "reward": 3.515164375305176, + "reward_std": 1.7847726345062256, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.4682896137237549, + "rewards/ngram_similarity_reward/std": 0.20700648427009583, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 422.65625, + "completions/mean_terminated_length": 422.65625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.2362944730364735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06800197064876556, + "learning_rate": 4.943693594709251e-06, + "loss": -0.0152, + "num_tokens": 84677251.0, + "reward": 4.715863227844238, + "reward_std": 1.7876237630844116, + "rewards/accuracy_reward/mean": 4.078125, + "rewards/accuracy_reward/std": 2.593059778213501, + "rewards/ngram_similarity_reward/mean": 0.6377381086349487, + "rewards/ngram_similarity_reward/std": 0.3081703186035156, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 375.625, + "completions/mean_terminated_length": 375.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.2367420004475274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0659157931804657, + "learning_rate": 4.943322892788136e-06, + "loss": 0.0055, + "num_tokens": 84828923.0, + "reward": 3.566446304321289, + "reward_std": 0.9382291436195374, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4101959764957428, + "rewards/ngram_similarity_reward/std": 0.34982484579086304, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 416.578125, + "completions/mean_terminated_length": 416.578125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.23718952785858133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06760423630475998, + "learning_rate": 4.942950990145438e-06, + "loss": 0.0017, + "num_tokens": 84963840.0, + "reward": 4.4553704261779785, + "reward_std": 0.24327006936073303, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4553704261779785, + "rewards/ngram_similarity_reward/std": 0.32216182351112366, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 395.46875, + "completions/mean_terminated_length": 395.46875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.23763705526963527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07767239212989807, + "learning_rate": 4.9425778869847516e-06, + "loss": 0.0157, + "num_tokens": 85140046.0, + "reward": 3.088334321975708, + "reward_std": 2.573495864868164, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5883344411849976, + "rewards/ngram_similarity_reward/std": 0.31117013096809387, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 508.484375, + "completions/mean_terminated_length": 508.484375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.2380845826806892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05766969919204712, + "learning_rate": 4.94220358351033e-06, + "loss": -0.0255, + "num_tokens": 85300285.0, + "reward": 1.054612398147583, + "reward_std": 1.5021476745605469, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 2.292099714279175, + "rewards/ngram_similarity_reward/mean": 0.5389874577522278, + "rewards/ngram_similarity_reward/std": 0.26103782653808594, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 416.296875, + "completions/mean_terminated_length": 416.296875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.23853211009174313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0637131929397583, + "learning_rate": 4.941828079927083e-06, + "loss": 0.0029, + "num_tokens": 85415216.0, + "reward": 3.05372953414917, + "reward_std": 0.8926549553871155, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5537295341491699, + "rewards/ngram_similarity_reward/std": 0.30394914746284485, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 481.046875, + "completions/mean_terminated_length": 481.046875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.23897963750279705, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057355329394340515, + "learning_rate": 4.941451376440579e-06, + "loss": -0.0184, + "num_tokens": 85631043.0, + "reward": 3.322988510131836, + "reward_std": 0.8318421244621277, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.44798851013183594, + "rewards/ngram_similarity_reward/std": 0.28369003534317017, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 461.140625, + "completions/mean_terminated_length": 461.140625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.23942716491385097, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058750320225954056, + "learning_rate": 4.941073473257041e-06, + "loss": -0.0238, + "num_tokens": 85785164.0, + "reward": 4.765369415283203, + "reward_std": 1.5559390783309937, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5778692960739136, + "rewards/ngram_similarity_reward/std": 0.366667777299881, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 396.328125, + "completions/mean_terminated_length": 396.328125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2398746923249049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06921112537384033, + "learning_rate": 4.940694370583351e-06, + "loss": -0.0388, + "num_tokens": 85968417.0, + "reward": 3.6020917892456055, + "reward_std": 1.2433319091796875, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.555216908454895, + "rewards/ngram_similarity_reward/std": 0.3706609606742859, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 389.203125, + "completions/mean_terminated_length": 389.203125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.24032221973595883, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06513098627328873, + "learning_rate": 4.9403140686270455e-06, + "loss": 0.0764, + "num_tokens": 86096974.0, + "reward": 3.6188747882843018, + "reward_std": 1.255029320716858, + "rewards/accuracy_reward/mean": 2.9375, + "rewards/accuracy_reward/std": 3.028305768966675, + "rewards/ngram_similarity_reward/mean": 0.6813750267028809, + "rewards/ngram_similarity_reward/std": 0.4902365207672119, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 367.234375, + "completions/mean_terminated_length": 367.234375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.24076974714701274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06107205152511597, + "learning_rate": 4.939932567596319e-06, + "loss": 0.0035, + "num_tokens": 86263213.0, + "reward": 2.052935838699341, + "reward_std": 0.839949905872345, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.4904356896877289, + "rewards/ngram_similarity_reward/std": 0.13649572432041168, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 384.78125, + "completions/mean_terminated_length": 384.78125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.24121727455806669, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06742729246616364, + "learning_rate": 4.939549867700022e-06, + "loss": -0.0325, + "num_tokens": 86399407.0, + "reward": 4.329132080078125, + "reward_std": 1.331215500831604, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.7978819608688354, + "rewards/ngram_similarity_reward/std": 0.31928256154060364, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 413.546875, + "completions/mean_terminated_length": 413.546875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.2416648019691206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06391128152608871, + "learning_rate": 4.939165969147662e-06, + "loss": -0.033, + "num_tokens": 86631522.0, + "reward": 3.186141014099121, + "reward_std": 0.22396305203437805, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6861410737037659, + "rewards/ngram_similarity_reward/std": 0.2991383373737335, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 441.0625, + "completions/mean_terminated_length": 441.0625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.24211232938017455, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05881042405962944, + "learning_rate": 4.9387808721494e-06, + "loss": -0.0381, + "num_tokens": 86759174.0, + "reward": 4.801782608032227, + "reward_std": 1.611395239830017, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.6142823696136475, + "rewards/ngram_similarity_reward/std": 0.27828091382980347, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 446.796875, + "completions/mean_terminated_length": 446.796875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.24255985679122846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08241968601942062, + "learning_rate": 4.938394576916057e-06, + "loss": 0.0036, + "num_tokens": 86895049.0, + "reward": 5.214838027954102, + "reward_std": 1.254507303237915, + "rewards/accuracy_reward/mean": 4.640625, + "rewards/accuracy_reward/std": 2.1445181369781494, + "rewards/ngram_similarity_reward/mean": 0.5742127895355225, + "rewards/ngram_similarity_reward/std": 0.232599139213562, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 489.828125, + "completions/mean_terminated_length": 489.828125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.24300738420228238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09177494049072266, + "learning_rate": 4.938007083659106e-06, + "loss": -0.0308, + "num_tokens": 87020622.0, + "reward": 4.419434547424316, + "reward_std": 1.546202301979065, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6069345474243164, + "rewards/ngram_similarity_reward/std": 0.2924477756023407, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 621.8125, + "completions/mean_terminated_length": 621.8125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.24345491161333632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05039747431874275, + "learning_rate": 4.937618392590681e-06, + "loss": -0.002, + "num_tokens": 87177474.0, + "reward": 3.1579737663269043, + "reward_std": 1.5208743810653687, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.5798487067222595, + "rewards/ngram_similarity_reward/std": 0.34609490633010864, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 330.390625, + "completions/mean_terminated_length": 330.390625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.24390243902439024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08228424936532974, + "learning_rate": 4.9372285039235654e-06, + "loss": -0.0113, + "num_tokens": 87311051.0, + "reward": 2.638988494873047, + "reward_std": 0.8976145386695862, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6077384352684021, + "rewards/ngram_similarity_reward/std": 0.3406791687011719, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 377.875, + "completions/mean_terminated_length": 377.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.24434996643544418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07295098900794983, + "learning_rate": 4.9368374178712035e-06, + "loss": -0.0202, + "num_tokens": 87509763.0, + "reward": 3.602973461151123, + "reward_std": 0.9902347326278687, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.4623487591743469, + "rewards/ngram_similarity_reward/std": 0.2751040458679199, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 383.0, + "completions/mean_terminated_length": 383.0, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.2447974938464981, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1062786802649498, + "learning_rate": 4.936445134647692e-06, + "loss": 0.0131, + "num_tokens": 87664179.0, + "reward": 4.703701972961426, + "reward_std": 0.20016412436962128, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7037018537521362, + "rewards/ngram_similarity_reward/std": 0.39074793457984924, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 402.8125, + "completions/mean_terminated_length": 402.8125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.24524502125755201, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10711069405078888, + "learning_rate": 4.9360516544677835e-06, + "loss": 0.0029, + "num_tokens": 87875207.0, + "reward": 5.1088080406188965, + "reward_std": 1.2155022621154785, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7338082790374756, + "rewards/ngram_similarity_reward/std": 0.3225592076778412, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 456.375, + "completions/mean_terminated_length": 456.375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.24569254866860596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05813458189368248, + "learning_rate": 4.935656977546889e-06, + "loss": -0.0346, + "num_tokens": 88034511.0, + "reward": 5.104488372802734, + "reward_std": 0.8898274302482605, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.44823816418647766, + "rewards/ngram_similarity_reward/std": 0.2638348937034607, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 417.125, + "completions/mean_terminated_length": 417.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.24614007607965988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06147436797618866, + "learning_rate": 4.935261104101069e-06, + "loss": 0.0314, + "num_tokens": 88210951.0, + "reward": 3.0853829383850098, + "reward_std": 0.24257975816726685, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5853830575942993, + "rewards/ngram_similarity_reward/std": 0.3101142346858978, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 443.75, + "completions/mean_terminated_length": 443.75, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.24658760349071382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0830966979265213, + "learning_rate": 4.9348640343470435e-06, + "loss": 0.0007, + "num_tokens": 88340759.0, + "reward": 4.599246978759766, + "reward_std": 0.25810831785202026, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5992466807365417, + "rewards/ngram_similarity_reward/std": 0.3302537798881531, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 527.828125, + "completions/mean_terminated_length": 527.828125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.24703513090176774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055706046521663666, + "learning_rate": 4.934465768502187e-06, + "loss": -0.0198, + "num_tokens": 88516476.0, + "reward": 5.346339225769043, + "reward_std": 2.015418529510498, + "rewards/accuracy_reward/mean": 4.59375, + "rewards/accuracy_reward/std": 2.265817403793335, + "rewards/ngram_similarity_reward/mean": 0.7525894641876221, + "rewards/ngram_similarity_reward/std": 0.3101097345352173, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 355.40625, + "completions/mean_terminated_length": 355.40625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.24748265831282165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0677371695637703, + "learning_rate": 4.934066306784525e-06, + "loss": 0.0648, + "num_tokens": 88676646.0, + "reward": 4.451678276062012, + "reward_std": 1.0497232675552368, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5454282760620117, + "rewards/ngram_similarity_reward/std": 0.360625684261322, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 370.1875, + "completions/mean_terminated_length": 370.1875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.2479301857238756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06734210252761841, + "learning_rate": 4.933665649412743e-06, + "loss": 0.0108, + "num_tokens": 88806306.0, + "reward": 3.965791702270508, + "reward_std": 0.9451578259468079, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.7157918810844421, + "rewards/ngram_similarity_reward/std": 0.2672803997993469, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 435.890625, + "completions/mean_terminated_length": 435.890625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.2483777131349295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05665924772620201, + "learning_rate": 4.933263796606178e-06, + "loss": 0.0323, + "num_tokens": 88980523.0, + "reward": 4.632099628448486, + "reward_std": 0.13429510593414307, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6320995688438416, + "rewards/ngram_similarity_reward/std": 0.32506057620048523, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 434.3125, + "completions/mean_terminated_length": 434.3125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.24882524054598343, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0613069124519825, + "learning_rate": 4.9328607485848205e-06, + "loss": -0.0059, + "num_tokens": 89145327.0, + "reward": 1.708435297012329, + "reward_std": 2.089996814727783, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.3334353566169739, + "rewards/ngram_similarity_reward/std": 0.22431553900241852, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 421.765625, + "completions/mean_terminated_length": 421.765625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.24927276795703737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0642596036195755, + "learning_rate": 4.932456505569318e-06, + "loss": -0.0034, + "num_tokens": 89303664.0, + "reward": 4.698173999786377, + "reward_std": 1.4473786354064941, + "rewards/accuracy_reward/mean": 4.0625, + "rewards/accuracy_reward/std": 2.623913288116455, + "rewards/ngram_similarity_reward/mean": 0.635674238204956, + "rewards/ngram_similarity_reward/std": 0.35289034247398376, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 478.65625, + "completions/mean_terminated_length": 478.65625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2497202953680913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05370509624481201, + "learning_rate": 4.9320510677809705e-06, + "loss": -0.0157, + "num_tokens": 89444426.0, + "reward": 3.192556858062744, + "reward_std": 1.486718773841858, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.4269317388534546, + "rewards/ngram_similarity_reward/std": 0.24186871945858002, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 431.53125, + "completions/mean_terminated_length": 431.53125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.25016782277914523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06211641803383827, + "learning_rate": 4.931644435441732e-06, + "loss": 0.0252, + "num_tokens": 89578060.0, + "reward": 4.079433917999268, + "reward_std": 1.3576300144195557, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.4544336199760437, + "rewards/ngram_similarity_reward/std": 0.30528175830841064, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 511.625, + "completions/mean_terminated_length": 511.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.25061535019019915, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06391696631908417, + "learning_rate": 4.931236608774213e-06, + "loss": 0.0084, + "num_tokens": 89798164.0, + "reward": 3.956958055496216, + "reward_std": 1.2396081686019897, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.535082995891571, + "rewards/ngram_similarity_reward/std": 0.20237237215042114, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 542.390625, + "completions/mean_terminated_length": 542.390625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.25106287760125306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06036326289176941, + "learning_rate": 4.930827588001673e-06, + "loss": 0.0104, + "num_tokens": 90015293.0, + "reward": 4.204543113708496, + "reward_std": 1.2788963317871094, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.4076683521270752, + "rewards/ngram_similarity_reward/std": 0.32381314039230347, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 342.453125, + "completions/mean_terminated_length": 342.453125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.251510405012307, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07648829370737076, + "learning_rate": 4.93041737334803e-06, + "loss": 0.0066, + "num_tokens": 90127866.0, + "reward": 5.420676231384277, + "reward_std": 2.02937912940979, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.6706759333610535, + "rewards/ngram_similarity_reward/std": 0.3660717010498047, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 482.9375, + "completions/mean_terminated_length": 482.9375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.25195793242336095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05291726812720299, + "learning_rate": 4.930005965037853e-06, + "loss": 0.0074, + "num_tokens": 90269814.0, + "reward": 5.875433921813965, + "reward_std": 0.8267433047294617, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.6566839218139648, + "rewards/ngram_similarity_reward/std": 0.3229796588420868, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 426.65625, + "completions/mean_terminated_length": 426.65625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.25240545983441487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0659690722823143, + "learning_rate": 4.929593363296365e-06, + "loss": -0.0142, + "num_tokens": 90419888.0, + "reward": 5.506959915161133, + "reward_std": 1.3257946968078613, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5694600939750671, + "rewards/ngram_similarity_reward/std": 0.2535358667373657, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 612.265625, + "completions/mean_terminated_length": 612.265625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.2528529872454688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05176481604576111, + "learning_rate": 4.929179568349442e-06, + "loss": 0.0655, + "num_tokens": 90609329.0, + "reward": 3.9212515354156494, + "reward_std": 1.6650469303131104, + "rewards/accuracy_reward/mean": 3.328125, + "rewards/accuracy_reward/std": 2.9252848625183105, + "rewards/ngram_similarity_reward/mean": 0.5931264162063599, + "rewards/ngram_similarity_reward/std": 0.31439313292503357, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 438.28125, + "completions/mean_terminated_length": 438.28125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2533005146565227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059034351259469986, + "learning_rate": 4.928764580423615e-06, + "loss": 0.0107, + "num_tokens": 90790563.0, + "reward": 5.449217796325684, + "reward_std": 1.4397118091583252, + "rewards/accuracy_reward/mean": 4.828125, + "rewards/accuracy_reward/std": 1.9359153509140015, + "rewards/ngram_similarity_reward/mean": 0.6210930943489075, + "rewards/ngram_similarity_reward/std": 0.34128040075302124, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 453.390625, + "completions/mean_terminated_length": 453.390625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.2537480420675766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05689390376210213, + "learning_rate": 4.928348399746066e-06, + "loss": -0.0143, + "num_tokens": 90948892.0, + "reward": 5.924552917480469, + "reward_std": 0.4729365110397339, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.5183030366897583, + "rewards/ngram_similarity_reward/std": 0.37194132804870605, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 553.03125, + "completions/mean_terminated_length": 553.03125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.2541955694786306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0526115708053112, + "learning_rate": 4.927931026544628e-06, + "loss": 0.0328, + "num_tokens": 91128334.0, + "reward": 1.770582914352417, + "reward_std": 0.6807299852371216, + "rewards/accuracy_reward/mean": 1.171875, + "rewards/accuracy_reward/std": 2.7316761016845703, + "rewards/ngram_similarity_reward/mean": 0.5987077951431274, + "rewards/ngram_similarity_reward/std": 0.27312007546424866, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 375.359375, + "completions/mean_terminated_length": 375.359375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.2546430968896845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06447634100914001, + "learning_rate": 4.927512461047794e-06, + "loss": -0.0145, + "num_tokens": 91282837.0, + "reward": 4.480569839477539, + "reward_std": 0.23406577110290527, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4805700480937958, + "rewards/ngram_similarity_reward/std": 0.3614371120929718, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 416.34375, + "completions/mean_terminated_length": 416.34375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.2550906243007384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060128338634967804, + "learning_rate": 4.927092703484701e-06, + "loss": 0.0107, + "num_tokens": 91429115.0, + "reward": 3.222437620162964, + "reward_std": 0.2556608319282532, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7224376797676086, + "rewards/ngram_similarity_reward/std": 0.39764177799224854, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 459.890625, + "completions/mean_terminated_length": 459.890625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.25553815171179234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05413290858268738, + "learning_rate": 4.926671754085146e-06, + "loss": 0.0039, + "num_tokens": 91551092.0, + "reward": 4.4916276931762695, + "reward_std": 0.5226418972015381, + "rewards/accuracy_reward/mean": 4.078125, + "rewards/accuracy_reward/std": 2.593059778213501, + "rewards/ngram_similarity_reward/mean": 0.41350257396698, + "rewards/ngram_similarity_reward/std": 0.3172895312309265, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 465.28125, + "completions/mean_terminated_length": 465.28125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.25598567912284625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059520188719034195, + "learning_rate": 4.9262496130795735e-06, + "loss": -0.019, + "num_tokens": 91701702.0, + "reward": 3.140317916870117, + "reward_std": 0.5068360567092896, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5465677976608276, + "rewards/ngram_similarity_reward/std": 0.3046760559082031, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 298.0, + "completions/mean_terminated_length": 298.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.2564332065339002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07516378164291382, + "learning_rate": 4.925826280699083e-06, + "loss": 0.0017, + "num_tokens": 91828230.0, + "reward": 4.201871395111084, + "reward_std": 1.5170429944992065, + "rewards/accuracy_reward/mean": 3.5, + "rewards/accuracy_reward/std": 2.8894994258880615, + "rewards/ngram_similarity_reward/mean": 0.7018713355064392, + "rewards/ngram_similarity_reward/std": 0.43469229340553284, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 450.765625, + "completions/mean_terminated_length": 450.765625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.25688073394495414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05854872614145279, + "learning_rate": 4.9254017571754246e-06, + "loss": 0.0312, + "num_tokens": 91985671.0, + "reward": 6.019687652587891, + "reward_std": 0.4594746530056, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6134378910064697, + "rewards/ngram_similarity_reward/std": 0.34068456292152405, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 473.34375, + "completions/mean_terminated_length": 473.34375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.25732826135600806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04823269322514534, + "learning_rate": 4.924976042741001e-06, + "loss": 0.0281, + "num_tokens": 92129325.0, + "reward": 2.2590408325195312, + "reward_std": 0.914676308631897, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4152909517288208, + "rewards/ngram_similarity_reward/std": 0.27419641613960266, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 379.921875, + "completions/mean_terminated_length": 379.921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.257775788767062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06938162446022034, + "learning_rate": 4.924549137628868e-06, + "loss": 0.0411, + "num_tokens": 92242680.0, + "reward": 5.770617961883545, + "reward_std": 0.8946278095245361, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.7393677234649658, + "rewards/ngram_similarity_reward/std": 0.35116398334503174, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 379.328125, + "completions/mean_terminated_length": 379.328125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.2582233161781159, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06705532222986221, + "learning_rate": 4.924121042072731e-06, + "loss": -0.0318, + "num_tokens": 92421613.0, + "reward": 3.4321212768554688, + "reward_std": 1.660041332244873, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6508713960647583, + "rewards/ngram_similarity_reward/std": 0.3138256072998047, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 391.171875, + "completions/mean_terminated_length": 391.171875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.25867084358916986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06797738373279572, + "learning_rate": 4.92369175630695e-06, + "loss": 0.0537, + "num_tokens": 92648280.0, + "reward": 0.8223634958267212, + "reward_std": 1.387064814567566, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.3848634958267212, + "rewards/ngram_similarity_reward/std": 0.22720791399478912, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 367.59375, + "completions/mean_terminated_length": 367.59375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.2591183710002238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0671374499797821, + "learning_rate": 4.923261280566534e-06, + "loss": -0.0254, + "num_tokens": 92783486.0, + "reward": 5.127212047576904, + "reward_std": 2.018561363220215, + "rewards/accuracy_reward/mean": 4.515625, + "rewards/accuracy_reward/std": 2.312781572341919, + "rewards/ngram_similarity_reward/mean": 0.6115868091583252, + "rewards/ngram_similarity_reward/std": 0.35721004009246826, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 394.390625, + "completions/mean_terminated_length": 394.390625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.2595658984112777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07511835545301437, + "learning_rate": 4.922829615087144e-06, + "loss": -0.0139, + "num_tokens": 93007367.0, + "reward": 3.1698343753814697, + "reward_std": 0.7463058233261108, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.4042094647884369, + "rewards/ngram_similarity_reward/std": 0.3607175946235657, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 405.515625, + "completions/mean_terminated_length": 405.515625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.2600134258223316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051304783672094345, + "learning_rate": 4.922396760105093e-06, + "loss": -0.0243, + "num_tokens": 93179992.0, + "reward": 4.243990421295166, + "reward_std": 1.482818365097046, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.9002405405044556, + "rewards/ngram_similarity_reward/std": 0.30516666173934937, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 444.40625, + "completions/mean_terminated_length": 444.40625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.26046095323338553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06275703012943268, + "learning_rate": 4.921962715857346e-06, + "loss": 0.0095, + "num_tokens": 93345746.0, + "reward": 3.421440601348877, + "reward_std": 0.8881598711013794, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.45269080996513367, + "rewards/ngram_similarity_reward/std": 0.2987428307533264, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 409.375, + "completions/mean_terminated_length": 409.375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.26090848064443944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06608902662992477, + "learning_rate": 4.921527482581515e-06, + "loss": 0.0212, + "num_tokens": 93504730.0, + "reward": 4.2971320152282715, + "reward_std": 0.9389599561691284, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.5002568960189819, + "rewards/ngram_similarity_reward/std": 0.30913400650024414, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 464.640625, + "completions/mean_terminated_length": 464.640625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.2613560080554934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06220301240682602, + "learning_rate": 4.921091060515869e-06, + "loss": -0.001, + "num_tokens": 93729187.0, + "reward": 3.043252944946289, + "reward_std": 1.370833158493042, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.3557531535625458, + "rewards/ngram_similarity_reward/std": 0.26693427562713623, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 476.5, + "completions/mean_terminated_length": 476.5, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.26180353546654733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048784174025058746, + "learning_rate": 4.920653449899324e-06, + "loss": 0.0125, + "num_tokens": 93899619.0, + "reward": 4.929577350616455, + "reward_std": 0.9999752044677734, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.8358274698257446, + "rewards/ngram_similarity_reward/std": 0.2971649169921875, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 414.375, + "completions/mean_terminated_length": 414.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.26225106287760125, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06329008936882019, + "learning_rate": 4.920214650971446e-06, + "loss": -0.013, + "num_tokens": 94070251.0, + "reward": 3.229645013809204, + "reward_std": 1.3219417333602905, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.5577700138092041, + "rewards/ngram_similarity_reward/std": 0.2504371404647827, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 413.078125, + "completions/mean_terminated_length": 413.078125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.26269859028865516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06915739178657532, + "learning_rate": 4.919774663972455e-06, + "loss": 0.0026, + "num_tokens": 94293536.0, + "reward": 4.383290767669678, + "reward_std": 1.0135438442230225, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6645410060882568, + "rewards/ngram_similarity_reward/std": 0.35068821907043457, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 404.625, + "completions/mean_terminated_length": 404.625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.2631461176997091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06700963526964188, + "learning_rate": 4.919333489143217e-06, + "loss": 0.0152, + "num_tokens": 94442600.0, + "reward": 2.7941768169403076, + "reward_std": 1.2157671451568604, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.6848018169403076, + "rewards/ngram_similarity_reward/std": 0.22740662097930908, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 379.25, + "completions/mean_terminated_length": 379.25, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.26359364511076305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07101466506719589, + "learning_rate": 4.918891126725251e-06, + "loss": -0.0079, + "num_tokens": 94573016.0, + "reward": 3.7072603702545166, + "reward_std": 1.4600337743759155, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.5666354894638062, + "rewards/ngram_similarity_reward/std": 0.40288856625556946, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 446.828125, + "completions/mean_terminated_length": 446.828125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.26404117252181697, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057009000331163406, + "learning_rate": 4.918447576960727e-06, + "loss": -0.0193, + "num_tokens": 94736301.0, + "reward": 3.3594250679016113, + "reward_std": 2.1236746311187744, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7656752467155457, + "rewards/ngram_similarity_reward/std": 0.3162704110145569, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 512.671875, + "completions/mean_terminated_length": 512.671875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.2644886999328709, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06178559362888336, + "learning_rate": 4.918002840092462e-06, + "loss": 0.0004, + "num_tokens": 94929144.0, + "reward": 2.7129454612731934, + "reward_std": 1.9928107261657715, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4941953420639038, + "rewards/ngram_similarity_reward/std": 0.21917389333248138, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 337.3125, + "completions/mean_terminated_length": 337.3125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2649362273439248, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06375846266746521, + "learning_rate": 4.917556916363926e-06, + "loss": 0.0134, + "num_tokens": 95060652.0, + "reward": 4.1918253898620605, + "reward_std": 1.1015440225601196, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6605754494667053, + "rewards/ngram_similarity_reward/std": 0.41147440671920776, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 410.96875, + "completions/mean_terminated_length": 410.96875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.2653837547549787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05672091618180275, + "learning_rate": 4.917109806019236e-06, + "loss": -0.0009, + "num_tokens": 95203018.0, + "reward": 3.603363513946533, + "reward_std": 1.3123035430908203, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.8221136331558228, + "rewards/ngram_similarity_reward/std": 0.3107687532901764, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 539.28125, + "completions/mean_terminated_length": 539.28125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.2658312821660327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048371218144893646, + "learning_rate": 4.916661509303162e-06, + "loss": 0.0377, + "num_tokens": 95402348.0, + "reward": 5.1871442794799805, + "reward_std": 1.5259716510772705, + "rewards/accuracy_reward/mean": 4.625, + "rewards/accuracy_reward/std": 2.1858129501342773, + "rewards/ngram_similarity_reward/mean": 0.5621447563171387, + "rewards/ngram_similarity_reward/std": 0.3073975145816803, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 373.84375, + "completions/mean_terminated_length": 373.84375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2662788095770866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06661540269851685, + "learning_rate": 4.9162120264611195e-06, + "loss": 0.0047, + "num_tokens": 95524130.0, + "reward": 4.550005912780762, + "reward_std": 0.2907702922821045, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.5656312704086304, + "rewards/ngram_similarity_reward/std": 0.3623160123825073, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 468.09375, + "completions/mean_terminated_length": 468.09375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.2667263369881405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061852775514125824, + "learning_rate": 4.915761357739175e-06, + "loss": 0.0104, + "num_tokens": 95680824.0, + "reward": 3.7030439376831055, + "reward_std": 1.5367881059646606, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.359294056892395, + "rewards/ngram_similarity_reward/std": 0.24002881348133087, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 505.375, + "completions/mean_terminated_length": 505.375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.26717386439919444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05682015046477318, + "learning_rate": 4.915309503384046e-06, + "loss": 0.0316, + "num_tokens": 95840192.0, + "reward": 5.281811237335205, + "reward_std": 1.534910798072815, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6255611181259155, + "rewards/ngram_similarity_reward/std": 0.31543436646461487, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 482.359375, + "completions/mean_terminated_length": 482.359375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.26762139181024835, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05381014198064804, + "learning_rate": 4.9148564636430965e-06, + "loss": -0.0006, + "num_tokens": 96013367.0, + "reward": 0.008737348020076752, + "reward_std": 0.182732954621315, + "rewards/accuracy_reward/mean": -0.515625, + "rewards/accuracy_reward/std": 0.125, + "rewards/ngram_similarity_reward/mean": 0.524362325668335, + "rewards/ngram_similarity_reward/std": 0.3303601145744324, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 415.1875, + "completions/mean_terminated_length": 415.1875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.2680689192213023, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05661553516983986, + "learning_rate": 4.91440223876434e-06, + "loss": -0.0005, + "num_tokens": 96122579.0, + "reward": 4.493249893188477, + "reward_std": 0.5313665270805359, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.6026246547698975, + "rewards/ngram_similarity_reward/std": 0.1737067550420761, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 501.140625, + "completions/mean_terminated_length": 501.140625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.26851644663235624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05517631024122238, + "learning_rate": 4.91394682899644e-06, + "loss": -0.0006, + "num_tokens": 96263420.0, + "reward": 5.909665107727051, + "reward_std": 0.9372274875640869, + "rewards/accuracy_reward/mean": 5.296875, + "rewards/accuracy_reward/std": 1.1433686017990112, + "rewards/ngram_similarity_reward/mean": 0.6127904057502747, + "rewards/ngram_similarity_reward/std": 0.2696954011917114, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 486.03125, + "completions/mean_terminated_length": 486.03125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.26896397404341016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05566961690783501, + "learning_rate": 4.913490234588708e-06, + "loss": -0.0116, + "num_tokens": 96395678.0, + "reward": 3.3153233528137207, + "reward_std": 0.9597666263580322, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.8309487104415894, + "rewards/ngram_similarity_reward/std": 0.2732831835746765, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 456.859375, + "completions/mean_terminated_length": 456.859375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2694115014544641, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06226060166954994, + "learning_rate": 4.913032455791102e-06, + "loss": 0.0153, + "num_tokens": 96627125.0, + "reward": 3.7133123874664307, + "reward_std": 1.417924165725708, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 3.0833332538604736, + "rewards/ngram_similarity_reward/mean": 0.4945622682571411, + "rewards/ngram_similarity_reward/std": 0.2964348793029785, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 478.359375, + "completions/mean_terminated_length": 478.359375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.269859028865518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054972272366285324, + "learning_rate": 4.912573492854233e-06, + "loss": 0.0164, + "num_tokens": 96799596.0, + "reward": 1.3673251867294312, + "reward_std": 0.9833556413650513, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.4610751271247864, + "rewards/ngram_similarity_reward/std": 0.244536891579628, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 376.5625, + "completions/mean_terminated_length": 376.5625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.27030655627657196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06426282227039337, + "learning_rate": 4.912113346029356e-06, + "loss": 0.0069, + "num_tokens": 96967024.0, + "reward": 3.4562032222747803, + "reward_std": 0.5571986436843872, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8624532222747803, + "rewards/ngram_similarity_reward/std": 0.2347019612789154, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 465.515625, + "completions/mean_terminated_length": 440.3968505859375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.2707540836876259, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07138356566429138, + "learning_rate": 4.911652015568376e-06, + "loss": 0.0217, + "num_tokens": 97095601.0, + "reward": 1.1763570308685303, + "reward_std": 1.5949087142944336, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 2.455153465270996, + "rewards/ngram_similarity_reward/mean": 0.48885706067085266, + "rewards/ngram_similarity_reward/std": 0.38667434453964233, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 342.734375, + "completions/mean_terminated_length": 342.734375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2712016110986798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07411551475524902, + "learning_rate": 4.911189501723846e-06, + "loss": 0.0243, + "num_tokens": 97224736.0, + "reward": 3.03371000289917, + "reward_std": 0.28567981719970703, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.5493350028991699, + "rewards/ngram_similarity_reward/std": 0.4061363935470581, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 361.578125, + "completions/mean_terminated_length": 361.578125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.2716491385097337, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05806978419423103, + "learning_rate": 4.9107258047489654e-06, + "loss": 0.0089, + "num_tokens": 97350325.0, + "reward": 2.486035108566284, + "reward_std": 1.347016453742981, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.7360352277755737, + "rewards/ngram_similarity_reward/std": 0.2674678862094879, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 406.8125, + "completions/mean_terminated_length": 406.8125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.27209666592078763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0656108483672142, + "learning_rate": 4.910260924897583e-06, + "loss": 0.0071, + "num_tokens": 97494473.0, + "reward": 4.586330413818359, + "reward_std": 0.5184668302536011, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6800804138183594, + "rewards/ngram_similarity_reward/std": 0.25902506709098816, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 392.21875, + "completions/mean_terminated_length": 392.21875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.2725441933318416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06588999181985855, + "learning_rate": 4.909794862424195e-06, + "loss": -0.0084, + "num_tokens": 97619063.0, + "reward": 3.3414297103881836, + "reward_std": 2.093870162963867, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4664297103881836, + "rewards/ngram_similarity_reward/std": 0.28803008794784546, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 483.03125, + "completions/mean_terminated_length": 483.03125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.2729917207428955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06073449179530144, + "learning_rate": 4.909327617583943e-06, + "loss": -0.013, + "num_tokens": 97786841.0, + "reward": 3.875952959060669, + "reward_std": 0.8475580215454102, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.43845313787460327, + "rewards/ngram_similarity_reward/std": 0.3433631360530853, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 367.546875, + "completions/mean_terminated_length": 367.546875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.27343924815394943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09106756001710892, + "learning_rate": 4.90885919063262e-06, + "loss": -0.0131, + "num_tokens": 97904540.0, + "reward": 3.272808790206909, + "reward_std": 1.9939448833465576, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5853087306022644, + "rewards/ngram_similarity_reward/std": 0.3691607713699341, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 358.8125, + "completions/mean_terminated_length": 358.8125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.27388677556500335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06978929042816162, + "learning_rate": 4.908389581826661e-06, + "loss": -0.0039, + "num_tokens": 98030768.0, + "reward": 4.42750883102417, + "reward_std": 1.1210732460021973, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.615009069442749, + "rewards/ngram_similarity_reward/std": 0.36108338832855225, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 391.140625, + "completions/mean_terminated_length": 391.140625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.27433430297605726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07510551810264587, + "learning_rate": 4.9079187914231515e-06, + "loss": 0.0097, + "num_tokens": 98201481.0, + "reward": 3.266921043395996, + "reward_std": 2.1021249294281006, + "rewards/accuracy_reward/mean": 2.734375, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.5325460433959961, + "rewards/ngram_similarity_reward/std": 0.3077428638935089, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 525.453125, + "completions/mean_terminated_length": 525.453125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.27478183038711124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06352479755878448, + "learning_rate": 4.907446819679822e-06, + "loss": 0.0376, + "num_tokens": 98410278.0, + "reward": 4.358247756958008, + "reward_std": 0.5445563793182373, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.4676225185394287, + "rewards/ngram_similarity_reward/std": 0.31227847933769226, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 389.625, + "completions/mean_terminated_length": 389.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.27522935779816515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09563887119293213, + "learning_rate": 4.906973666855053e-06, + "loss": -0.0267, + "num_tokens": 98536878.0, + "reward": 4.742339611053467, + "reward_std": 0.5450695157051086, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.6485894918441772, + "rewards/ngram_similarity_reward/std": 0.36741480231285095, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 471.078125, + "completions/mean_terminated_length": 471.078125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.27567688520921907, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060624074190855026, + "learning_rate": 4.906499333207868e-06, + "loss": 0.0118, + "num_tokens": 98659075.0, + "reward": 3.159956216812134, + "reward_std": 1.605148434638977, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.37870633602142334, + "rewards/ngram_similarity_reward/std": 0.2827274203300476, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 496.875, + "completions/mean_terminated_length": 496.875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.276124412620273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0545518733561039, + "learning_rate": 4.906023818997937e-06, + "loss": 0.0125, + "num_tokens": 98814603.0, + "reward": 2.415585517883301, + "reward_std": 1.351930022239685, + "rewards/accuracy_reward/mean": 1.71875, + "rewards/accuracy_reward/std": 2.9572014808654785, + "rewards/ngram_similarity_reward/mean": 0.6968356370925903, + "rewards/ngram_similarity_reward/std": 0.3501358926296234, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 437.890625, + "completions/mean_terminated_length": 437.890625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.2765719400313269, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06158144772052765, + "learning_rate": 4.905547124485579e-06, + "loss": -0.0449, + "num_tokens": 98954596.0, + "reward": 3.255443572998047, + "reward_std": 1.197216510772705, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.0498504638671875, + "rewards/ngram_similarity_reward/mean": 0.5054433345794678, + "rewards/ngram_similarity_reward/std": 0.292623907327652, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 439.890625, + "completions/mean_terminated_length": 439.890625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.2770194674423809, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06922630220651627, + "learning_rate": 4.905069249931756e-06, + "loss": -0.0113, + "num_tokens": 99170269.0, + "reward": 1.4937655925750732, + "reward_std": 0.44643130898475647, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.5875155925750732, + "rewards/ngram_similarity_reward/std": 0.38779351115226746, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 448.75, + "completions/mean_terminated_length": 448.75, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.2774669948534348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06195135787129402, + "learning_rate": 4.904590195598079e-06, + "loss": -0.0009, + "num_tokens": 99338333.0, + "reward": 3.647702693939209, + "reward_std": 0.9113937616348267, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4914528727531433, + "rewards/ngram_similarity_reward/std": 0.2231704592704773, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 299.078125, + "completions/mean_terminated_length": 299.078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.2779145222644887, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08496079593896866, + "learning_rate": 4.904109961746803e-06, + "loss": 0.037, + "num_tokens": 99479458.0, + "reward": 3.8821115493774414, + "reward_std": 1.3969141244888306, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.5383617281913757, + "rewards/ngram_similarity_reward/std": 0.41145893931388855, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 418.546875, + "completions/mean_terminated_length": 418.546875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.2783620496755426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06467875093221664, + "learning_rate": 4.9036285486408284e-06, + "loss": 0.0095, + "num_tokens": 99599173.0, + "reward": 1.9830317497253418, + "reward_std": 1.8674609661102295, + "rewards/accuracy_reward/mean": 1.515625, + "rewards/accuracy_reward/std": 2.914072036743164, + "rewards/ngram_similarity_reward/mean": 0.467406690120697, + "rewards/ngram_similarity_reward/std": 0.3296161890029907, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 455.375, + "completions/mean_terminated_length": 455.375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.27880957708659654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06274480372667313, + "learning_rate": 4.903145956543704e-06, + "loss": 0.0127, + "num_tokens": 99729165.0, + "reward": 5.742074012756348, + "reward_std": 0.8376166224479675, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.5233241319656372, + "rewards/ngram_similarity_reward/std": 0.32514575123786926, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 487.609375, + "completions/mean_terminated_length": 487.609375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.27925710449765045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057502295821905136, + "learning_rate": 4.90266218571962e-06, + "loss": 0.0013, + "num_tokens": 99848244.0, + "reward": 2.3932907581329346, + "reward_std": 0.9242256283760071, + "rewards/accuracy_reward/mean": 1.90625, + "rewards/accuracy_reward/std": 3.001157283782959, + "rewards/ngram_similarity_reward/mean": 0.48704057931900024, + "rewards/ngram_similarity_reward/std": 0.3349016606807709, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 324.171875, + "completions/mean_terminated_length": 324.171875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.2797046319087044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07760117202997208, + "learning_rate": 4.902177236433414e-06, + "loss": 0.0132, + "num_tokens": 100005519.0, + "reward": 4.642778396606445, + "reward_std": 0.7077197432518005, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.8459034562110901, + "rewards/ngram_similarity_reward/std": 0.35508018732070923, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 434.484375, + "completions/mean_terminated_length": 434.484375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.28015215931975834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07231221348047256, + "learning_rate": 4.9016911089505695e-06, + "loss": 0.0261, + "num_tokens": 100249454.0, + "reward": 3.2338109016418457, + "reward_std": 1.6458979845046997, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.4681856632232666, + "rewards/ngram_similarity_reward/std": 0.31056922674179077, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 449.890625, + "completions/mean_terminated_length": 449.890625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.28059968673081226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06774061173200607, + "learning_rate": 4.901203803537214e-06, + "loss": -0.0067, + "num_tokens": 100376119.0, + "reward": 5.257980823516846, + "reward_std": 2.083458662033081, + "rewards/accuracy_reward/mean": 4.453125, + "rewards/accuracy_reward/std": 2.319206953048706, + "rewards/ngram_similarity_reward/mean": 0.8048558235168457, + "rewards/ngram_similarity_reward/std": 0.3509519100189209, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 534.546875, + "completions/mean_terminated_length": 534.546875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.2810472141418662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04980470985174179, + "learning_rate": 4.900715320460119e-06, + "loss": 0.0007, + "num_tokens": 100515658.0, + "reward": 4.141946315765381, + "reward_std": 0.7935642004013062, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.5169461965560913, + "rewards/ngram_similarity_reward/std": 0.20286968350410461, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 561.375, + "completions/mean_terminated_length": 561.375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.2814947415529201, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04802021011710167, + "learning_rate": 4.900225659986703e-06, + "loss": -0.0061, + "num_tokens": 100706834.0, + "reward": 4.6559953689575195, + "reward_std": 0.5603998303413391, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.76537024974823, + "rewards/ngram_similarity_reward/std": 0.1904088407754898, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 392.5, + "completions/mean_terminated_length": 392.5, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.28194226896397406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07349932938814163, + "learning_rate": 4.899734822385027e-06, + "loss": 0.0164, + "num_tokens": 100879762.0, + "reward": 1.2480851411819458, + "reward_std": 0.28011858463287354, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 2.6659226417541504, + "rewards/ngram_similarity_reward/mean": 0.31058529019355774, + "rewards/ngram_similarity_reward/std": 0.20735912024974823, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 427.21875, + "completions/mean_terminated_length": 427.21875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.282389796375028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05627664178609848, + "learning_rate": 4.8992428079237966e-06, + "loss": -0.0289, + "num_tokens": 101004768.0, + "reward": 1.7765861749649048, + "reward_std": 0.7219223976135254, + "rewards/accuracy_reward/mean": 1.15625, + "rewards/accuracy_reward/std": 2.7442219257354736, + "rewards/ngram_similarity_reward/mean": 0.6203360557556152, + "rewards/ngram_similarity_reward/std": 0.26624545454978943, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 517.953125, + "completions/mean_terminated_length": 493.66668701171875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2828373237860819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06717228144407272, + "learning_rate": 4.898749616872363e-06, + "loss": 0.0045, + "num_tokens": 101171965.0, + "reward": 2.8893322944641113, + "reward_std": 0.5401940941810608, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.48308250308036804, + "rewards/ngram_similarity_reward/std": 0.3030366599559784, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 425.515625, + "completions/mean_terminated_length": 425.515625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.2832848511971358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06200220435857773, + "learning_rate": 4.89825524950072e-06, + "loss": -0.0197, + "num_tokens": 101308942.0, + "reward": 4.342251777648926, + "reward_std": 0.7502060532569885, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.5610020756721497, + "rewards/ngram_similarity_reward/std": 0.3634032905101776, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 410.03125, + "completions/mean_terminated_length": 410.03125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.28373237860818973, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05336322635412216, + "learning_rate": 4.897759706079508e-06, + "loss": -0.0188, + "num_tokens": 101439872.0, + "reward": 5.264481544494629, + "reward_std": 1.371099591255188, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6082311868667603, + "rewards/ngram_similarity_reward/std": 0.4162740409374237, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 485.96875, + "completions/mean_terminated_length": 485.96875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.2841799060192437, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05253211408853531, + "learning_rate": 4.897262986880006e-06, + "loss": -0.0079, + "num_tokens": 101593086.0, + "reward": 3.2981011867523193, + "reward_std": 0.8422079086303711, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.6262260675430298, + "rewards/ngram_similarity_reward/std": 0.3407411277294159, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 505.625, + "completions/mean_terminated_length": 505.625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.2846274334302976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0634000152349472, + "learning_rate": 4.896765092174143e-06, + "loss": 0.0095, + "num_tokens": 101715574.0, + "reward": 3.890976905822754, + "reward_std": 0.9697810411453247, + "rewards/accuracy_reward/mean": 3.5625, + "rewards/accuracy_reward/std": 2.905249834060669, + "rewards/ngram_similarity_reward/mean": 0.3284766972064972, + "rewards/ngram_similarity_reward/std": 0.25927233695983887, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 366.921875, + "completions/mean_terminated_length": 366.921875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.28507496084135153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06476201117038727, + "learning_rate": 4.896266022234487e-06, + "loss": 0.0642, + "num_tokens": 101864257.0, + "reward": 4.763962745666504, + "reward_std": 0.28067347407341003, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7639628052711487, + "rewards/ngram_similarity_reward/std": 0.3518589437007904, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 421.375, + "completions/mean_terminated_length": 421.375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.28552248825240545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06546300649642944, + "learning_rate": 4.895765777334251e-06, + "loss": 0.0724, + "num_tokens": 101997993.0, + "reward": 2.0880303382873535, + "reward_std": 1.4095115661621094, + "rewards/accuracy_reward/mean": 1.59375, + "rewards/accuracy_reward/std": 2.958543062210083, + "rewards/ngram_similarity_reward/mean": 0.49428027868270874, + "rewards/ngram_similarity_reward/std": 0.2925429344177246, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 564.78125, + "completions/mean_terminated_length": 564.78125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.28597001566345936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051474377512931824, + "learning_rate": 4.895264357747292e-06, + "loss": 0.0091, + "num_tokens": 102156187.0, + "reward": 5.279137134552002, + "reward_std": 1.6453781127929688, + "rewards/accuracy_reward/mean": 4.640625, + "rewards/accuracy_reward/std": 2.1445181369781494, + "rewards/ngram_similarity_reward/mean": 0.638512134552002, + "rewards/ngram_similarity_reward/std": 0.18305997550487518, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 457.1875, + "completions/mean_terminated_length": 457.1875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.28641754307451334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06602423638105392, + "learning_rate": 4.8947617637481076e-06, + "loss": -0.0184, + "num_tokens": 102295063.0, + "reward": 2.9706907272338867, + "reward_std": 0.5678014755249023, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.37694060802459717, + "rewards/ngram_similarity_reward/std": 0.32089659571647644, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 513.359375, + "completions/mean_terminated_length": 489.0000305175781, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.28686507048556725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057368215173482895, + "learning_rate": 4.894257995611841e-06, + "loss": -0.0077, + "num_tokens": 102435582.0, + "reward": 5.137367248535156, + "reward_std": 1.5684716701507568, + "rewards/accuracy_reward/mean": 4.546875, + "rewards/accuracy_reward/std": 2.2355687618255615, + "rewards/ngram_similarity_reward/mean": 0.590491771697998, + "rewards/ngram_similarity_reward/std": 0.2243080735206604, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 541.828125, + "completions/mean_terminated_length": 541.828125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.28731259789662117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05485881119966507, + "learning_rate": 4.893753053614277e-06, + "loss": -0.01, + "num_tokens": 102592355.0, + "reward": 3.4653518199920654, + "reward_std": 0.8715257048606873, + "rewards/accuracy_reward/mean": 2.84375, + "rewards/accuracy_reward/std": 3.0405657291412354, + "rewards/ngram_similarity_reward/mean": 0.6216020584106445, + "rewards/ngram_similarity_reward/std": 0.2996468245983124, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 478.953125, + "completions/mean_terminated_length": 478.953125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2877601253076751, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0552397146821022, + "learning_rate": 4.893246938031842e-06, + "loss": -0.0002, + "num_tokens": 102731856.0, + "reward": 2.9069056510925293, + "reward_std": 0.688241720199585, + "rewards/accuracy_reward/mean": 2.296875, + "rewards/accuracy_reward/std": 3.0351366996765137, + "rewards/ngram_similarity_reward/mean": 0.6100307106971741, + "rewards/ngram_similarity_reward/std": 0.379904180765152, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 403.171875, + "completions/mean_terminated_length": 403.171875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.288207652718729, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07412349432706833, + "learning_rate": 4.8927396491416086e-06, + "loss": 0.0488, + "num_tokens": 102911979.0, + "reward": 3.819849967956543, + "reward_std": 1.0436134338378906, + "rewards/accuracy_reward/mean": 3.078125, + "rewards/accuracy_reward/std": 3.0592284202575684, + "rewards/ngram_similarity_reward/mean": 0.7417250871658325, + "rewards/ngram_similarity_reward/std": 0.3448202311992645, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 472.609375, + "completions/mean_terminated_length": 472.609375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.288655180129783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06565144658088684, + "learning_rate": 4.892231187221287e-06, + "loss": 0.0324, + "num_tokens": 103100290.0, + "reward": 3.119292736053467, + "reward_std": 1.507504940032959, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5255429148674011, + "rewards/ngram_similarity_reward/std": 0.14479827880859375, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 461.78125, + "completions/mean_terminated_length": 461.78125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2891027075408369, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05364995822310448, + "learning_rate": 4.891721552549231e-06, + "loss": -0.015, + "num_tokens": 103222788.0, + "reward": 4.745234966278076, + "reward_std": 0.746932864189148, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5577349662780762, + "rewards/ngram_similarity_reward/std": 0.29511386156082153, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 414.734375, + "completions/mean_terminated_length": 414.734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2895502349518908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0833452045917511, + "learning_rate": 4.891210745404438e-06, + "loss": -0.0237, + "num_tokens": 103374115.0, + "reward": 1.926346778869629, + "reward_std": 2.1057116985321045, + "rewards/accuracy_reward/mean": 1.453125, + "rewards/accuracy_reward/std": 2.967708110809326, + "rewards/ngram_similarity_reward/mean": 0.473222017288208, + "rewards/ngram_similarity_reward/std": 0.4059741199016571, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 394.359375, + "completions/mean_terminated_length": 394.359375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2899977623629447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07352577894926071, + "learning_rate": 4.8906987660665476e-06, + "loss": -0.0028, + "num_tokens": 103561898.0, + "reward": 4.475122451782227, + "reward_std": 0.20881153643131256, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4751223921775818, + "rewards/ngram_similarity_reward/std": 0.2813428044319153, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 405.03125, + "completions/mean_terminated_length": 405.03125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.29044528977399864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09115469455718994, + "learning_rate": 4.8901856148158375e-06, + "loss": 0.0315, + "num_tokens": 103780764.0, + "reward": 2.3142309188842773, + "reward_std": 1.3402717113494873, + "rewards/accuracy_reward/mean": 1.875, + "rewards/accuracy_reward/std": 3.0315799713134766, + "rewards/ngram_similarity_reward/mean": 0.43923094868659973, + "rewards/ngram_similarity_reward/std": 0.20881466567516327, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 523.4375, + "completions/mean_terminated_length": 523.4375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.2908928171850526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05662143975496292, + "learning_rate": 4.889671291933231e-06, + "loss": -0.0036, + "num_tokens": 103930728.0, + "reward": 1.6569280624389648, + "reward_std": 0.4532318115234375, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5631780028343201, + "rewards/ngram_similarity_reward/std": 0.4234296381473541, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 475.84375, + "completions/mean_terminated_length": 475.84375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.2913403445961065, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06426508724689484, + "learning_rate": 4.8891557977002915e-06, + "loss": 0.009, + "num_tokens": 104147918.0, + "reward": 4.594308853149414, + "reward_std": 0.5861914753913879, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.7036839723587036, + "rewards/ngram_similarity_reward/std": 0.24967730045318604, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 462.875, + "completions/mean_terminated_length": 462.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.29178787200716044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05419683828949928, + "learning_rate": 4.888639132399221e-06, + "loss": 0.0002, + "num_tokens": 104316102.0, + "reward": 3.444973945617676, + "reward_std": 1.0651003122329712, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5699740648269653, + "rewards/ngram_similarity_reward/std": 0.20455977320671082, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 559.9375, + "completions/mean_terminated_length": 559.9375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.29223539941821436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058351751416921616, + "learning_rate": 4.888121296312867e-06, + "loss": 0.0426, + "num_tokens": 104454386.0, + "reward": 6.093581199645996, + "reward_std": 0.6258885860443115, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.7029563784599304, + "rewards/ngram_similarity_reward/std": 0.31050267815589905, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 456.09375, + "completions/mean_terminated_length": 456.09375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2926829268292683, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06823959946632385, + "learning_rate": 4.887602289724715e-06, + "loss": 0.0201, + "num_tokens": 104597688.0, + "reward": 4.1921162605285645, + "reward_std": 0.9890491962432861, + "rewards/accuracy_reward/mean": 3.546875, + "rewards/accuracy_reward/std": 2.9300289154052734, + "rewards/ngram_similarity_reward/mean": 0.6452413201332092, + "rewards/ngram_similarity_reward/std": 0.35380610823631287, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 496.9375, + "completions/mean_terminated_length": 496.9375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.29313045424032225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05976645275950432, + "learning_rate": 4.8870821129188915e-06, + "loss": 0.0346, + "num_tokens": 104694052.0, + "reward": 2.5127129554748535, + "reward_std": 1.7011172771453857, + "rewards/accuracy_reward/mean": 2.09375, + "rewards/accuracy_reward/std": 3.0327250957489014, + "rewards/ngram_similarity_reward/mean": 0.41896289587020874, + "rewards/ngram_similarity_reward/std": 0.30265194177627563, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 470.859375, + "completions/mean_terminated_length": 470.859375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.29357798165137616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05093248561024666, + "learning_rate": 4.886560766180165e-06, + "loss": -0.012, + "num_tokens": 104829035.0, + "reward": 3.722310781478882, + "reward_std": 1.4227113723754883, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.4723109006881714, + "rewards/ngram_similarity_reward/std": 0.33681726455688477, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 438.515625, + "completions/mean_terminated_length": 438.515625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2940255090624301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06808004528284073, + "learning_rate": 4.886038249793943e-06, + "loss": -0.0052, + "num_tokens": 104990956.0, + "reward": 3.241586685180664, + "reward_std": 0.6259248852729797, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6478367447853088, + "rewards/ngram_similarity_reward/std": 0.3466797471046448, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 494.890625, + "completions/mean_terminated_length": 494.890625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.294473036473484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08694926649332047, + "learning_rate": 4.885514564046276e-06, + "loss": -0.0148, + "num_tokens": 105229989.0, + "reward": 4.648185729980469, + "reward_std": 0.17134308815002441, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6481857895851135, + "rewards/ngram_similarity_reward/std": 0.34424903988838196, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 460.671875, + "completions/mean_terminated_length": 460.671875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.2949205638845379, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06605000793933868, + "learning_rate": 4.884989709223849e-06, + "loss": 0.0273, + "num_tokens": 105362992.0, + "reward": 4.155838966369629, + "reward_std": 0.8568543195724487, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6245891451835632, + "rewards/ngram_similarity_reward/std": 0.35362404584884644, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 337.890625, + "completions/mean_terminated_length": 337.890625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.29536809129559183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07516282796859741, + "learning_rate": 4.8844636856139946e-06, + "loss": -0.0098, + "num_tokens": 105524457.0, + "reward": 4.741536617279053, + "reward_std": 1.5814883708953857, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.36653655767440796, + "rewards/ngram_similarity_reward/std": 0.26399847865104675, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 344.59375, + "completions/mean_terminated_length": 344.59375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.2958156187066458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07712395489215851, + "learning_rate": 4.883936493504678e-06, + "loss": 0.0029, + "num_tokens": 105719711.0, + "reward": 4.29510498046875, + "reward_std": 0.5769654512405396, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.48260533809661865, + "rewards/ngram_similarity_reward/std": 0.33051398396492004, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 455.828125, + "completions/mean_terminated_length": 455.828125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.2962631461176997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052316777408123016, + "learning_rate": 4.8834081331845095e-06, + "loss": 0.0097, + "num_tokens": 105881860.0, + "reward": 4.3702569007873535, + "reward_std": 0.45644718408584595, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.46400687098503113, + "rewards/ngram_similarity_reward/std": 0.23395119607448578, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 468.65625, + "completions/mean_terminated_length": 468.65625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.29671067352875363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0560014508664608, + "learning_rate": 4.882878604942737e-06, + "loss": -0.0453, + "num_tokens": 106034334.0, + "reward": 1.8861474990844727, + "reward_std": 0.8671086430549622, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.4173974394798279, + "rewards/ngram_similarity_reward/std": 0.3207104206085205, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 393.03125, + "completions/mean_terminated_length": 393.03125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.29715820093980755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06537173688411713, + "learning_rate": 4.882347909069246e-06, + "loss": -0.0027, + "num_tokens": 106171744.0, + "reward": 2.1393954753875732, + "reward_std": 2.222316026687622, + "rewards/accuracy_reward/mean": 1.453125, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.686270534992218, + "rewards/ngram_similarity_reward/std": 0.24120375514030457, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 558.328125, + "completions/mean_terminated_length": 558.328125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.29760572835086146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04718297719955444, + "learning_rate": 4.881816045854562e-06, + "loss": -0.0246, + "num_tokens": 106351957.0, + "reward": 3.061079502105713, + "reward_std": 0.16121825575828552, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.5923295021057129, + "rewards/ngram_similarity_reward/std": 0.2995980381965637, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 476.953125, + "completions/mean_terminated_length": 476.953125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.29805325576191544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059488993138074875, + "learning_rate": 4.8812830155898535e-06, + "loss": 0.0113, + "num_tokens": 106528754.0, + "reward": 2.6967692375183105, + "reward_std": 0.6507419347763062, + "rewards/accuracy_reward/mean": 2.296875, + "rewards/accuracy_reward/std": 3.0351366996765137, + "rewards/ngram_similarity_reward/mean": 0.3998942971229553, + "rewards/ngram_similarity_reward/std": 0.33484646677970886, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 443.703125, + "completions/mean_terminated_length": 443.703125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.29850078317296935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07318499684333801, + "learning_rate": 4.880748818566923e-06, + "loss": 0.0234, + "num_tokens": 106660447.0, + "reward": 2.829117774963379, + "reward_std": 2.2205731868743896, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5166178345680237, + "rewards/ngram_similarity_reward/std": 0.2861684262752533, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 496.0625, + "completions/mean_terminated_length": 496.0625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.29894831058402327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05720727518200874, + "learning_rate": 4.880213455078214e-06, + "loss": 0.0109, + "num_tokens": 106802035.0, + "reward": 4.265432834625244, + "reward_std": 0.7786337733268738, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.881075382232666, + "rewards/ngram_similarity_reward/mean": 0.5466828346252441, + "rewards/ngram_similarity_reward/std": 0.2712513506412506, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 397.203125, + "completions/mean_terminated_length": 397.203125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.2993958379950772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06245892122387886, + "learning_rate": 4.879676925416806e-06, + "loss": -0.0206, + "num_tokens": 106967792.0, + "reward": 4.526856422424316, + "reward_std": 0.13491158187389374, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5268564224243164, + "rewards/ngram_similarity_reward/std": 0.37578147649765015, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 457.390625, + "completions/mean_terminated_length": 457.390625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.2998433654061311, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06878712773323059, + "learning_rate": 4.879139229876422e-06, + "loss": 0.041, + "num_tokens": 107112089.0, + "reward": 2.2240772247314453, + "reward_std": 1.5732771158218384, + "rewards/accuracy_reward/mean": 1.625, + "rewards/accuracy_reward/std": 2.9304099082946777, + "rewards/ngram_similarity_reward/mean": 0.5990773439407349, + "rewards/ngram_similarity_reward/std": 0.33353397250175476, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 504.71875, + "completions/mean_terminated_length": 504.71875, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.3002908928171851, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056628819555044174, + "learning_rate": 4.878600368751419e-06, + "loss": 0.0248, + "num_tokens": 107290327.0, + "reward": 5.932975769042969, + "reward_std": 0.2094021886587143, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.4329761564731598, + "rewards/ngram_similarity_reward/std": 0.28314828872680664, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 447.234375, + "completions/mean_terminated_length": 447.234375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.300738420228239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0657930076122284, + "learning_rate": 4.8780603423367924e-06, + "loss": 0.0008, + "num_tokens": 107435574.0, + "reward": 1.5740094184875488, + "reward_std": 0.15247586369514465, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5740094184875488, + "rewards/ngram_similarity_reward/std": 0.3831358850002289, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 341.15625, + "completions/mean_terminated_length": 341.15625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.3011859476392929, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08099042624235153, + "learning_rate": 4.877519150928178e-06, + "loss": -0.0101, + "num_tokens": 107615760.0, + "reward": 3.729093551635742, + "reward_std": 1.170121431350708, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7603434324264526, + "rewards/ngram_similarity_reward/std": 0.31671011447906494, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 452.953125, + "completions/mean_terminated_length": 452.953125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.3016334750503468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058367665857076645, + "learning_rate": 4.876976794821847e-06, + "loss": -0.0165, + "num_tokens": 107815037.0, + "reward": 3.6228060722351074, + "reward_std": 0.8924241065979004, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4665558934211731, + "rewards/ngram_similarity_reward/std": 0.23758745193481445, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 434.859375, + "completions/mean_terminated_length": 434.859375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.30208100246140074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07167170941829681, + "learning_rate": 4.876433274314709e-06, + "loss": 0.0331, + "num_tokens": 107947844.0, + "reward": 4.721240997314453, + "reward_std": 0.17073199152946472, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.721240758895874, + "rewards/ngram_similarity_reward/std": 0.2963772416114807, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 391.765625, + "completions/mean_terminated_length": 391.765625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.3025285298724547, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05580935254693031, + "learning_rate": 4.8758885897043115e-06, + "loss": 0.0061, + "num_tokens": 108083285.0, + "reward": 5.102502822875977, + "reward_std": 0.8618116974830627, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.6337529420852661, + "rewards/ngram_similarity_reward/std": 0.40071678161621094, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 470.109375, + "completions/mean_terminated_length": 470.109375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.3029760572835086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050674766302108765, + "learning_rate": 4.875342741288838e-06, + "loss": -0.0062, + "num_tokens": 108203852.0, + "reward": 4.888950824737549, + "reward_std": 0.1928871124982834, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8889507055282593, + "rewards/ngram_similarity_reward/std": 0.2250843644142151, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 453.796875, + "completions/mean_terminated_length": 453.796875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.30342358469456254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0681556984782219, + "learning_rate": 4.87479572936711e-06, + "loss": -0.0392, + "num_tokens": 108421407.0, + "reward": 2.8512043952941895, + "reward_std": 0.8869644403457642, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.3512043356895447, + "rewards/ngram_similarity_reward/std": 0.22943827509880066, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 411.15625, + "completions/mean_terminated_length": 411.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.30387111210561646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06231510639190674, + "learning_rate": 4.874247554238587e-06, + "loss": 0.0187, + "num_tokens": 108548841.0, + "reward": 2.546853542327881, + "reward_std": 0.9314303398132324, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.00260329246521, + "rewards/ngram_similarity_reward/mean": 0.5312284231185913, + "rewards/ngram_similarity_reward/std": 0.36710941791534424, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 373.359375, + "completions/mean_terminated_length": 373.359375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.3043186395166704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0652371495962143, + "learning_rate": 4.873698216203364e-06, + "loss": 0.0112, + "num_tokens": 108667216.0, + "reward": 6.076634407043457, + "reward_std": 0.2062225341796875, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5766341686248779, + "rewards/ngram_similarity_reward/std": 0.3065849840641022, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 376.90625, + "completions/mean_terminated_length": 376.90625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.30476616692772435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07112135738134384, + "learning_rate": 4.873147715562173e-06, + "loss": 0.0233, + "num_tokens": 108830522.0, + "reward": 5.461245536804199, + "reward_std": 0.980175256729126, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.7112454175949097, + "rewards/ngram_similarity_reward/std": 0.39009037613868713, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 384.453125, + "completions/mean_terminated_length": 384.453125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.30521369433877826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07231220602989197, + "learning_rate": 4.872596052616381e-06, + "loss": 0.0189, + "num_tokens": 108970679.0, + "reward": 5.196628093719482, + "reward_std": 1.5067083835601807, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.6341278553009033, + "rewards/ngram_similarity_reward/std": 0.29436030983924866, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 326.125, + "completions/mean_terminated_length": 326.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.3056612217498322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08024755865335464, + "learning_rate": 4.872043227667993e-06, + "loss": 0.0353, + "num_tokens": 109087855.0, + "reward": 3.7122302055358887, + "reward_std": 1.7788197994232178, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5559800863265991, + "rewards/ngram_similarity_reward/std": 0.33515578508377075, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 430.1875, + "completions/mean_terminated_length": 430.1875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.3061087491608861, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07237806171178818, + "learning_rate": 4.8714892410196504e-06, + "loss": 0.0074, + "num_tokens": 109239659.0, + "reward": 0.10017599165439606, + "reward_std": 0.5458903312683105, + "rewards/accuracy_reward/mean": -0.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.5064259767532349, + "rewards/ngram_similarity_reward/std": 0.26822736859321594, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 423.109375, + "completions/mean_terminated_length": 423.109375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.30655627657194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07401303946971893, + "learning_rate": 4.87093409297463e-06, + "loss": -0.0152, + "num_tokens": 109460226.0, + "reward": 2.838829755783081, + "reward_std": 0.14354142546653748, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.33882978558540344, + "rewards/ngram_similarity_reward/std": 0.25283560156822205, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 423.21875, + "completions/mean_terminated_length": 423.21875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.307003803982994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0632106363773346, + "learning_rate": 4.8703777838368435e-06, + "loss": -0.0158, + "num_tokens": 109596736.0, + "reward": 2.740299701690674, + "reward_std": 1.2174031734466553, + "rewards/accuracy_reward/mean": 2.0625, + "rewards/accuracy_reward/std": 3.06477689743042, + "rewards/ngram_similarity_reward/mean": 0.6777995824813843, + "rewards/ngram_similarity_reward/std": 0.24209056794643402, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 457.765625, + "completions/mean_terminated_length": 457.765625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.3074513313940479, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053076568990945816, + "learning_rate": 4.869820313910839e-06, + "loss": 0.017, + "num_tokens": 109724545.0, + "reward": 2.3994107246398926, + "reward_std": 0.8497989177703857, + "rewards/accuracy_reward/mean": 2.0, + "rewards/accuracy_reward/std": 2.7602622509002686, + "rewards/ngram_similarity_reward/mean": 0.39941078424453735, + "rewards/ngram_similarity_reward/std": 0.2008398175239563, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 458.5625, + "completions/mean_terminated_length": 458.5625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.3078988588051018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06561536341905594, + "learning_rate": 4.869261683501801e-06, + "loss": -0.0267, + "num_tokens": 109916965.0, + "reward": 2.2723612785339355, + "reward_std": 0.9010312557220459, + "rewards/accuracy_reward/mean": 1.828125, + "rewards/accuracy_reward/std": 2.9657018184661865, + "rewards/ngram_similarity_reward/mean": 0.4442363679409027, + "rewards/ngram_similarity_reward/std": 0.2966659963130951, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 465.296875, + "completions/mean_terminated_length": 465.296875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.30834638621615573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0637257769703865, + "learning_rate": 4.868701892915549e-06, + "loss": -0.0295, + "num_tokens": 110058120.0, + "reward": 4.125423431396484, + "reward_std": 2.302952289581299, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.500423789024353, + "rewards/ngram_similarity_reward/std": 0.321915864944458, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 394.15625, + "completions/mean_terminated_length": 394.15625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.30879391362720965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08084577322006226, + "learning_rate": 4.868140942458535e-06, + "loss": -0.0049, + "num_tokens": 110201810.0, + "reward": 4.3223981857299805, + "reward_std": 0.7608660459518433, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6036486625671387, + "rewards/ngram_similarity_reward/std": 0.3710053563117981, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 403.5, + "completions/mean_terminated_length": 403.5, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.3092414410382636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07621369510889053, + "learning_rate": 4.867578832437849e-06, + "loss": -0.0119, + "num_tokens": 110356402.0, + "reward": 5.923033714294434, + "reward_std": 0.14946463704109192, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.4230341911315918, + "rewards/ngram_similarity_reward/std": 0.3749995529651642, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 318.03125, + "completions/mean_terminated_length": 318.03125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.30968896844931754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07899641990661621, + "learning_rate": 4.867015563161216e-06, + "loss": 0.0141, + "num_tokens": 110489444.0, + "reward": 5.396566390991211, + "reward_std": 0.969638466835022, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.6465665102005005, + "rewards/ngram_similarity_reward/std": 0.3430982530117035, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 544.75, + "completions/mean_terminated_length": 544.75, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.31013649586037145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04846769943833351, + "learning_rate": 4.866451134936991e-06, + "loss": 0.009, + "num_tokens": 110659156.0, + "reward": 6.0783796310424805, + "reward_std": 0.5822182297706604, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6721292734146118, + "rewards/ngram_similarity_reward/std": 0.34718698263168335, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 472.421875, + "completions/mean_terminated_length": 472.421875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.31058402327142537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0697740688920021, + "learning_rate": 4.86588554807417e-06, + "loss": 0.0322, + "num_tokens": 110820767.0, + "reward": 4.631163597106934, + "reward_std": 0.09964464604854584, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6311637163162231, + "rewards/ngram_similarity_reward/std": 0.3299922049045563, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 388.390625, + "completions/mean_terminated_length": 388.390625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.3110315506824793, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08026900142431259, + "learning_rate": 4.86531880288238e-06, + "loss": 0.0145, + "num_tokens": 110958792.0, + "reward": 5.962247848510742, + "reward_std": 0.5507302284240723, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.5716227889060974, + "rewards/ngram_similarity_reward/std": 0.38306570053100586, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 454.125, + "completions/mean_terminated_length": 454.125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.3114790780935332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05776282772421837, + "learning_rate": 4.86475089967188e-06, + "loss": -0.014, + "num_tokens": 111126784.0, + "reward": 0.5443506836891174, + "reward_std": 1.210315227508545, + "rewards/accuracy_reward/mean": -0.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.5756007432937622, + "rewards/ngram_similarity_reward/std": 0.27265891432762146, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 382.734375, + "completions/mean_terminated_length": 382.734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3119266055045872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06984729319810867, + "learning_rate": 4.8641818387535674e-06, + "loss": -0.0328, + "num_tokens": 111280703.0, + "reward": 2.9061031341552734, + "reward_std": 1.9089854955673218, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7811028957366943, + "rewards/ngram_similarity_reward/std": 0.2761169373989105, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 439.390625, + "completions/mean_terminated_length": 439.390625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.3123741329156411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07779427617788315, + "learning_rate": 4.863611620438971e-06, + "loss": 0.0051, + "num_tokens": 111489048.0, + "reward": 4.679882526397705, + "reward_std": 1.6266264915466309, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.4923825263977051, + "rewards/ngram_similarity_reward/std": 0.29693710803985596, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 372.234375, + "completions/mean_terminated_length": 372.234375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.312821660326695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07299550622701645, + "learning_rate": 4.863040245040253e-06, + "loss": -0.0002, + "num_tokens": 111636247.0, + "reward": 3.861237049102783, + "reward_std": 1.870938777923584, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.3299867808818817, + "rewards/ngram_similarity_reward/std": 0.2714199423789978, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 472.578125, + "completions/mean_terminated_length": 472.578125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.3132691877377489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0719018429517746, + "learning_rate": 4.862467712870209e-06, + "loss": 0.031, + "num_tokens": 111793612.0, + "reward": 4.607283592224121, + "reward_std": 0.5640270113945007, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7010337114334106, + "rewards/ngram_similarity_reward/std": 0.3264153301715851, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 398.765625, + "completions/mean_terminated_length": 398.765625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.31371671514880284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05898962542414665, + "learning_rate": 4.861894024242269e-06, + "loss": -0.008, + "num_tokens": 111933629.0, + "reward": 5.79965877532959, + "reward_std": 1.086059808731079, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.6746585369110107, + "rewards/ngram_similarity_reward/std": 0.4353345036506653, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 423.421875, + "completions/mean_terminated_length": 423.421875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.3141642425598568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0693143978714943, + "learning_rate": 4.861319179470495e-06, + "loss": 0.0245, + "num_tokens": 112075736.0, + "reward": 2.0054614543914795, + "reward_std": 1.171264410018921, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5367113351821899, + "rewards/ngram_similarity_reward/std": 0.327590674161911, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 464.71875, + "completions/mean_terminated_length": 464.71875, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.3146117699709107, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05373014882206917, + "learning_rate": 4.860743178869583e-06, + "loss": 0.0038, + "num_tokens": 112207318.0, + "reward": 3.5712907314300537, + "reward_std": 2.0306131839752197, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6025407910346985, + "rewards/ngram_similarity_reward/std": 0.28013697266578674, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 401.984375, + "completions/mean_terminated_length": 401.984375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.31505929738196464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.064559206366539, + "learning_rate": 4.86016602275486e-06, + "loss": -0.0199, + "num_tokens": 112322181.0, + "reward": 2.7623391151428223, + "reward_std": 1.1833332777023315, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.0272817611694336, + "rewards/ngram_similarity_reward/mean": 0.5592142343521118, + "rewards/ngram_similarity_reward/std": 0.19574899971485138, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 459.875, + "completions/mean_terminated_length": 459.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.31550682479301856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0718189924955368, + "learning_rate": 4.8595877114422884e-06, + "loss": 0.0079, + "num_tokens": 112496957.0, + "reward": 3.048264980316162, + "reward_std": 0.13818775117397308, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5482649803161621, + "rewards/ngram_similarity_reward/std": 0.24722221493721008, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 414.65625, + "completions/mean_terminated_length": 414.65625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.3159543522040725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06648236513137817, + "learning_rate": 4.85900824524846e-06, + "loss": 0.023, + "num_tokens": 112662759.0, + "reward": 4.96194314956665, + "reward_std": 1.1429578065872192, + "rewards/accuracy_reward/mean": 4.265625, + "rewards/accuracy_reward/std": 2.467195510864258, + "rewards/ngram_similarity_reward/mean": 0.6963184475898743, + "rewards/ngram_similarity_reward/std": 0.315858393907547, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 402.984375, + "completions/mean_terminated_length": 402.984375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.31640187961512645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.070754274725914, + "learning_rate": 4.8584276244906e-06, + "loss": 0.0114, + "num_tokens": 112782006.0, + "reward": 5.393426895141602, + "reward_std": 1.2861722707748413, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.549676775932312, + "rewards/ngram_similarity_reward/std": 0.3782258629798889, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 355.546875, + "completions/mean_terminated_length": 355.546875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.31684940702618036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0816711038351059, + "learning_rate": 4.857845849486566e-06, + "loss": -0.02, + "num_tokens": 112916265.0, + "reward": 3.490420341491699, + "reward_std": 0.8595148324966431, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.4279203712940216, + "rewards/ngram_similarity_reward/std": 0.2959030866622925, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 481.234375, + "completions/mean_terminated_length": 481.234375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.3172969344372343, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06152066960930824, + "learning_rate": 4.857262920554848e-06, + "loss": 0.0112, + "num_tokens": 113025592.0, + "reward": 5.843417167663574, + "reward_std": 0.7281556725502014, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.6246671676635742, + "rewards/ngram_similarity_reward/std": 0.2800602614879608, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 367.125, + "completions/mean_terminated_length": 367.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.3177444618482882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08010809123516083, + "learning_rate": 4.8566788380145665e-06, + "loss": -0.0067, + "num_tokens": 113166368.0, + "reward": 2.8130388259887695, + "reward_std": 0.08101053535938263, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.31303858757019043, + "rewards/ngram_similarity_reward/std": 0.27182063460350037, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 371.671875, + "completions/mean_terminated_length": 371.671875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3181919892593421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07613039761781693, + "learning_rate": 4.856093602185473e-06, + "loss": 0.0029, + "num_tokens": 113373771.0, + "reward": 4.351356506347656, + "reward_std": 0.5276011228561401, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.44510650634765625, + "rewards/ngram_similarity_reward/std": 0.3404860198497772, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 451.890625, + "completions/mean_terminated_length": 451.890625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.3186395166703961, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08762478083372116, + "learning_rate": 4.855507213387954e-06, + "loss": -0.0114, + "num_tokens": 113518340.0, + "reward": 6.231166839599609, + "reward_std": 0.29846394062042236, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7311668395996094, + "rewards/ngram_similarity_reward/std": 0.3233652412891388, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 463.609375, + "completions/mean_terminated_length": 463.609375, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.31908704408145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05265612527728081, + "learning_rate": 4.854919671943021e-06, + "loss": 0.0227, + "num_tokens": 113650379.0, + "reward": 2.966203212738037, + "reward_std": 1.021661400794983, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5599533319473267, + "rewards/ngram_similarity_reward/std": 0.26916348934173584, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.3195345714925039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07360535115003586, + "learning_rate": 4.8543309781723235e-06, + "loss": -0.002, + "num_tokens": 113788443.0, + "reward": 6.086665153503418, + "reward_std": 0.5752182006835938, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6804152131080627, + "rewards/ngram_similarity_reward/std": 0.31077635288238525, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 388.46875, + "completions/mean_terminated_length": 388.46875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.31998209890355783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06774787604808807, + "learning_rate": 4.853741132398136e-06, + "loss": -0.0142, + "num_tokens": 113931817.0, + "reward": 4.677279949188232, + "reward_std": 0.9036829471588135, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.677280068397522, + "rewards/ngram_similarity_reward/std": 0.32574334740638733, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 382.46875, + "completions/mean_terminated_length": 382.46875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.32042962631461175, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.061975304037332535, + "learning_rate": 4.853150134943367e-06, + "loss": -0.0007, + "num_tokens": 114050775.0, + "reward": 2.2052435874938965, + "reward_std": 1.7568916082382202, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.5489935874938965, + "rewards/ngram_similarity_reward/std": 0.265791654586792, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 437.515625, + "completions/mean_terminated_length": 437.515625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3208771537256657, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05750018358230591, + "learning_rate": 4.852557986131555e-06, + "loss": -0.0013, + "num_tokens": 114183704.0, + "reward": 5.553138732910156, + "reward_std": 1.0766979455947876, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.4281388521194458, + "rewards/ngram_similarity_reward/std": 0.2769782245159149, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 522.546875, + "completions/mean_terminated_length": 522.546875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.32132468113671964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05540424585342407, + "learning_rate": 4.8519646862868675e-06, + "loss": 0.0045, + "num_tokens": 114336971.0, + "reward": 3.764643907546997, + "reward_std": 1.3306835889816284, + "rewards/accuracy_reward/mean": 3.203125, + "rewards/accuracy_reward/std": 2.995656728744507, + "rewards/ngram_similarity_reward/mean": 0.5615189075469971, + "rewards/ngram_similarity_reward/std": 0.3798581063747406, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 403.09375, + "completions/mean_terminated_length": 403.09375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.32177220854777355, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07903230935335159, + "learning_rate": 4.851370235734103e-06, + "loss": 0.0372, + "num_tokens": 114479969.0, + "reward": 5.639060974121094, + "reward_std": 1.0079065561294556, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.795311450958252, + "rewards/ngram_similarity_reward/std": 0.32375478744506836, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 449.453125, + "completions/mean_terminated_length": 449.453125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.32221973595882747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06887117773294449, + "learning_rate": 4.85077463479869e-06, + "loss": -0.0308, + "num_tokens": 114705374.0, + "reward": 1.1147079467773438, + "reward_std": 0.863875150680542, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 2.381934404373169, + "rewards/ngram_similarity_reward/mean": 0.5209579467773438, + "rewards/ngram_similarity_reward/std": 0.2823002338409424, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 391.6875, + "completions/mean_terminated_length": 391.6875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3226672633698814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07737100124359131, + "learning_rate": 4.850177883806688e-06, + "loss": 0.0387, + "num_tokens": 114853418.0, + "reward": 3.036330223083496, + "reward_std": 0.6309786438941956, + "rewards/accuracy_reward/mean": 2.53125, + "rewards/accuracy_reward/std": 3.0961766242980957, + "rewards/ngram_similarity_reward/mean": 0.5050802230834961, + "rewards/ngram_similarity_reward/std": 0.40981554985046387, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 490.0625, + "completions/mean_terminated_length": 490.0625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.32311479078093536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06010664626955986, + "learning_rate": 4.849579983084782e-06, + "loss": 0.0351, + "num_tokens": 115037470.0, + "reward": 4.774776935577393, + "reward_std": 1.4736918210983276, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7747770547866821, + "rewards/ngram_similarity_reward/std": 0.26835912466049194, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 355.046875, + "completions/mean_terminated_length": 355.046875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.3235623181919893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07075344771146774, + "learning_rate": 4.848980932960292e-06, + "loss": 0.0033, + "num_tokens": 115192289.0, + "reward": 2.6875319480895996, + "reward_std": 1.4031498432159424, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6562820672988892, + "rewards/ngram_similarity_reward/std": 0.36360806226730347, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 399.28125, + "completions/mean_terminated_length": 399.28125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3240098456030432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0756743997335434, + "learning_rate": 4.848380733761164e-06, + "loss": -0.0084, + "num_tokens": 115352051.0, + "reward": 4.374879837036133, + "reward_std": 2.440624237060547, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7498795986175537, + "rewards/ngram_similarity_reward/std": 0.2921641767024994, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 411.890625, + "completions/mean_terminated_length": 411.890625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.3244573730140971, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07049600034952164, + "learning_rate": 4.847779385815971e-06, + "loss": -0.0151, + "num_tokens": 115532108.0, + "reward": 4.49118709564209, + "reward_std": 0.833372950553894, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.866187334060669, + "rewards/ngram_similarity_reward/std": 0.28561943769454956, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 432.625, + "completions/mean_terminated_length": 432.625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.324904900425151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08293016254901886, + "learning_rate": 4.847176889453921e-06, + "loss": -0.0273, + "num_tokens": 115678692.0, + "reward": 5.021811485290527, + "reward_std": 0.9971826672554016, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.5530613660812378, + "rewards/ngram_similarity_reward/std": 0.3139452338218689, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 452.484375, + "completions/mean_terminated_length": 452.484375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.325352427836205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06269259005784988, + "learning_rate": 4.846573245004844e-06, + "loss": -0.0029, + "num_tokens": 115835971.0, + "reward": 4.7243499755859375, + "reward_std": 0.22585970163345337, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7243503332138062, + "rewards/ngram_similarity_reward/std": 0.3056999444961548, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 360.0625, + "completions/mean_terminated_length": 360.0625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.3257999552472589, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08586455881595612, + "learning_rate": 4.845968452799203e-06, + "loss": -0.0181, + "num_tokens": 115954839.0, + "reward": 1.1131083965301514, + "reward_std": 1.2290589809417725, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 2.3712377548217773, + "rewards/ngram_similarity_reward/mean": 0.5037335157394409, + "rewards/ngram_similarity_reward/std": 0.2511395812034607, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 386.21875, + "completions/mean_terminated_length": 386.21875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3262474826583128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06456200033426285, + "learning_rate": 4.845362513168088e-06, + "loss": 0.0137, + "num_tokens": 116087573.0, + "reward": 3.3358211517333984, + "reward_std": 2.3673391342163086, + "rewards/accuracy_reward/mean": 2.84375, + "rewards/accuracy_reward/std": 3.0405657291412354, + "rewards/ngram_similarity_reward/mean": 0.4920710325241089, + "rewards/ngram_similarity_reward/std": 0.3257303833961487, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 417.484375, + "completions/mean_terminated_length": 417.484375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.32669501006936674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.069620780646801, + "learning_rate": 4.844755426443216e-06, + "loss": -0.0308, + "num_tokens": 116250084.0, + "reward": 4.713657855987549, + "reward_std": 0.9554732441902161, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.6199080348014832, + "rewards/ngram_similarity_reward/std": 0.2207847386598587, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 398.28125, + "completions/mean_terminated_length": 398.28125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.32714253748042066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06426721811294556, + "learning_rate": 4.844147192956935e-06, + "loss": -0.0432, + "num_tokens": 116370214.0, + "reward": 3.733760118484497, + "reward_std": 0.9057095646858215, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6712601184844971, + "rewards/ngram_similarity_reward/std": 0.30741092562675476, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 411.953125, + "completions/mean_terminated_length": 411.953125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.32759006489147463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07242885231971741, + "learning_rate": 4.843537813042217e-06, + "loss": 0.0345, + "num_tokens": 116514611.0, + "reward": 4.382899284362793, + "reward_std": 1.9558072090148926, + "rewards/accuracy_reward/mean": 3.6875, + "rewards/accuracy_reward/std": 2.816476583480835, + "rewards/ngram_similarity_reward/mean": 0.695399284362793, + "rewards/ngram_similarity_reward/std": 0.3914608657360077, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 476.765625, + "completions/mean_terminated_length": 476.765625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.32803759230252855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07082340866327286, + "learning_rate": 4.8429272870326635e-06, + "loss": -0.0057, + "num_tokens": 116651812.0, + "reward": 5.053439140319824, + "reward_std": 1.3016170263290405, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.5846890211105347, + "rewards/ngram_similarity_reward/std": 0.24929215013980865, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 443.0625, + "completions/mean_terminated_length": 443.0625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.32848511971358246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07560112327337265, + "learning_rate": 4.842315615262504e-06, + "loss": -0.0019, + "num_tokens": 116781128.0, + "reward": -0.02607668563723564, + "reward_std": 0.24621905386447906, + "rewards/accuracy_reward/mean": -0.59375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/ngram_similarity_reward/mean": 0.5676733255386353, + "rewards/ngram_similarity_reward/std": 0.35898521542549133, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 322.625, + "completions/mean_terminated_length": 322.625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.3289326471246364, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07953772693872452, + "learning_rate": 4.8417027980665945e-06, + "loss": -0.0063, + "num_tokens": 116935056.0, + "reward": 5.001359462738037, + "reward_std": 0.703096866607666, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.7201095819473267, + "rewards/ngram_similarity_reward/std": 0.4384860098361969, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 386.515625, + "completions/mean_terminated_length": 386.515625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.3293801745356903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07305855304002762, + "learning_rate": 4.8410888357804176e-06, + "loss": -0.009, + "num_tokens": 117111985.0, + "reward": 4.526159763336182, + "reward_std": 0.1608743965625763, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5261598229408264, + "rewards/ngram_similarity_reward/std": 0.30847957730293274, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 438.9375, + "completions/mean_terminated_length": 438.9375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.3298277019467442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07190577685832977, + "learning_rate": 4.840473728740084e-06, + "loss": 0.0651, + "num_tokens": 117270173.0, + "reward": 3.601214647293091, + "reward_std": 1.6631498336791992, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.6480897665023804, + "rewards/ngram_similarity_reward/std": 0.3352556824684143, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 562.625, + "completions/mean_terminated_length": 562.625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.3302752293577982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06534037739038467, + "learning_rate": 4.839857477282331e-06, + "loss": -0.0154, + "num_tokens": 117433365.0, + "reward": 4.571503639221191, + "reward_std": 0.13676393032073975, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5715036392211914, + "rewards/ngram_similarity_reward/std": 0.23960945010185242, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 418.8125, + "completions/mean_terminated_length": 418.8125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3307227567688521, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06592286378145218, + "learning_rate": 4.83924008174452e-06, + "loss": -0.0127, + "num_tokens": 117608409.0, + "reward": 6.083271026611328, + "reward_std": 0.12093131989240646, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5832710266113281, + "rewards/ngram_similarity_reward/std": 0.24732209742069244, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 494.421875, + "completions/mean_terminated_length": 494.421875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.331170284179906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05999520793557167, + "learning_rate": 4.838621542464642e-06, + "loss": -0.0446, + "num_tokens": 117759108.0, + "reward": 4.562652587890625, + "reward_std": 0.8388803601264954, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.562652587890625, + "rewards/ngram_similarity_reward/std": 0.48074784874916077, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 366.03125, + "completions/mean_terminated_length": 366.03125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.33161781159095993, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09077738970518112, + "learning_rate": 4.838001859781311e-06, + "loss": 0.0005, + "num_tokens": 117964454.0, + "reward": 2.956620216369629, + "reward_std": 0.8590655326843262, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.48787015676498413, + "rewards/ngram_similarity_reward/std": 0.42288804054260254, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 484.375, + "completions/mean_terminated_length": 484.375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.33206533900201385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0587838850915432, + "learning_rate": 4.8373810340337704e-06, + "loss": -0.0207, + "num_tokens": 118144622.0, + "reward": 5.071628570556641, + "reward_std": 0.6989190578460693, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.7903785705566406, + "rewards/ngram_similarity_reward/std": 0.3258940875530243, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 374.671875, + "completions/mean_terminated_length": 374.671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.3325128664130678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06940338015556335, + "learning_rate": 4.836759065561887e-06, + "loss": -0.0062, + "num_tokens": 118280185.0, + "reward": 4.492125034332275, + "reward_std": 1.3534945249557495, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6796249151229858, + "rewards/ngram_similarity_reward/std": 0.30835971236228943, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 614.46875, + "completions/mean_terminated_length": 614.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.33296039382412174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05301578715443611, + "learning_rate": 4.8361359547061535e-06, + "loss": -0.0017, + "num_tokens": 118436999.0, + "reward": 1.7326395511627197, + "reward_std": 2.1927366256713867, + "rewards/accuracy_reward/mean": 1.34375, + "rewards/accuracy_reward/std": 2.8296544551849365, + "rewards/ngram_similarity_reward/mean": 0.3888895511627197, + "rewards/ngram_similarity_reward/std": 0.20641785860061646, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 575.4375, + "completions/mean_terminated_length": 575.4375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.33340792123517565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04947161301970482, + "learning_rate": 4.835511701807689e-06, + "loss": -0.0175, + "num_tokens": 118618227.0, + "reward": 3.2324278354644775, + "reward_std": 0.16435107588768005, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7324278354644775, + "rewards/ngram_similarity_reward/std": 0.2556518018245697, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 444.5625, + "completions/mean_terminated_length": 444.5625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.33385544864622957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06482308357954025, + "learning_rate": 4.834886307208235e-06, + "loss": -0.0144, + "num_tokens": 118742487.0, + "reward": 2.9939393997192383, + "reward_std": 0.10695018619298935, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.4939391314983368, + "rewards/ngram_similarity_reward/std": 0.22385334968566895, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 466.1875, + "completions/mean_terminated_length": 466.1875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.3343029760572835, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06717753410339355, + "learning_rate": 4.834259771250162e-06, + "loss": 0.0452, + "num_tokens": 118882035.0, + "reward": 5.7880048751831055, + "reward_std": 1.249608039855957, + "rewards/accuracy_reward/mean": 5.015625, + "rewards/accuracy_reward/std": 1.68081796169281, + "rewards/ngram_similarity_reward/mean": 0.7723801732063293, + "rewards/ngram_similarity_reward/std": 0.2816872000694275, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 434.125, + "completions/mean_terminated_length": 434.125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.33475050346833746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07000329345464706, + "learning_rate": 4.8336320942764634e-06, + "loss": -0.0102, + "num_tokens": 119045499.0, + "reward": 4.440940856933594, + "reward_std": 0.5408289432525635, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.5503160357475281, + "rewards/ngram_similarity_reward/std": 0.38620853424072266, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 495.296875, + "completions/mean_terminated_length": 495.296875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.3351980308793914, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.052323050796985626, + "learning_rate": 4.833003276630756e-06, + "loss": 0.0275, + "num_tokens": 119183662.0, + "reward": 1.5198101997375488, + "reward_std": 0.09568122029304504, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5198101997375488, + "rewards/ngram_similarity_reward/std": 0.4339534342288971, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 376.3125, + "completions/mean_terminated_length": 376.3125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3356455582904453, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07794438302516937, + "learning_rate": 4.832373318657283e-06, + "loss": -0.0132, + "num_tokens": 119396882.0, + "reward": 4.292140960693359, + "reward_std": 0.7269996404647827, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5733910799026489, + "rewards/ngram_similarity_reward/std": 0.24113622307777405, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 445.359375, + "completions/mean_terminated_length": 445.359375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.3360930857014992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0707058385014534, + "learning_rate": 4.831742220700911e-06, + "loss": 0.0098, + "num_tokens": 119628553.0, + "reward": 4.441817760467529, + "reward_std": 1.7053776979446411, + "rewards/accuracy_reward/mean": 3.609375, + "rewards/accuracy_reward/std": 2.829084634780884, + "rewards/ngram_similarity_reward/mean": 0.8324428796768188, + "rewards/ngram_similarity_reward/std": 0.24423421919345856, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 437.1875, + "completions/mean_terminated_length": 437.1875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.3365406131125531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0756855458021164, + "learning_rate": 4.8311099831071316e-06, + "loss": 0.0006, + "num_tokens": 119767557.0, + "reward": 2.9687860012054443, + "reward_std": 1.2867786884307861, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5625358819961548, + "rewards/ngram_similarity_reward/std": 0.37464669346809387, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 426.015625, + "completions/mean_terminated_length": 426.015625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.3369881405236071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07023408263921738, + "learning_rate": 4.830476606222058e-06, + "loss": -0.015, + "num_tokens": 119951910.0, + "reward": 1.4517123699188232, + "reward_std": 0.5338603258132935, + "rewards/accuracy_reward/mean": 0.890625, + "rewards/accuracy_reward/std": 2.5734739303588867, + "rewards/ngram_similarity_reward/mean": 0.5610873699188232, + "rewards/ngram_similarity_reward/std": 0.2960151731967926, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 398.234375, + "completions/mean_terminated_length": 398.234375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.337435667934661, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07396300137042999, + "learning_rate": 4.8298420903924294e-06, + "loss": 0.0015, + "num_tokens": 120117189.0, + "reward": 0.6071276664733887, + "reward_std": 1.2192909717559814, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.45087766647338867, + "rewards/ngram_similarity_reward/std": 0.3548135757446289, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 406.609375, + "completions/mean_terminated_length": 406.609375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.3378831953457149, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0677541047334671, + "learning_rate": 4.829206435965608e-06, + "loss": 0.038, + "num_tokens": 120254060.0, + "reward": 5.05821418762207, + "reward_std": 1.5589964389801025, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.4957137703895569, + "rewards/ngram_similarity_reward/std": 0.28042688965797424, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 461.9375, + "completions/mean_terminated_length": 461.9375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.33833072275676884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07829014956951141, + "learning_rate": 4.828569643289579e-06, + "loss": 0.0159, + "num_tokens": 120394472.0, + "reward": 5.997868537902832, + "reward_std": 0.09310601651668549, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.49786829948425293, + "rewards/ngram_similarity_reward/std": 0.2516043186187744, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 406.875, + "completions/mean_terminated_length": 406.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.33877825016782276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0906175747513771, + "learning_rate": 4.827931712712951e-06, + "loss": 0.0037, + "num_tokens": 120568144.0, + "reward": 5.099058151245117, + "reward_std": 0.9228776097297668, + "rewards/accuracy_reward/mean": 4.453125, + "rewards/accuracy_reward/std": 2.319206953048706, + "rewards/ngram_similarity_reward/mean": 0.6459333300590515, + "rewards/ngram_similarity_reward/std": 0.30590537190437317, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 428.671875, + "completions/mean_terminated_length": 428.671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.33922577757887673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06237898766994476, + "learning_rate": 4.827292644584954e-06, + "loss": -0.0214, + "num_tokens": 120700987.0, + "reward": 4.630217552185059, + "reward_std": 0.129164919257164, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6302177906036377, + "rewards/ngram_similarity_reward/std": 0.35490062832832336, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 370.0625, + "completions/mean_terminated_length": 370.0625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.33967330498993065, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06669400632381439, + "learning_rate": 4.826652439255443e-06, + "loss": -0.0037, + "num_tokens": 120831663.0, + "reward": 3.382824420928955, + "reward_std": 1.113814115524292, + "rewards/accuracy_reward/mean": 2.5625, + "rewards/accuracy_reward/std": 3.2702362537384033, + "rewards/ngram_similarity_reward/mean": 0.8203244805335999, + "rewards/ngram_similarity_reward/std": 0.26841795444488525, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 439.1875, + "completions/mean_terminated_length": 439.1875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.34012083240098456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06376458704471588, + "learning_rate": 4.826011097074895e-06, + "loss": 0.0206, + "num_tokens": 120967899.0, + "reward": 4.12385368347168, + "reward_std": 0.9235842823982239, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5926036238670349, + "rewards/ngram_similarity_reward/std": 0.2803894877433777, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 401.859375, + "completions/mean_terminated_length": 401.859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.3405683598120385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08693066984415054, + "learning_rate": 4.825368618394407e-06, + "loss": 0.0204, + "num_tokens": 121179874.0, + "reward": 1.5503162145614624, + "reward_std": 0.8148245215415955, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5503160953521729, + "rewards/ngram_similarity_reward/std": 0.14231202006340027, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 380.671875, + "completions/mean_terminated_length": 354.20635986328125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.3410158872230924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0824568048119545, + "learning_rate": 4.8247250035657036e-06, + "loss": 0.0296, + "num_tokens": 121407437.0, + "reward": 4.567397117614746, + "reward_std": 0.2737312912940979, + "rewards/accuracy_reward/mean": 3.921875, + "rewards/accuracy_reward/std": 2.764885902404785, + "rewards/ngram_similarity_reward/mean": 0.6455221176147461, + "rewards/ngram_similarity_reward/std": 0.26874208450317383, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 464.359375, + "completions/mean_terminated_length": 464.359375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.34146341463414637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07261194288730621, + "learning_rate": 4.824080252941125e-06, + "loss": 0.0008, + "num_tokens": 121575588.0, + "reward": 5.645310878753662, + "reward_std": 0.9448514580726624, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.614061176776886, + "rewards/ngram_similarity_reward/std": 0.26361551880836487, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 477.1875, + "completions/mean_terminated_length": 477.1875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.3419109420452003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06376535445451736, + "learning_rate": 4.823434366873636e-06, + "loss": -0.0149, + "num_tokens": 121735952.0, + "reward": 5.15456485748291, + "reward_std": 1.620121955871582, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.5920648574829102, + "rewards/ngram_similarity_reward/std": 0.350754052400589, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 404.796875, + "completions/mean_terminated_length": 378.71429443359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3423584694562542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07600030303001404, + "learning_rate": 4.822787345716826e-06, + "loss": 0.0008, + "num_tokens": 121888595.0, + "reward": 2.8059439659118652, + "reward_std": 0.07767674326896667, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.3059441149234772, + "rewards/ngram_similarity_reward/std": 0.10181257873773575, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 415.078125, + "completions/mean_terminated_length": 415.078125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.3428059968673081, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09150941669940948, + "learning_rate": 4.8221391898249005e-06, + "loss": -0.0073, + "num_tokens": 122045240.0, + "reward": 2.7265686988830566, + "reward_std": 0.7440415024757385, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6015689969062805, + "rewards/ngram_similarity_reward/std": 0.32731401920318604, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 390.046875, + "completions/mean_terminated_length": 390.046875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.34325352427836203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08688319474458694, + "learning_rate": 4.821489899552688e-06, + "loss": -0.0285, + "num_tokens": 122209819.0, + "reward": 4.436158657073975, + "reward_std": 0.49523043632507324, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.3424083888530731, + "rewards/ngram_similarity_reward/std": 0.2369903326034546, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 460.25, + "completions/mean_terminated_length": 460.25, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.343701051689416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08063489198684692, + "learning_rate": 4.820839475255641e-06, + "loss": -0.0008, + "num_tokens": 122410091.0, + "reward": -0.36356696486473083, + "reward_std": 0.26210200786590576, + "rewards/accuracy_reward/mean": -0.625, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/ngram_similarity_reward/mean": 0.26143306493759155, + "rewards/ngram_similarity_reward/std": 0.10493180900812149, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 444.28125, + "completions/mean_terminated_length": 444.28125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.3441485791004699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09552402794361115, + "learning_rate": 4.820187917289829e-06, + "loss": 0.0433, + "num_tokens": 122593261.0, + "reward": 2.9167373180389404, + "reward_std": 0.44910675287246704, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.51048743724823, + "rewards/ngram_similarity_reward/std": 0.2428485006093979, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 417.625, + "completions/mean_terminated_length": 417.625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.34459610651152384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07206974923610687, + "learning_rate": 4.819535226011943e-06, + "loss": 0.0232, + "num_tokens": 122745157.0, + "reward": 3.581268072128296, + "reward_std": 1.694632649421692, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6125181317329407, + "rewards/ngram_similarity_reward/std": 0.44275662302970886, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 320.015625, + "completions/mean_terminated_length": 320.015625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.34504363392257775, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09001357853412628, + "learning_rate": 4.818881401779296e-06, + "loss": 0.0099, + "num_tokens": 122867302.0, + "reward": 4.758194923400879, + "reward_std": 0.22100293636322021, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7581952810287476, + "rewards/ngram_similarity_reward/std": 0.3720369338989258, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 379.1875, + "completions/mean_terminated_length": 379.1875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.34549116133363167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0792035311460495, + "learning_rate": 4.818226444949819e-06, + "loss": -0.0154, + "num_tokens": 123002306.0, + "reward": 4.732837677001953, + "reward_std": 0.5926983952522278, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.826587438583374, + "rewards/ngram_similarity_reward/std": 0.31210973858833313, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 441.296875, + "completions/mean_terminated_length": 441.296875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3459386887446856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07282985746860504, + "learning_rate": 4.817570355882067e-06, + "loss": 0.0359, + "num_tokens": 123138501.0, + "reward": 0.6833294630050659, + "reward_std": 0.9691657423973083, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.5270794630050659, + "rewards/ngram_similarity_reward/std": 0.3545078933238983, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 416.359375, + "completions/mean_terminated_length": 416.359375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.34638621615573956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06430497765541077, + "learning_rate": 4.816913134935208e-06, + "loss": 0.0343, + "num_tokens": 123296300.0, + "reward": 5.439652442932129, + "reward_std": 1.4340357780456543, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.8771523237228394, + "rewards/ngram_similarity_reward/std": 0.29859817028045654, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 393.5625, + "completions/mean_terminated_length": 393.5625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3468337435667935, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0746258869767189, + "learning_rate": 4.8162547824690365e-06, + "loss": 0.0325, + "num_tokens": 123456832.0, + "reward": 4.419753551483154, + "reward_std": 0.6457646489143372, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6072536706924438, + "rewards/ngram_similarity_reward/std": 0.2622377872467041, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 444.96875, + "completions/mean_terminated_length": 444.96875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.3472812709778474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07858631014823914, + "learning_rate": 4.815595298843963e-06, + "loss": 0.0147, + "num_tokens": 123606398.0, + "reward": 1.5636564493179321, + "reward_std": 1.172206163406372, + "rewards/accuracy_reward/mean": 0.96875, + "rewards/accuracy_reward/std": 2.6425621509552, + "rewards/ngram_similarity_reward/mean": 0.5949063897132874, + "rewards/ngram_similarity_reward/std": 0.3762511909008026, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 532.890625, + "completions/mean_terminated_length": 532.890625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.3477287983889013, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06011474132537842, + "learning_rate": 4.814934684421018e-06, + "loss": -0.0029, + "num_tokens": 123752823.0, + "reward": 3.070727586746216, + "reward_std": 0.12144973129034042, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5707273483276367, + "rewards/ngram_similarity_reward/std": 0.39387303590774536, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 454.28125, + "completions/mean_terminated_length": 454.28125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.3481763257999552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06932368129491806, + "learning_rate": 4.8142729395618505e-06, + "loss": -0.0243, + "num_tokens": 123903465.0, + "reward": 1.4879626035690308, + "reward_std": 0.9695565104484558, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.581712543964386, + "rewards/ngram_similarity_reward/std": 0.21801921725273132, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 453.890625, + "completions/mean_terminated_length": 453.890625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3486238532110092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0718824565410614, + "learning_rate": 4.813610064628729e-06, + "loss": -0.0058, + "num_tokens": 124056482.0, + "reward": 5.090573310852051, + "reward_std": 1.7853457927703857, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.8093235492706299, + "rewards/ngram_similarity_reward/std": 0.39342236518859863, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 361.140625, + "completions/mean_terminated_length": 361.140625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.3490713806220631, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13112983107566833, + "learning_rate": 4.8129460599845416e-06, + "loss": -0.0114, + "num_tokens": 124233579.0, + "reward": 2.248539686203003, + "reward_std": 1.9886976480484009, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 3.32961106300354, + "rewards/ngram_similarity_reward/mean": 0.5922897458076477, + "rewards/ngram_similarity_reward/std": 0.321114718914032, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 391.046875, + "completions/mean_terminated_length": 391.046875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.349518908033117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07911917567253113, + "learning_rate": 4.812280925992791e-06, + "loss": 0.0378, + "num_tokens": 124383454.0, + "reward": 2.1933369636535645, + "reward_std": 1.624046802520752, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.630837082862854, + "rewards/ngram_similarity_reward/std": 0.3428136706352234, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 405.171875, + "completions/mean_terminated_length": 405.171875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.34996643544417094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08272143453359604, + "learning_rate": 4.811614663017603e-06, + "loss": -0.0183, + "num_tokens": 124575481.0, + "reward": 4.482691764831543, + "reward_std": 0.5504340529441833, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5764422416687012, + "rewards/ngram_similarity_reward/std": 0.2652113139629364, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 458.015625, + "completions/mean_terminated_length": 458.015625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.35041396285522486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06321236491203308, + "learning_rate": 4.810947271423719e-06, + "loss": -0.0047, + "num_tokens": 124745994.0, + "reward": 4.337434768676758, + "reward_std": 1.5873150825500488, + "rewards/accuracy_reward/mean": 3.5625, + "rewards/accuracy_reward/std": 2.905249834060669, + "rewards/ngram_similarity_reward/mean": 0.7749345302581787, + "rewards/ngram_similarity_reward/std": 0.27499550580978394, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 494.0, + "completions/mean_terminated_length": 494.0, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.35086149026627883, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06817396730184555, + "learning_rate": 4.810278751576498e-06, + "loss": 0.0293, + "num_tokens": 124899482.0, + "reward": 1.5161876678466797, + "reward_std": 0.1321793794631958, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5161875486373901, + "rewards/ngram_similarity_reward/std": 0.29837268590927124, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 526.40625, + "completions/mean_terminated_length": 526.40625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.35130901767733275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07075538486242294, + "learning_rate": 4.809609103841917e-06, + "loss": -0.0369, + "num_tokens": 125075924.0, + "reward": 4.434210300445557, + "reward_std": 0.5480722188949585, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.543585479259491, + "rewards/ngram_similarity_reward/std": 0.30443742871284485, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 455.796875, + "completions/mean_terminated_length": 455.796875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.35175654508838666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08288660645484924, + "learning_rate": 4.808938328586573e-06, + "loss": 0.0019, + "num_tokens": 125272263.0, + "reward": 0.5084143877029419, + "reward_std": 1.5748283863067627, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 1.9065468311309814, + "rewards/ngram_similarity_reward/mean": 0.3834143877029419, + "rewards/ngram_similarity_reward/std": 0.21710476279258728, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 425.21875, + "completions/mean_terminated_length": 425.21875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.3522040724994406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09674753993749619, + "learning_rate": 4.808266426177674e-06, + "loss": -0.0288, + "num_tokens": 125513173.0, + "reward": 1.8925563097000122, + "reward_std": 0.9368428587913513, + "rewards/accuracy_reward/mean": 1.484375, + "rewards/accuracy_reward/std": 2.941181182861328, + "rewards/ngram_similarity_reward/mean": 0.40818145871162415, + "rewards/ngram_similarity_reward/std": 0.2608228325843811, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 356.671875, + "completions/mean_terminated_length": 356.671875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3526515999104945, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06609658896923065, + "learning_rate": 4.807593396983053e-06, + "loss": -0.0023, + "num_tokens": 125681984.0, + "reward": 4.787121295928955, + "reward_std": 1.3088996410369873, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7871214151382446, + "rewards/ngram_similarity_reward/std": 0.33557581901550293, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 446.96875, + "completions/mean_terminated_length": 446.96875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.35309912732154847, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0749947801232338, + "learning_rate": 4.806919241371153e-06, + "loss": -0.0101, + "num_tokens": 125865406.0, + "reward": 2.938890218734741, + "reward_std": 1.8307483196258545, + "rewards/accuracy_reward/mean": 2.28125, + "rewards/accuracy_reward/std": 3.1596100330352783, + "rewards/ngram_similarity_reward/mean": 0.6576401591300964, + "rewards/ngram_similarity_reward/std": 0.3600626289844513, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 387.3125, + "completions/mean_terminated_length": 387.3125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3535466547326024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07980632036924362, + "learning_rate": 4.806243959711037e-06, + "loss": -0.0154, + "num_tokens": 126032418.0, + "reward": 2.545635223388672, + "reward_std": 1.1480920314788818, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.00260329246521, + "rewards/ngram_similarity_reward/mean": 0.5300101041793823, + "rewards/ngram_similarity_reward/std": 0.27905625104904175, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 391.875, + "completions/mean_terminated_length": 391.875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3539941821436563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07998666167259216, + "learning_rate": 4.805567552372385e-06, + "loss": 0.0563, + "num_tokens": 126182538.0, + "reward": 6.1160430908203125, + "reward_std": 0.5871672034263611, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.7254180312156677, + "rewards/ngram_similarity_reward/std": 0.30045783519744873, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 465.40625, + "completions/mean_terminated_length": 465.40625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.3544417095547102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06252842396497726, + "learning_rate": 4.804890019725492e-06, + "loss": 0.0193, + "num_tokens": 126348212.0, + "reward": 3.336127519607544, + "reward_std": 0.6081538200378418, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.648627519607544, + "rewards/ngram_similarity_reward/std": 0.41256552934646606, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 411.5625, + "completions/mean_terminated_length": 411.5625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.35488923696576413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07198772579431534, + "learning_rate": 4.804211362141267e-06, + "loss": 0.0334, + "num_tokens": 126462888.0, + "reward": 4.9459052085876465, + "reward_std": 0.7472313642501831, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.7584052085876465, + "rewards/ngram_similarity_reward/std": 0.2952753007411957, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 369.765625, + "completions/mean_terminated_length": 369.765625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.3553367643768181, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08004353940486908, + "learning_rate": 4.8035315799912404e-06, + "loss": -0.0228, + "num_tokens": 126599897.0, + "reward": 3.7750284671783447, + "reward_std": 1.6776546239852905, + "rewards/accuracy_reward/mean": 3.09375, + "rewards/accuracy_reward/std": 3.037954330444336, + "rewards/ngram_similarity_reward/mean": 0.6812787055969238, + "rewards/ngram_similarity_reward/std": 0.440616250038147, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 472.640625, + "completions/mean_terminated_length": 472.640625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.355784291787872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07230333983898163, + "learning_rate": 4.802850673647553e-06, + "loss": 0.0271, + "num_tokens": 126781618.0, + "reward": 3.80275297164917, + "reward_std": 1.2974958419799805, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.7402530908584595, + "rewards/ngram_similarity_reward/std": 0.3653397262096405, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 508.1875, + "completions/mean_terminated_length": 508.1875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.35623181919892594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06609354168176651, + "learning_rate": 4.802168643482963e-06, + "loss": -0.0023, + "num_tokens": 126959294.0, + "reward": 2.959211826324463, + "reward_std": 1.0579187870025635, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6467118263244629, + "rewards/ngram_similarity_reward/std": 0.281398743391037, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 454.859375, + "completions/mean_terminated_length": 454.859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.35667934660997985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06357131153345108, + "learning_rate": 4.801485489870845e-06, + "loss": -0.0384, + "num_tokens": 127091813.0, + "reward": 2.302091598510742, + "reward_std": 2.009054183959961, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.7395914793014526, + "rewards/ngram_similarity_reward/std": 0.25282952189445496, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 348.390625, + "completions/mean_terminated_length": 348.390625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.35712687402103377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07078349590301514, + "learning_rate": 4.800801213185184e-06, + "loss": 0.0286, + "num_tokens": 127249374.0, + "reward": 4.6454057693481445, + "reward_std": 1.6441264152526855, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.5516562461853027, + "rewards/ngram_similarity_reward/std": 0.34286314249038696, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 495.890625, + "completions/mean_terminated_length": 495.890625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.35757440143208774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0576631985604763, + "learning_rate": 4.800115813800587e-06, + "loss": 0.034, + "num_tokens": 127385719.0, + "reward": 5.4827423095703125, + "reward_std": 0.9034347534179688, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5452421307563782, + "rewards/ngram_similarity_reward/std": 0.347225159406662, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 453.78125, + "completions/mean_terminated_length": 453.78125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.35802192884314166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06489190459251404, + "learning_rate": 4.799429292092272e-06, + "loss": 0.0004, + "num_tokens": 127558409.0, + "reward": 3.6968436241149902, + "reward_std": 1.5084903240203857, + "rewards/accuracy_reward/mean": 3.078125, + "rewards/accuracy_reward/std": 3.0592284202575684, + "rewards/ngram_similarity_reward/mean": 0.6187184453010559, + "rewards/ngram_similarity_reward/std": 0.39123642444610596, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 436.109375, + "completions/mean_terminated_length": 436.109375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.3584694562541956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06589549034833908, + "learning_rate": 4.798741648436068e-06, + "loss": 0.0169, + "num_tokens": 127696000.0, + "reward": 3.2289295196533203, + "reward_std": 0.13976755738258362, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7289294600486755, + "rewards/ngram_similarity_reward/std": 0.22076164186000824, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 429.1875, + "completions/mean_terminated_length": 429.1875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.3589169836652495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07002793252468109, + "learning_rate": 4.798052883208424e-06, + "loss": 0.0133, + "num_tokens": 127861036.0, + "reward": 4.147583484649658, + "reward_std": 1.162605881690979, + "rewards/accuracy_reward/mean": 3.6875, + "rewards/accuracy_reward/std": 2.816476583480835, + "rewards/ngram_similarity_reward/mean": 0.46008336544036865, + "rewards/ngram_similarity_reward/std": 0.39208683371543884, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 372.09375, + "completions/mean_terminated_length": 372.09375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.3593645110763034, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07022594660520554, + "learning_rate": 4.797362996786398e-06, + "loss": -0.0101, + "num_tokens": 128016226.0, + "reward": 4.534200668334961, + "reward_std": 0.6052448153495789, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.72170090675354, + "rewards/ngram_similarity_reward/std": 0.3617556691169739, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 381.140625, + "completions/mean_terminated_length": 381.140625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.3598120384873574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09523069113492966, + "learning_rate": 4.796671989547667e-06, + "loss": 0.036, + "num_tokens": 128213915.0, + "reward": 1.9837148189544678, + "reward_std": 0.841371476650238, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.42121487855911255, + "rewards/ngram_similarity_reward/std": 0.22523535788059235, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 464.53125, + "completions/mean_terminated_length": 464.53125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.3602595658984113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06792967766523361, + "learning_rate": 4.795979861870517e-06, + "loss": -0.0196, + "num_tokens": 128337133.0, + "reward": 3.660709857940674, + "reward_std": 1.4073338508605957, + "rewards/accuracy_reward/mean": 3.03125, + "rewards/accuracy_reward/std": 3.0130341053009033, + "rewards/ngram_similarity_reward/mean": 0.6294599771499634, + "rewards/ngram_similarity_reward/std": 0.300203412771225, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 580.90625, + "completions/mean_terminated_length": 580.90625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3607070933094652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05525697395205498, + "learning_rate": 4.79528661413385e-06, + "loss": -0.0096, + "num_tokens": 128487687.0, + "reward": 4.130392074584961, + "reward_std": 0.8165697455406189, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.5053920745849609, + "rewards/ngram_similarity_reward/std": 0.29331842064857483, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 446.96875, + "completions/mean_terminated_length": 446.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.3611546207205191, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06297452002763748, + "learning_rate": 4.79459224671718e-06, + "loss": -0.0148, + "num_tokens": 128631093.0, + "reward": 3.1118602752685547, + "reward_std": 0.8096895217895508, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6118603348731995, + "rewards/ngram_similarity_reward/std": 0.240114226937294, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 371.265625, + "completions/mean_terminated_length": 371.265625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.36160214813157304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07497435808181763, + "learning_rate": 4.7938967600006345e-06, + "loss": -0.0025, + "num_tokens": 128764758.0, + "reward": 3.28179931640625, + "reward_std": 1.3945362567901611, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.3130495250225067, + "rewards/ngram_similarity_reward/std": 0.24311913549900055, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 385.6875, + "completions/mean_terminated_length": 385.6875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.36204967554262696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07288374751806259, + "learning_rate": 4.793200154364952e-06, + "loss": 0.0016, + "num_tokens": 128902994.0, + "reward": 5.032406806945801, + "reward_std": 0.857367217540741, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.5636566877365112, + "rewards/ngram_similarity_reward/std": 0.2474435567855835, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 495.0, + "completions/mean_terminated_length": 495.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.36249720295368093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060755811631679535, + "learning_rate": 4.792502430191489e-06, + "loss": -0.035, + "num_tokens": 129072722.0, + "reward": 5.3756914138793945, + "reward_std": 1.7483309507369995, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.8131917119026184, + "rewards/ngram_similarity_reward/std": 0.18953640758991241, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 508.984375, + "completions/mean_terminated_length": 508.984375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.36294473036473485, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10541582852602005, + "learning_rate": 4.791803587862207e-06, + "loss": 0.093, + "num_tokens": 129302561.0, + "reward": 4.274901866912842, + "reward_std": 1.602287769317627, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.4624021053314209, + "rewards/ngram_similarity_reward/std": 0.3578868508338928, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 402.828125, + "completions/mean_terminated_length": 402.828125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.36339225777578876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06787524372339249, + "learning_rate": 4.791103627759684e-06, + "loss": -0.004, + "num_tokens": 129464822.0, + "reward": 6.206984996795654, + "reward_std": 0.16916683316230774, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7069849967956543, + "rewards/ngram_similarity_reward/std": 0.307819128036499, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 426.265625, + "completions/mean_terminated_length": 426.265625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.3638397851868427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08130761981010437, + "learning_rate": 4.7904025502671085e-06, + "loss": 0.0069, + "num_tokens": 129604519.0, + "reward": 2.542485237121582, + "reward_std": 0.8359096646308899, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.5112350583076477, + "rewards/ngram_similarity_reward/std": 0.2730526328086853, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 417.15625, + "completions/mean_terminated_length": 417.15625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.3642873125978966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07702480256557465, + "learning_rate": 4.789700355768283e-06, + "loss": -0.0035, + "num_tokens": 129739105.0, + "reward": 3.377739906311035, + "reward_std": 1.9412826299667358, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5964900851249695, + "rewards/ngram_similarity_reward/std": 0.3124755024909973, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 324.546875, + "completions/mean_terminated_length": 324.546875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.36473484000895057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09259375929832458, + "learning_rate": 4.788997044647618e-06, + "loss": -0.0038, + "num_tokens": 129846964.0, + "reward": 3.172454833984375, + "reward_std": 1.0749475955963135, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.4849551022052765, + "rewards/ngram_similarity_reward/std": 0.23124173283576965, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 392.484375, + "completions/mean_terminated_length": 392.484375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.3651823674200045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07372072339057922, + "learning_rate": 4.788292617290137e-06, + "loss": 0.0082, + "num_tokens": 129981875.0, + "reward": 2.8951284885406494, + "reward_std": 0.7518053650856018, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6763784885406494, + "rewards/ngram_similarity_reward/std": 0.3303958475589752, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 478.296875, + "completions/mean_terminated_length": 478.296875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.3656298948310584, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.050354402512311935, + "learning_rate": 4.787587074081476e-06, + "loss": 0.0352, + "num_tokens": 130164454.0, + "reward": 4.420853614807129, + "reward_std": 0.7614438533782959, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7958537340164185, + "rewards/ngram_similarity_reward/std": 0.4121065139770508, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 389.296875, + "completions/mean_terminated_length": 389.296875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.3660774222421123, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0717056393623352, + "learning_rate": 4.786880415407879e-06, + "loss": -0.0265, + "num_tokens": 130348473.0, + "reward": 3.9296135902404785, + "reward_std": 1.0928839445114136, + "rewards/accuracy_reward/mean": 3.125, + "rewards/accuracy_reward/std": 3.1040170192718506, + "rewards/ngram_similarity_reward/mean": 0.8046135306358337, + "rewards/ngram_similarity_reward/std": 0.3383404314517975, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 471.6875, + "completions/mean_terminated_length": 471.6875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.36652494965316623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06714696437120438, + "learning_rate": 4.786172641656203e-06, + "loss": 0.0059, + "num_tokens": 130577109.0, + "reward": 2.7257204055786133, + "reward_std": 0.6096923351287842, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.41322028636932373, + "rewards/ngram_similarity_reward/std": 0.16564498841762543, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 446.328125, + "completions/mean_terminated_length": 446.328125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3669724770642202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07103020697832108, + "learning_rate": 4.785463753213914e-06, + "loss": 0.0415, + "num_tokens": 130805866.0, + "reward": 4.421981334686279, + "reward_std": 1.171364188194275, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.796981692314148, + "rewards/ngram_similarity_reward/std": 0.306751549243927, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 444.984375, + "completions/mean_terminated_length": 444.984375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.3674200044752741, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08048597723245621, + "learning_rate": 4.784753750469089e-06, + "loss": -0.0204, + "num_tokens": 130960793.0, + "reward": 2.020061492919922, + "reward_std": 0.7841981649398804, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6450613737106323, + "rewards/ngram_similarity_reward/std": 0.3096904754638672, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 398.796875, + "completions/mean_terminated_length": 398.796875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.36786753188632804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0813199058175087, + "learning_rate": 4.784042633810414e-06, + "loss": 0.0313, + "num_tokens": 131113244.0, + "reward": 4.319128036499023, + "reward_std": 0.8010765910148621, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6941279172897339, + "rewards/ngram_similarity_reward/std": 0.2977048456668854, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 430.421875, + "completions/mean_terminated_length": 430.421875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.36831505929738195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08103030920028687, + "learning_rate": 4.783330403627188e-06, + "loss": -0.0209, + "num_tokens": 131249975.0, + "reward": 4.559375762939453, + "reward_std": 0.19386625289916992, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5593758821487427, + "rewards/ngram_similarity_reward/std": 0.3220345675945282, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 410.34375, + "completions/mean_terminated_length": 410.34375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.36876258670843587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07567102462053299, + "learning_rate": 4.782617060309314e-06, + "loss": 0.0129, + "num_tokens": 131397981.0, + "reward": 4.7222490310668945, + "reward_std": 0.1576964557170868, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7222490310668945, + "rewards/ngram_similarity_reward/std": 0.2786788046360016, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 420.359375, + "completions/mean_terminated_length": 420.359375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.36921011411948984, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06875015050172806, + "learning_rate": 4.7819026042473095e-06, + "loss": 0.0181, + "num_tokens": 131562948.0, + "reward": 3.585540771484375, + "reward_std": 0.8135175108909607, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7105408310890198, + "rewards/ngram_similarity_reward/std": 0.3925935626029968, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 475.0, + "completions/mean_terminated_length": 475.0, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.36965764153054376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05377297103404999, + "learning_rate": 4.7811870358322985e-06, + "loss": 0.0007, + "num_tokens": 131682884.0, + "reward": 4.313448905944824, + "reward_std": 1.2124178409576416, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.7821989059448242, + "rewards/ngram_similarity_reward/std": 0.35354679822921753, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 436.78125, + "completions/mean_terminated_length": 436.78125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.3701051689415977, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0779440850019455, + "learning_rate": 4.780470355456015e-06, + "loss": 0.0133, + "num_tokens": 131845030.0, + "reward": 4.533501625061035, + "reward_std": 0.4481460452079773, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6272518634796143, + "rewards/ngram_similarity_reward/std": 0.2969803512096405, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 454.671875, + "completions/mean_terminated_length": 454.671875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.3705526963526516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07512392848730087, + "learning_rate": 4.779752563510802e-06, + "loss": 0.0025, + "num_tokens": 132009345.0, + "reward": 1.7263811826705933, + "reward_std": 0.6355293989181519, + "rewards/accuracy_reward/mean": 1.171875, + "rewards/accuracy_reward/std": 2.7316761016845703, + "rewards/ngram_similarity_reward/mean": 0.554506242275238, + "rewards/ngram_similarity_reward/std": 0.37061354517936707, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 420.96875, + "completions/mean_terminated_length": 420.96875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.3710002237637055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07953844219446182, + "learning_rate": 4.779033660389609e-06, + "loss": 0.0005, + "num_tokens": 132149967.0, + "reward": 1.5752811431884766, + "reward_std": 0.841187059879303, + "rewards/accuracy_reward/mean": 1.171875, + "rewards/accuracy_reward/std": 2.7316761016845703, + "rewards/ngram_similarity_reward/mean": 0.40340620279312134, + "rewards/ngram_similarity_reward/std": 0.2614307105541229, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 383.609375, + "completions/mean_terminated_length": 383.609375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.3714477511747595, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05196192115545273, + "learning_rate": 4.7783136464859955e-06, + "loss": 0.0109, + "num_tokens": 132311542.0, + "reward": 6.148770809173584, + "reward_std": 0.6349098086357117, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.9300208687782288, + "rewards/ngram_similarity_reward/std": 0.2650148272514343, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 570.96875, + "completions/mean_terminated_length": 547.5238647460938, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.3718952785858134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05970247462391853, + "learning_rate": 4.77759252219413e-06, + "loss": 0.0341, + "num_tokens": 132453316.0, + "reward": 4.687633037567139, + "reward_std": 0.1622379720211029, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6876329183578491, + "rewards/ngram_similarity_reward/std": 0.3011060059070587, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 462.34375, + "completions/mean_terminated_length": 462.34375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3723428059968673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07145772129297256, + "learning_rate": 4.776870287908788e-06, + "loss": 0.0045, + "num_tokens": 132643898.0, + "reward": 3.6631765365600586, + "reward_std": 1.224141001701355, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.6163015365600586, + "rewards/ngram_similarity_reward/std": 0.2998967170715332, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 489.328125, + "completions/mean_terminated_length": 489.328125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.3727903334079212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07153812050819397, + "learning_rate": 4.776146944025351e-06, + "loss": -0.0005, + "num_tokens": 132822335.0, + "reward": 4.376694679260254, + "reward_std": 1.0175447463989258, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.657944917678833, + "rewards/ngram_similarity_reward/std": 0.3186005651950836, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 458.421875, + "completions/mean_terminated_length": 458.421875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.37323786081897514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0736360028386116, + "learning_rate": 4.775422490939809e-06, + "loss": -0.0249, + "num_tokens": 132967290.0, + "reward": 3.1810238361358643, + "reward_std": 0.1251983940601349, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6810238361358643, + "rewards/ngram_similarity_reward/std": 0.30840054154396057, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 444.46875, + "completions/mean_terminated_length": 444.46875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.3736853882300291, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08364483714103699, + "learning_rate": 4.774696929048761e-06, + "loss": -0.0182, + "num_tokens": 133178968.0, + "reward": 5.657986640930176, + "reward_std": 1.2540444135665894, + "rewards/accuracy_reward/mean": 5.015625, + "rewards/accuracy_reward/std": 1.68081796169281, + "rewards/ngram_similarity_reward/mean": 0.6423616409301758, + "rewards/ngram_similarity_reward/std": 0.37156593799591064, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 424.53125, + "completions/mean_terminated_length": 424.53125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.37413291564108303, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08677200227975845, + "learning_rate": 4.7739702587494105e-06, + "loss": 0.0051, + "num_tokens": 133359402.0, + "reward": 1.4438471794128418, + "reward_std": 0.5690076947212219, + "rewards/accuracy_reward/mean": 1.0625, + "rewards/accuracy_reward/std": 2.695528507232666, + "rewards/ngram_similarity_reward/mean": 0.3813472390174866, + "rewards/ngram_similarity_reward/std": 0.25122034549713135, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 552.40625, + "completions/mean_terminated_length": 552.40625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.37458044305213695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06587916612625122, + "learning_rate": 4.77324248043957e-06, + "loss": 0.0143, + "num_tokens": 133519300.0, + "reward": 6.079070091247559, + "reward_std": 0.2363794445991516, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.579070508480072, + "rewards/ngram_similarity_reward/std": 0.33267608284950256, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 520.453125, + "completions/mean_terminated_length": 520.453125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.37502797046319086, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05705634877085686, + "learning_rate": 4.7725135945176545e-06, + "loss": -0.0057, + "num_tokens": 133698721.0, + "reward": 4.572680950164795, + "reward_std": 0.8313631415367126, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5726807713508606, + "rewards/ngram_similarity_reward/std": 0.42959290742874146, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 367.5625, + "completions/mean_terminated_length": 367.5625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.3754754978742448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09058309346437454, + "learning_rate": 4.771783601382693e-06, + "loss": 0.0048, + "num_tokens": 133819445.0, + "reward": 3.2510876655578613, + "reward_std": 0.20160742104053497, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7510875463485718, + "rewards/ngram_similarity_reward/std": 0.34454336762428284, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 370.3125, + "completions/mean_terminated_length": 370.3125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.37592302528529875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07292758673429489, + "learning_rate": 4.771052501434311e-06, + "loss": 0.0127, + "num_tokens": 133989113.0, + "reward": 3.195587635040283, + "reward_std": 0.4828079342842102, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7893376350402832, + "rewards/ngram_similarity_reward/std": 0.20010864734649658, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 388.28125, + "completions/mean_terminated_length": 388.28125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.37637055269635267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07030453532934189, + "learning_rate": 4.770320295072748e-06, + "loss": -0.0096, + "num_tokens": 134169947.0, + "reward": 5.593858242034912, + "reward_std": 1.3770513534545898, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.7501083016395569, + "rewards/ngram_similarity_reward/std": 0.3666737973690033, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 516.5, + "completions/mean_terminated_length": 516.5, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.3768180801074066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06108309328556061, + "learning_rate": 4.769586982698845e-06, + "loss": 0.0281, + "num_tokens": 134319675.0, + "reward": 4.372067451477051, + "reward_std": 1.1060233116149902, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7470671534538269, + "rewards/ngram_similarity_reward/std": 0.2952858507633209, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 488.125, + "completions/mean_terminated_length": 488.125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.3772656075184605, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06603636592626572, + "learning_rate": 4.768852564714049e-06, + "loss": 0.006, + "num_tokens": 134473635.0, + "reward": 4.100539207458496, + "reward_std": 0.8879941701889038, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.6630393266677856, + "rewards/ngram_similarity_reward/std": 0.257058322429657, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 379.78125, + "completions/mean_terminated_length": 379.78125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.3777131349295144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08025288581848145, + "learning_rate": 4.768117041520414e-06, + "loss": 0.0109, + "num_tokens": 134636229.0, + "reward": 5.075590133666992, + "reward_std": 1.9024856090545654, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7005902528762817, + "rewards/ngram_similarity_reward/std": 0.2098340094089508, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 499.015625, + "completions/mean_terminated_length": 499.015625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.37816066234056833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06738443672657013, + "learning_rate": 4.767380413520598e-06, + "loss": 0.0734, + "num_tokens": 134835254.0, + "reward": 2.6038081645965576, + "reward_std": 1.4416784048080444, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4788079857826233, + "rewards/ngram_similarity_reward/std": 0.2530724108219147, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 495.0625, + "completions/mean_terminated_length": 495.0625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.3786081897516223, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05641628056764603, + "learning_rate": 4.766642681117862e-06, + "loss": -0.0054, + "num_tokens": 134958810.0, + "reward": 4.782063961029053, + "reward_std": 0.1804514229297638, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7820637226104736, + "rewards/ngram_similarity_reward/std": 0.2899869680404663, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 379.15625, + "completions/mean_terminated_length": 379.15625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3790557171626762, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06728257983922958, + "learning_rate": 4.7659038447160735e-06, + "loss": 0.0059, + "num_tokens": 135081236.0, + "reward": 6.2020463943481445, + "reward_std": 0.6189609169960022, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.8895463943481445, + "rewards/ngram_similarity_reward/std": 0.23006707429885864, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 472.890625, + "completions/mean_terminated_length": 472.890625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.37950324457373014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06494006514549255, + "learning_rate": 4.7651639047197045e-06, + "loss": -0.0379, + "num_tokens": 135247101.0, + "reward": 5.4059553146362305, + "reward_std": 0.9737482070922852, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.65595543384552, + "rewards/ngram_similarity_reward/std": 0.3362525999546051, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 512.5625, + "completions/mean_terminated_length": 512.5625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.37995077198478405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058171700686216354, + "learning_rate": 4.764422861533832e-06, + "loss": -0.0188, + "num_tokens": 135407329.0, + "reward": 5.091101169586182, + "reward_std": 0.7125576138496399, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.8098511695861816, + "rewards/ngram_similarity_reward/std": 0.21350127458572388, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 425.890625, + "completions/mean_terminated_length": 425.890625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.38039829939583797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07442991435527802, + "learning_rate": 4.763680715564134e-06, + "loss": -0.0164, + "num_tokens": 135559594.0, + "reward": 3.915182113647461, + "reward_std": 1.5784138441085815, + "rewards/accuracy_reward/mean": 3.109375, + "rewards/accuracy_reward/std": 3.125000238418579, + "rewards/ngram_similarity_reward/mean": 0.8058068752288818, + "rewards/ngram_similarity_reward/std": 0.26200681924819946, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 561.984375, + "completions/mean_terminated_length": 561.984375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.38084582680689194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058048561215400696, + "learning_rate": 4.762937467216894e-06, + "loss": -0.0308, + "num_tokens": 135688793.0, + "reward": 4.483916282653809, + "reward_std": 0.10813301801681519, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4839160442352295, + "rewards/ngram_similarity_reward/std": 0.26713013648986816, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 333.640625, + "completions/mean_terminated_length": 333.640625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.38129335421794586, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07030487060546875, + "learning_rate": 4.762193116898999e-06, + "loss": -0.0266, + "num_tokens": 135926914.0, + "reward": 2.676629066467285, + "reward_std": 0.8994293808937073, + "rewards/accuracy_reward/mean": 1.9375, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.7391289472579956, + "rewards/ngram_similarity_reward/std": 0.2241910994052887, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 481.8125, + "completions/mean_terminated_length": 481.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.3817408816289998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07144621759653091, + "learning_rate": 4.761447665017941e-06, + "loss": -0.0084, + "num_tokens": 136068998.0, + "reward": 2.857234001159668, + "reward_std": 1.1894598007202148, + "rewards/accuracy_reward/mean": 2.265625, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.591609001159668, + "rewards/ngram_similarity_reward/std": 0.3215799927711487, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 495.328125, + "completions/mean_terminated_length": 495.328125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.3821884090400537, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06216570734977722, + "learning_rate": 4.760701111981811e-06, + "loss": -0.0139, + "num_tokens": 136221339.0, + "reward": 4.3738694190979, + "reward_std": 0.703353226184845, + "rewards/accuracy_reward/mean": 3.765625, + "rewards/accuracy_reward/std": 2.8015992641448975, + "rewards/ngram_similarity_reward/mean": 0.6082445383071899, + "rewards/ngram_similarity_reward/std": 0.5324356555938721, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 380.296875, + "completions/mean_terminated_length": 380.296875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.3826359364511076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08928291499614716, + "learning_rate": 4.759953458199306e-06, + "loss": 0.0081, + "num_tokens": 136367230.0, + "reward": 3.6769230365753174, + "reward_std": 1.3307888507843018, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6144229173660278, + "rewards/ngram_similarity_reward/std": 0.22901320457458496, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 419.390625, + "completions/mean_terminated_length": 419.390625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.3830834638621616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06559686362743378, + "learning_rate": 4.759204704079724e-06, + "loss": -0.0055, + "num_tokens": 136505703.0, + "reward": 2.4887826442718506, + "reward_std": 0.9327364563941956, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.5669077634811401, + "rewards/ngram_similarity_reward/std": 0.41834351420402527, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 407.828125, + "completions/mean_terminated_length": 407.828125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.3835309912732155, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07738467305898666, + "learning_rate": 4.7584548500329654e-06, + "loss": 0.0314, + "num_tokens": 136730732.0, + "reward": 4.649050712585449, + "reward_std": 0.19639216363430023, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6490504741668701, + "rewards/ngram_similarity_reward/std": 0.28024277091026306, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 377.53125, + "completions/mean_terminated_length": 377.53125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.3839785186842694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08628908544778824, + "learning_rate": 4.757703896469535e-06, + "loss": 0.0016, + "num_tokens": 136854414.0, + "reward": 4.2408447265625, + "reward_std": 0.7140259742736816, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5220947265625, + "rewards/ngram_similarity_reward/std": 0.2464819848537445, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 447.328125, + "completions/mean_terminated_length": 447.328125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3844260460953233, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06706064939498901, + "learning_rate": 4.756951843800537e-06, + "loss": 0.011, + "num_tokens": 136972851.0, + "reward": 5.8076677322387695, + "reward_std": 0.831523060798645, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.68266761302948, + "rewards/ngram_similarity_reward/std": 0.4277319610118866, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 464.859375, + "completions/mean_terminated_length": 464.859375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.38487357350637724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07416027039289474, + "learning_rate": 4.756198692437679e-06, + "loss": -0.0561, + "num_tokens": 137198074.0, + "reward": 4.66461706161499, + "reward_std": 0.14941942691802979, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6646170616149902, + "rewards/ngram_similarity_reward/std": 0.4006292223930359, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 423.59375, + "completions/mean_terminated_length": 423.59375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.3853211009174312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0800650343298912, + "learning_rate": 4.755444442793269e-06, + "loss": -0.0034, + "num_tokens": 137372336.0, + "reward": 3.025587320327759, + "reward_std": 0.53461754322052, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.619337260723114, + "rewards/ngram_similarity_reward/std": 0.2948019504547119, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 456.953125, + "completions/mean_terminated_length": 456.953125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.38576862832848513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07621988654136658, + "learning_rate": 4.754689095280214e-06, + "loss": 0.0054, + "num_tokens": 137511597.0, + "reward": 4.9953718185424805, + "reward_std": 1.2807860374450684, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.5266216993331909, + "rewards/ngram_similarity_reward/std": 0.2767489552497864, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 406.21875, + "completions/mean_terminated_length": 406.21875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.38621615573953905, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05878997966647148, + "learning_rate": 4.753932650312028e-06, + "loss": -0.0025, + "num_tokens": 137692715.0, + "reward": 2.9865684509277344, + "reward_std": 0.4927492141723633, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5803183913230896, + "rewards/ngram_similarity_reward/std": 0.32269957661628723, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 388.03125, + "completions/mean_terminated_length": 388.03125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.38666368315059296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08189192414283752, + "learning_rate": 4.753175108302821e-06, + "loss": -0.0043, + "num_tokens": 137837405.0, + "reward": 0.13617974519729614, + "reward_std": 0.46592044830322266, + "rewards/accuracy_reward/mean": -0.421875, + "rewards/accuracy_reward/std": 0.7622999548912048, + "rewards/ngram_similarity_reward/mean": 0.5580548048019409, + "rewards/ngram_similarity_reward/std": 0.36090996861457825, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 396.96875, + "completions/mean_terminated_length": 396.96875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.3871112105616469, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06704279780387878, + "learning_rate": 4.7524164696673035e-06, + "loss": 0.0149, + "num_tokens": 138019051.0, + "reward": 4.136058807373047, + "reward_std": 1.4905953407287598, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.698559045791626, + "rewards/ngram_similarity_reward/std": 0.30539950728416443, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 419.5, + "completions/mean_terminated_length": 419.5, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.38755873797270085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0700206458568573, + "learning_rate": 4.75165673482079e-06, + "loss": 0.0044, + "num_tokens": 138205643.0, + "reward": 1.98746919631958, + "reward_std": 0.6994311809539795, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7062191367149353, + "rewards/ngram_similarity_reward/std": 0.14535358548164368, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 431.546875, + "completions/mean_terminated_length": 431.546875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.38800626538375477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07644589245319366, + "learning_rate": 4.750895904179191e-06, + "loss": 0.0037, + "num_tokens": 138355630.0, + "reward": 4.456112384796143, + "reward_std": 1.1625944375991821, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4561123549938202, + "rewards/ngram_similarity_reward/std": 0.2667543292045593, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 428.0625, + "completions/mean_terminated_length": 428.0625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3884537927948087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07240907847881317, + "learning_rate": 4.75013397815902e-06, + "loss": -0.0122, + "num_tokens": 138484930.0, + "reward": 4.688074111938477, + "reward_std": 0.14621832966804504, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.688073992729187, + "rewards/ngram_similarity_reward/std": 0.2483687698841095, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 377.15625, + "completions/mean_terminated_length": 377.15625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3889013202058626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09912554919719696, + "learning_rate": 4.7493709571773875e-06, + "loss": 0.0051, + "num_tokens": 138620940.0, + "reward": 3.0040464401245117, + "reward_std": 0.5850521326065063, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.1496646404266357, + "rewards/ngram_similarity_reward/mean": 0.5196714401245117, + "rewards/ngram_similarity_reward/std": 0.3500573933124542, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 339.40625, + "completions/mean_terminated_length": 339.40625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3893488476169165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09624762088060379, + "learning_rate": 4.7486068416520065e-06, + "loss": -0.0155, + "num_tokens": 138746134.0, + "reward": 3.2631068229675293, + "reward_std": 0.6305399537086487, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5756068825721741, + "rewards/ngram_similarity_reward/std": 0.3386700749397278, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 497.53125, + "completions/mean_terminated_length": 497.53125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.3897963750279705, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06503531336784363, + "learning_rate": 4.747841632001186e-06, + "loss": -0.0295, + "num_tokens": 138942824.0, + "reward": 1.393243432044983, + "reward_std": 0.48505699634552, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.29949337244033813, + "rewards/ngram_similarity_reward/std": 0.18441572785377502, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 435.046875, + "completions/mean_terminated_length": 435.046875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.3902439024390244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07496998459100723, + "learning_rate": 4.747075328643837e-06, + "loss": 0.0195, + "num_tokens": 139082443.0, + "reward": 3.0874123573303223, + "reward_std": 1.6260262727737427, + "rewards/accuracy_reward/mean": 2.296875, + "rewards/accuracy_reward/std": 3.0351366996765137, + "rewards/ngram_similarity_reward/mean": 0.7905375361442566, + "rewards/ngram_similarity_reward/std": 0.26953014731407166, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 396.09375, + "completions/mean_terminated_length": 396.09375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3906914298500783, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05977340787649155, + "learning_rate": 4.7463079319994665e-06, + "loss": -0.0123, + "num_tokens": 139232753.0, + "reward": 3.3932254314422607, + "reward_std": 0.6104599237442017, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.7213504314422607, + "rewards/ngram_similarity_reward/std": 0.3697623908519745, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 475.265625, + "completions/mean_terminated_length": 475.265625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.39113895726113224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06421991437673569, + "learning_rate": 4.745539442488181e-06, + "loss": -0.0158, + "num_tokens": 139369218.0, + "reward": 2.3920938968658447, + "reward_std": 1.329725742340088, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5483440160751343, + "rewards/ngram_similarity_reward/std": 0.29731571674346924, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 533.234375, + "completions/mean_terminated_length": 533.234375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.39158648467218615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07584026455879211, + "learning_rate": 4.744769860530687e-06, + "loss": 0.0287, + "num_tokens": 139521953.0, + "reward": 1.8337633609771729, + "reward_std": 0.9680390357971191, + "rewards/accuracy_reward/mean": 1.390625, + "rewards/accuracy_reward/std": 3.019078016281128, + "rewards/ngram_similarity_reward/mean": 0.4431384205818176, + "rewards/ngram_similarity_reward/std": 0.19377842545509338, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 473.1875, + "completions/mean_terminated_length": 473.1875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.3920340120832401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06925900280475616, + "learning_rate": 4.743999186548286e-06, + "loss": -0.0045, + "num_tokens": 139659837.0, + "reward": 5.710826396942139, + "reward_std": 1.1054452657699585, + "rewards/accuracy_reward/mean": 5.109375, + "rewards/accuracy_reward/std": 1.5287425518035889, + "rewards/ngram_similarity_reward/mean": 0.6014513969421387, + "rewards/ngram_similarity_reward/std": 0.3219076991081238, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 478.171875, + "completions/mean_terminated_length": 478.171875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.39248153949429404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07373322546482086, + "learning_rate": 4.74322742096288e-06, + "loss": -0.0055, + "num_tokens": 139868232.0, + "reward": 4.700196266174316, + "reward_std": 1.3519628047943115, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.32519641518592834, + "rewards/ngram_similarity_reward/std": 0.20448847115039825, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 530.640625, + "completions/mean_terminated_length": 530.640625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.39292906690534796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05321519076824188, + "learning_rate": 4.742454564196966e-06, + "loss": -0.0018, + "num_tokens": 140030065.0, + "reward": 4.054103851318359, + "reward_std": 1.2656118869781494, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.6322291493415833, + "rewards/ngram_similarity_reward/std": 0.31668195128440857, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 442.890625, + "completions/mean_terminated_length": 442.890625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.3933765943164019, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06103889271616936, + "learning_rate": 4.741680616673642e-06, + "loss": 0.0291, + "num_tokens": 140214026.0, + "reward": 3.1103811264038086, + "reward_std": 0.4770812392234802, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7041311860084534, + "rewards/ngram_similarity_reward/std": 0.3329959213733673, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 439.609375, + "completions/mean_terminated_length": 439.609375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.3938241217274558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0781811773777008, + "learning_rate": 4.740905578816599e-06, + "loss": 0.0171, + "num_tokens": 140370721.0, + "reward": 2.5304312705993652, + "reward_std": 1.2637858390808105, + "rewards/accuracy_reward/mean": 1.921875, + "rewards/accuracy_reward/std": 2.9857051372528076, + "rewards/ngram_similarity_reward/mean": 0.6085561513900757, + "rewards/ngram_similarity_reward/std": 0.31637707352638245, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 381.265625, + "completions/mean_terminated_length": 381.265625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.39427164913850976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09314892441034317, + "learning_rate": 4.740129451050129e-06, + "loss": -0.0046, + "num_tokens": 140518082.0, + "reward": 4.54716682434082, + "reward_std": 1.0344082117080688, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.8284169435501099, + "rewards/ngram_similarity_reward/std": 0.23733720183372498, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 382.46875, + "completions/mean_terminated_length": 382.46875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.3947191765495637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07349873334169388, + "learning_rate": 4.739352233799116e-06, + "loss": 0.011, + "num_tokens": 140657392.0, + "reward": 3.518798828125, + "reward_std": 1.0502201318740845, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6437988877296448, + "rewards/ngram_similarity_reward/std": 0.2269400656223297, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 389.671875, + "completions/mean_terminated_length": 389.671875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3951667039606176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08306004852056503, + "learning_rate": 4.7385739274890444e-06, + "loss": 0.0067, + "num_tokens": 140779611.0, + "reward": 1.7087817192077637, + "reward_std": 1.6573550701141357, + "rewards/accuracy_reward/mean": 1.109375, + "rewards/accuracy_reward/std": 2.7809853553771973, + "rewards/ngram_similarity_reward/mean": 0.5994066596031189, + "rewards/ngram_similarity_reward/std": 0.20897532999515533, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 479.703125, + "completions/mean_terminated_length": 479.703125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3956142313716715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06382640451192856, + "learning_rate": 4.737794532545994e-06, + "loss": 0.0178, + "num_tokens": 140924744.0, + "reward": 4.415198802947998, + "reward_std": 0.7818960547447205, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6964489221572876, + "rewards/ngram_similarity_reward/std": 0.3221212327480316, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 502.859375, + "completions/mean_terminated_length": 502.859375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.3960617587827254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06667076796293259, + "learning_rate": 4.737014049396639e-06, + "loss": -0.0226, + "num_tokens": 141082239.0, + "reward": 1.4672009944915771, + "reward_std": 0.5684460401535034, + "rewards/accuracy_reward/mean": 1.171875, + "rewards/accuracy_reward/std": 2.7316761016845703, + "rewards/ngram_similarity_reward/mean": 0.2953259348869324, + "rewards/ngram_similarity_reward/std": 0.1921965628862381, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 403.53125, + "completions/mean_terminated_length": 403.53125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.39650928619377934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07745262235403061, + "learning_rate": 4.736232478468249e-06, + "loss": -0.0128, + "num_tokens": 141191393.0, + "reward": 1.348705768585205, + "reward_std": 0.20715712010860443, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.3643309473991394, + "rewards/ngram_similarity_reward/std": 0.2165614366531372, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 351.15625, + "completions/mean_terminated_length": 351.15625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.3969568136048333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.104799285531044, + "learning_rate": 4.735449820188693e-06, + "loss": 0.046, + "num_tokens": 141304011.0, + "reward": 3.7475767135620117, + "reward_std": 2.1090166568756104, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.4038265347480774, + "rewards/ngram_similarity_reward/std": 0.25956711173057556, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 411.765625, + "completions/mean_terminated_length": 411.765625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.39740434101588723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07789560407400131, + "learning_rate": 4.73466607498643e-06, + "loss": -0.0069, + "num_tokens": 141415644.0, + "reward": 5.63222599029541, + "reward_std": 0.7778723239898682, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.41347628831863403, + "rewards/ngram_similarity_reward/std": 0.28358733654022217, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 418.25, + "completions/mean_terminated_length": 418.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.39785186842694115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08741523325443268, + "learning_rate": 4.73388124329052e-06, + "loss": 0.0285, + "num_tokens": 141587308.0, + "reward": 3.2023792266845703, + "reward_std": 0.5806549787521362, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6086291074752808, + "rewards/ngram_similarity_reward/std": 0.27053776383399963, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 392.71875, + "completions/mean_terminated_length": 392.71875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.39829939583799506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07778114080429077, + "learning_rate": 4.7330953255306114e-06, + "loss": 0.0013, + "num_tokens": 141727178.0, + "reward": 2.7171683311462402, + "reward_std": 1.6601142883300781, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.592168390750885, + "rewards/ngram_similarity_reward/std": 0.32917869091033936, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 476.765625, + "completions/mean_terminated_length": 476.765625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.398746923249049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06503872573375702, + "learning_rate": 4.732308322136951e-06, + "loss": 0.0005, + "num_tokens": 141879595.0, + "reward": 3.591559886932373, + "reward_std": 1.2463946342468262, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4353097677230835, + "rewards/ngram_similarity_reward/std": 0.21791474521160126, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 480.890625, + "completions/mean_terminated_length": 480.890625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.39919445066010295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07314197719097137, + "learning_rate": 4.7315202335403794e-06, + "loss": 0.031, + "num_tokens": 142044292.0, + "reward": 4.573571681976318, + "reward_std": 0.5166336297988892, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6673218607902527, + "rewards/ngram_similarity_reward/std": 0.35875454545021057, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 435.046875, + "completions/mean_terminated_length": 435.046875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.39964197807115687, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07555290311574936, + "learning_rate": 4.730731060172331e-06, + "loss": -0.0136, + "num_tokens": 142215895.0, + "reward": 0.5935059189796448, + "reward_std": 0.8443008661270142, + "rewards/accuracy_reward/mean": -0.140625, + "rewards/accuracy_reward/std": 1.473223328590393, + "rewards/ngram_similarity_reward/mean": 0.7341309189796448, + "rewards/ngram_similarity_reward/std": 0.284699022769928, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 516.21875, + "completions/mean_terminated_length": 516.21875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.4000895054822108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0662178173661232, + "learning_rate": 4.7299408024648345e-06, + "loss": 0.0232, + "num_tokens": 142362757.0, + "reward": 6.127585411071777, + "reward_std": 0.2145819067955017, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6275853514671326, + "rewards/ngram_similarity_reward/std": 0.23748208582401276, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 569.21875, + "completions/mean_terminated_length": 569.21875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.4005370328932647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.062257200479507446, + "learning_rate": 4.729149460850512e-06, + "loss": 0.0213, + "num_tokens": 142544371.0, + "reward": 2.992208957672119, + "reward_std": 1.5508846044540405, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.49220892786979675, + "rewards/ngram_similarity_reward/std": 0.2310357689857483, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 436.6875, + "completions/mean_terminated_length": 436.6875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.4009845603043186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0728151872754097, + "learning_rate": 4.728357035762577e-06, + "loss": -0.0219, + "num_tokens": 142679503.0, + "reward": 3.1466708183288574, + "reward_std": 0.1647435575723648, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6466709971427917, + "rewards/ngram_similarity_reward/std": 0.294294536113739, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 420.0, + "completions/mean_terminated_length": 420.0, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.4014320877153726, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0714784786105156, + "learning_rate": 4.727563527634839e-06, + "loss": -0.0301, + "num_tokens": 142814111.0, + "reward": 3.757340908050537, + "reward_std": 1.262383222579956, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.6167159080505371, + "rewards/ngram_similarity_reward/std": 0.37762531638145447, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 387.90625, + "completions/mean_terminated_length": 387.90625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4018796151264265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09113533049821854, + "learning_rate": 4.7267689369017e-06, + "loss": -0.0, + "num_tokens": 143032425.0, + "reward": 5.159881114959717, + "reward_std": 1.1793248653411865, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7848809957504272, + "rewards/ngram_similarity_reward/std": 0.3087009787559509, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 543.875, + "completions/mean_terminated_length": 543.875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.4023271425374804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06111174076795578, + "learning_rate": 4.725973263998154e-06, + "loss": 0.0037, + "num_tokens": 143185457.0, + "reward": 6.120242118835449, + "reward_std": 0.2244957983493805, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6202424764633179, + "rewards/ngram_similarity_reward/std": 0.2629983127117157, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 457.0625, + "completions/mean_terminated_length": 457.0625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.40277466994853434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07256443798542023, + "learning_rate": 4.725176509359784e-06, + "loss": -0.008, + "num_tokens": 143354309.0, + "reward": 3.8044281005859375, + "reward_std": 1.7351446151733398, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.648178219795227, + "rewards/ngram_similarity_reward/std": 0.25434863567352295, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 513.1875, + "completions/mean_terminated_length": 513.1875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.40322219735958825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05711884796619415, + "learning_rate": 4.7243786734227745e-06, + "loss": -0.0348, + "num_tokens": 143517345.0, + "reward": 2.1598706245422363, + "reward_std": 1.3489621877670288, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.5036205053329468, + "rewards/ngram_similarity_reward/std": 0.2906215190887451, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 435.140625, + "completions/mean_terminated_length": 435.140625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4036697247706422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07961972057819366, + "learning_rate": 4.72357975662389e-06, + "loss": -0.0097, + "num_tokens": 143646346.0, + "reward": 2.9096150398254395, + "reward_std": 0.5758260488510132, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5971152186393738, + "rewards/ngram_similarity_reward/std": 0.16422002017498016, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 413.5625, + "completions/mean_terminated_length": 387.61907958984375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.40411725218169614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09119610488414764, + "learning_rate": 4.722779759400499e-06, + "loss": -0.0105, + "num_tokens": 143785374.0, + "reward": 2.8944010734558105, + "reward_std": 2.1692235469818115, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.0272817611694336, + "rewards/ngram_similarity_reward/mean": 0.6912758350372314, + "rewards/ngram_similarity_reward/std": 0.4539448022842407, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 464.125, + "completions/mean_terminated_length": 464.125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.40456477959275006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058656852692365646, + "learning_rate": 4.721978682190549e-06, + "loss": 0.0133, + "num_tokens": 143932070.0, + "reward": 5.176098823547363, + "reward_std": 1.2019221782684326, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.8010987043380737, + "rewards/ngram_similarity_reward/std": 0.353232204914093, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 426.0625, + "completions/mean_terminated_length": 426.0625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.405012307003804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07713035494089127, + "learning_rate": 4.721176525432588e-06, + "loss": -0.0042, + "num_tokens": 144071882.0, + "reward": 2.6432571411132812, + "reward_std": 1.379880428314209, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.7995070219039917, + "rewards/ngram_similarity_reward/std": 0.20705363154411316, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 421.203125, + "completions/mean_terminated_length": 421.203125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4054598344148579, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06027873232960701, + "learning_rate": 4.720373289565753e-06, + "loss": -0.0172, + "num_tokens": 144180327.0, + "reward": 5.264594078063965, + "reward_std": 0.8011810183525085, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.8895940184593201, + "rewards/ngram_similarity_reward/std": 0.3584500551223755, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 530.984375, + "completions/mean_terminated_length": 530.984375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.40590736182591186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054708436131477356, + "learning_rate": 4.719568975029769e-06, + "loss": -0.0284, + "num_tokens": 144370614.0, + "reward": 4.3622941970825195, + "reward_std": 1.1988966464996338, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.8310438394546509, + "rewards/ngram_similarity_reward/std": 0.2436477094888687, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 363.265625, + "completions/mean_terminated_length": 363.265625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.4063548892369658, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09403359144926071, + "learning_rate": 4.718763582264954e-06, + "loss": 0.0102, + "num_tokens": 144493943.0, + "reward": 3.0666751861572266, + "reward_std": 0.14330953359603882, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.5823002457618713, + "rewards/ngram_similarity_reward/std": 0.22058938443660736, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 434.203125, + "completions/mean_terminated_length": 434.203125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.4068024166480197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06283697485923767, + "learning_rate": 4.7179571117122145e-06, + "loss": -0.0004, + "num_tokens": 144630804.0, + "reward": 1.8347517251968384, + "reward_std": 0.7851179838180542, + "rewards/accuracy_reward/mean": 1.265625, + "rewards/accuracy_reward/std": 2.775986671447754, + "rewards/ngram_similarity_reward/mean": 0.5691266059875488, + "rewards/ngram_similarity_reward/std": 0.2655256688594818, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 583.28125, + "completions/mean_terminated_length": 583.28125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4072499440590736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0670977532863617, + "learning_rate": 4.717149563813049e-06, + "loss": 0.0408, + "num_tokens": 144773222.0, + "reward": 4.14152717590332, + "reward_std": 1.1679092645645142, + "rewards/accuracy_reward/mean": 3.515625, + "rewards/accuracy_reward/std": 2.8646292686462402, + "rewards/ngram_similarity_reward/mean": 0.6259022355079651, + "rewards/ngram_similarity_reward/std": 0.2814485728740692, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 482.03125, + "completions/mean_terminated_length": 482.03125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.4076974714701275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07531841844320297, + "learning_rate": 4.716340939009544e-06, + "loss": -0.0059, + "num_tokens": 144910840.0, + "reward": 3.2331812381744385, + "reward_std": 0.5628366470336914, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6394312381744385, + "rewards/ngram_similarity_reward/std": 0.27154573798179626, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 390.734375, + "completions/mean_terminated_length": 390.734375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.4081449988811815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08654443919658661, + "learning_rate": 4.715531237744377e-06, + "loss": 0.0477, + "num_tokens": 145079527.0, + "reward": 1.4686279296875, + "reward_std": 0.15591643750667572, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.48425301909446716, + "rewards/ngram_similarity_reward/std": 0.23591560125350952, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 446.140625, + "completions/mean_terminated_length": 446.140625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.4085925262922354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07434502989053726, + "learning_rate": 4.714720460460814e-06, + "loss": 0.0123, + "num_tokens": 145252720.0, + "reward": 3.0408496856689453, + "reward_std": 0.1428745537996292, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5408496260643005, + "rewards/ngram_similarity_reward/std": 0.20481747388839722, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 430.984375, + "completions/mean_terminated_length": 430.984375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.40904005370328933, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07099854946136475, + "learning_rate": 4.713908607602712e-06, + "loss": -0.0142, + "num_tokens": 145423663.0, + "reward": 3.6085739135742188, + "reward_std": 1.2402714490890503, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4523240327835083, + "rewards/ngram_similarity_reward/std": 0.22499507665634155, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 401.21875, + "completions/mean_terminated_length": 401.21875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.40948758111434325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07884711027145386, + "learning_rate": 4.71309567961451e-06, + "loss": -0.0077, + "num_tokens": 145603309.0, + "reward": 3.2621984481811523, + "reward_std": 0.4593298137187958, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6684484481811523, + "rewards/ngram_similarity_reward/std": 0.283232718706131, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 408.40625, + "completions/mean_terminated_length": 408.40625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.40993510852539716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07480478286743164, + "learning_rate": 4.712281676941246e-06, + "loss": 0.0063, + "num_tokens": 145756311.0, + "reward": 4.161160469055176, + "reward_std": 0.7893478274345398, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.5361604690551758, + "rewards/ngram_similarity_reward/std": 0.35466256737709045, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 460.734375, + "completions/mean_terminated_length": 460.734375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.41038263593645113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06919507682323456, + "learning_rate": 4.711466600028538e-06, + "loss": 0.0027, + "num_tokens": 145902550.0, + "reward": 3.319559097290039, + "reward_std": 1.3462203741073608, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6320590972900391, + "rewards/ngram_similarity_reward/std": 0.1879875659942627, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 400.984375, + "completions/mean_terminated_length": 400.984375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.41083016334750505, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06792223453521729, + "learning_rate": 4.710650449322595e-06, + "loss": -0.0116, + "num_tokens": 146033989.0, + "reward": 4.589241981506348, + "reward_std": 0.4721425175666809, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6829920411109924, + "rewards/ngram_similarity_reward/std": 0.28800156712532043, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 466.828125, + "completions/mean_terminated_length": 466.828125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.41127769075855897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07291481643915176, + "learning_rate": 4.709833225270215e-06, + "loss": -0.0149, + "num_tokens": 146223274.0, + "reward": 4.248568058013916, + "reward_std": 0.7303920388221741, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6235682368278503, + "rewards/ngram_similarity_reward/std": 0.2175399512052536, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 474.5625, + "completions/mean_terminated_length": 474.5625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.4117252181696129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09087973088026047, + "learning_rate": 4.709014928318783e-06, + "loss": -0.0214, + "num_tokens": 146420542.0, + "reward": 3.0612213611602783, + "reward_std": 1.0413897037506104, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.4674713611602783, + "rewards/ngram_similarity_reward/std": 0.29778701066970825, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 357.28125, + "completions/mean_terminated_length": 357.28125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.4121727455806668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07861248403787613, + "learning_rate": 4.708195558916269e-06, + "loss": 0.0205, + "num_tokens": 146552192.0, + "reward": 3.3277878761291504, + "reward_std": 0.6478875279426575, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6402877569198608, + "rewards/ngram_similarity_reward/std": 0.35690537095069885, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 376.953125, + "completions/mean_terminated_length": 376.953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4126202729917207, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07197709381580353, + "learning_rate": 4.707375117511233e-06, + "loss": -0.0047, + "num_tokens": 146721805.0, + "reward": 1.8992624282836914, + "reward_std": 0.7386025190353394, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.5242624282836914, + "rewards/ngram_similarity_reward/std": 0.38837382197380066, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 375.203125, + "completions/mean_terminated_length": 375.203125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.4130678004027747, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08947842568159103, + "learning_rate": 4.70655360455282e-06, + "loss": 0.001, + "num_tokens": 146859530.0, + "reward": 4.687874794006348, + "reward_std": 1.5216972827911377, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.4066246747970581, + "rewards/ngram_similarity_reward/std": 0.287428081035614, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 558.484375, + "completions/mean_terminated_length": 558.484375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.4135153278138286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06963956356048584, + "learning_rate": 4.705731020490763e-06, + "loss": -0.0101, + "num_tokens": 147005209.0, + "reward": 1.4135024547576904, + "reward_std": 0.17173431813716888, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.42912742495536804, + "rewards/ngram_similarity_reward/std": 0.34089791774749756, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 396.0625, + "completions/mean_terminated_length": 396.0625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.4139628552248825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07255294173955917, + "learning_rate": 4.70490736577538e-06, + "loss": 0.0061, + "num_tokens": 147153325.0, + "reward": 3.5313873291015625, + "reward_std": 2.2151896953582764, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7501375675201416, + "rewards/ngram_similarity_reward/std": 0.32853633165359497, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 467.46875, + "completions/mean_terminated_length": 467.46875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.41441038263593644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06902709603309631, + "learning_rate": 4.704082640857578e-06, + "loss": -0.0027, + "num_tokens": 147271787.0, + "reward": 3.3249874114990234, + "reward_std": 0.1639261543750763, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.8249874114990234, + "rewards/ngram_similarity_reward/std": 0.23711428046226501, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 447.34375, + "completions/mean_terminated_length": 447.34375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.41485791004699035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.066554494202137, + "learning_rate": 4.703256846188846e-06, + "loss": 0.0197, + "num_tokens": 147425857.0, + "reward": 2.882577657699585, + "reward_std": 1.545878291130066, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.663827657699585, + "rewards/ngram_similarity_reward/std": 0.33302900195121765, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 448.53125, + "completions/mean_terminated_length": 448.53125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.4153054374580443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07156302034854889, + "learning_rate": 4.70242998222126e-06, + "loss": 0.001, + "num_tokens": 147555107.0, + "reward": 4.492798328399658, + "reward_std": 0.12431110441684723, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.49279868602752686, + "rewards/ngram_similarity_reward/std": 0.32165202498435974, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 476.046875, + "completions/mean_terminated_length": 476.046875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.41575296486909824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0779896154999733, + "learning_rate": 4.701602049407482e-06, + "loss": 0.0273, + "num_tokens": 147775574.0, + "reward": 5.030037879943848, + "reward_std": 0.7520781755447388, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.6550377607345581, + "rewards/ngram_similarity_reward/std": 0.22018156945705414, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 461.484375, + "completions/mean_terminated_length": 461.484375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.41620049228015216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07115507125854492, + "learning_rate": 4.70077304820076e-06, + "loss": -0.0071, + "num_tokens": 147918117.0, + "reward": 4.60894775390625, + "reward_std": 1.4901347160339355, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7964481711387634, + "rewards/ngram_similarity_reward/std": 0.3186900317668915, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 399.90625, + "completions/mean_terminated_length": 399.90625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.4166480196912061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07521829009056091, + "learning_rate": 4.699942979054926e-06, + "loss": 0.0065, + "num_tokens": 148077247.0, + "reward": 5.061558723449707, + "reward_std": 1.702772855758667, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.8740589022636414, + "rewards/ngram_similarity_reward/std": 0.30254870653152466, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 433.140625, + "completions/mean_terminated_length": 433.140625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.41709554710226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06832011044025421, + "learning_rate": 4.699111842424394e-06, + "loss": -0.0374, + "num_tokens": 148223560.0, + "reward": 5.83868408203125, + "reward_std": 0.5857259035110474, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.4480590522289276, + "rewards/ngram_similarity_reward/std": 0.27246713638305664, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 436.328125, + "completions/mean_terminated_length": 436.328125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.41754307451331396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07530754804611206, + "learning_rate": 4.698279638764167e-06, + "loss": 0.0037, + "num_tokens": 148405437.0, + "reward": 4.639106750488281, + "reward_std": 0.6088700294494629, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.8266069293022156, + "rewards/ngram_similarity_reward/std": 0.20045363903045654, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 388.921875, + "completions/mean_terminated_length": 388.921875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.4179906019243679, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0640043392777443, + "learning_rate": 4.697446368529829e-06, + "loss": 0.0264, + "num_tokens": 148569240.0, + "reward": 3.4495487213134766, + "reward_std": 1.641690731048584, + "rewards/accuracy_reward/mean": 2.75, + "rewards/accuracy_reward/std": 3.0498504638671875, + "rewards/ngram_similarity_reward/mean": 0.6995489001274109, + "rewards/ngram_similarity_reward/std": 0.3327275514602661, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 506.71875, + "completions/mean_terminated_length": 506.71875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.4184381293354218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055351290851831436, + "learning_rate": 4.69661203217755e-06, + "loss": -0.0101, + "num_tokens": 148767350.0, + "reward": 4.562273025512695, + "reward_std": 0.49909937381744385, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.6716479659080505, + "rewards/ngram_similarity_reward/std": 0.2990396022796631, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 357.046875, + "completions/mean_terminated_length": 357.046875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.4188856567464757, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09700652211904526, + "learning_rate": 4.6957766301640814e-06, + "loss": 0.005, + "num_tokens": 148934665.0, + "reward": 4.540606498718262, + "reward_std": 1.3007328510284424, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6343559622764587, + "rewards/ngram_similarity_reward/std": 0.36280736327171326, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 431.671875, + "completions/mean_terminated_length": 431.671875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.4193331841575296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08109112828969955, + "learning_rate": 4.694940162946759e-06, + "loss": 0.0015, + "num_tokens": 149098900.0, + "reward": 2.9900810718536377, + "reward_std": 0.4328707456588745, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.396331250667572, + "rewards/ngram_similarity_reward/std": 0.2602551281452179, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 512.09375, + "completions/mean_terminated_length": 512.09375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.4197807115685836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07237434387207031, + "learning_rate": 4.694102630983502e-06, + "loss": 0.0048, + "num_tokens": 149249674.0, + "reward": 0.04293042793869972, + "reward_std": 0.33587461709976196, + "rewards/accuracy_reward/mean": -0.609375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/ngram_similarity_reward/mean": 0.6523054242134094, + "rewards/ngram_similarity_reward/std": 0.19416570663452148, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 443.796875, + "completions/mean_terminated_length": 443.796875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.4202282389796375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0778619572520256, + "learning_rate": 4.6932640347328125e-06, + "loss": -0.0313, + "num_tokens": 149402525.0, + "reward": 4.294356346130371, + "reward_std": 1.9737029075622559, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.763106107711792, + "rewards/ngram_similarity_reward/std": 0.29600730538368225, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 487.125, + "completions/mean_terminated_length": 487.125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.42067576639069143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07317737489938736, + "learning_rate": 4.692424374653774e-06, + "loss": 0.0081, + "num_tokens": 149633157.0, + "reward": 1.4493913650512695, + "reward_std": 0.44835755228996277, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.5431413650512695, + "rewards/ngram_similarity_reward/std": 0.3096190392971039, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 354.609375, + "completions/mean_terminated_length": 354.609375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.42112329380174535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09393345564603806, + "learning_rate": 4.691583651206055e-06, + "loss": -0.0068, + "num_tokens": 149765692.0, + "reward": 3.0908572673797607, + "reward_std": 1.6652133464813232, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.7783571481704712, + "rewards/ngram_similarity_reward/std": 0.3887231647968292, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 321.640625, + "completions/mean_terminated_length": 321.640625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.42157082121279926, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1032625362277031, + "learning_rate": 4.6907418648499045e-06, + "loss": -0.0107, + "num_tokens": 149893029.0, + "reward": 4.834690093994141, + "reward_std": 0.12957683205604553, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8346905708312988, + "rewards/ngram_similarity_reward/std": 0.2965867519378662, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 473.359375, + "completions/mean_terminated_length": 473.359375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.42201834862385323, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06951655447483063, + "learning_rate": 4.689899016046152e-06, + "loss": 0.0189, + "num_tokens": 150042204.0, + "reward": 4.776795864105225, + "reward_std": 1.9362866878509521, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5892958641052246, + "rewards/ngram_similarity_reward/std": 0.34720170497894287, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 525.59375, + "completions/mean_terminated_length": 525.59375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.42246587603490715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07009086012840271, + "learning_rate": 4.689055105256212e-06, + "loss": -0.002, + "num_tokens": 150219410.0, + "reward": 1.7767894268035889, + "reward_std": 0.6507170796394348, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5892894864082336, + "rewards/ngram_similarity_reward/std": 0.26303830742836, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 364.5625, + "completions/mean_terminated_length": 364.5625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.42291340344596107, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07398659735918045, + "learning_rate": 4.688210132942076e-06, + "loss": -0.0034, + "num_tokens": 150360694.0, + "reward": 6.416528701782227, + "reward_std": 0.16398948431015015, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9165284037590027, + "rewards/ngram_similarity_reward/std": 0.29204583168029785, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 512.015625, + "completions/mean_terminated_length": 512.015625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.423360930857015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07332911342382431, + "learning_rate": 4.687364099566321e-06, + "loss": -0.0114, + "num_tokens": 150493399.0, + "reward": 4.729446887969971, + "reward_std": 0.18728002905845642, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7294468283653259, + "rewards/ngram_similarity_reward/std": 0.3016957938671112, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 440.40625, + "completions/mean_terminated_length": 440.40625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.4238084582680689, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07343576103448868, + "learning_rate": 4.686517005592102e-06, + "loss": -0.0129, + "num_tokens": 150703185.0, + "reward": 3.696603298187256, + "reward_std": 1.21620512008667, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7278531789779663, + "rewards/ngram_similarity_reward/std": 0.3106238543987274, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 539.71875, + "completions/mean_terminated_length": 539.71875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.42425598567912287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07591954618692398, + "learning_rate": 4.6856688514831566e-06, + "loss": 0.0079, + "num_tokens": 150857951.0, + "reward": 1.913421869277954, + "reward_std": 1.941478967666626, + "rewards/accuracy_reward/mean": 1.421875, + "rewards/accuracy_reward/std": 2.880171298980713, + "rewards/ngram_similarity_reward/mean": 0.49154698848724365, + "rewards/ngram_similarity_reward/std": 0.2773827016353607, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 399.890625, + "completions/mean_terminated_length": 399.890625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.4247035130901768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08750013262033463, + "learning_rate": 4.684819637703801e-06, + "loss": 0.0123, + "num_tokens": 150955320.0, + "reward": 4.495274543762207, + "reward_std": 0.1956636607646942, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4952741265296936, + "rewards/ngram_similarity_reward/std": 0.3157603144645691, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 453.078125, + "completions/mean_terminated_length": 453.078125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4251510405012307, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06658633798360825, + "learning_rate": 4.683969364718932e-06, + "loss": 0.0164, + "num_tokens": 151121853.0, + "reward": 4.68372917175293, + "reward_std": 0.49302682280540466, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7774791717529297, + "rewards/ngram_similarity_reward/std": 0.29103362560272217, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 422.515625, + "completions/mean_terminated_length": 422.515625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4255985679122846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08575849235057831, + "learning_rate": 4.6831180329940265e-06, + "loss": 0.0187, + "num_tokens": 151268398.0, + "reward": 3.5084714889526367, + "reward_std": 1.4491386413574219, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.9147217869758606, + "rewards/ngram_similarity_reward/std": 0.289139986038208, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 475.828125, + "completions/mean_terminated_length": 475.828125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.42604609532333854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07986129075288773, + "learning_rate": 4.6822656429951415e-06, + "loss": -0.0008, + "num_tokens": 151419187.0, + "reward": 3.8873395919799805, + "reward_std": 1.240060806274414, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.44983938336372375, + "rewards/ngram_similarity_reward/std": 0.34512755274772644, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 470.96875, + "completions/mean_terminated_length": 470.96875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.4264936227343925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07536032795906067, + "learning_rate": 4.681412195188913e-06, + "loss": -0.0214, + "num_tokens": 151561681.0, + "reward": 4.1212921142578125, + "reward_std": 1.463741660118103, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.8712917566299438, + "rewards/ngram_similarity_reward/std": 0.2256009578704834, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 406.40625, + "completions/mean_terminated_length": 406.40625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.4269411501454464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08135781437158585, + "learning_rate": 4.680557690042555e-06, + "loss": -0.0006, + "num_tokens": 151730619.0, + "reward": 5.661190032958984, + "reward_std": 1.377014398574829, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.9111900329589844, + "rewards/ngram_similarity_reward/std": 0.2595444917678833, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 389.6875, + "completions/mean_terminated_length": 389.6875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.42738867755650034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09303267300128937, + "learning_rate": 4.679702128023862e-06, + "loss": 0.007, + "num_tokens": 151887047.0, + "reward": 3.0402135848999023, + "reward_std": 1.1083359718322754, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.6652137041091919, + "rewards/ngram_similarity_reward/std": 0.3403913974761963, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 378.5625, + "completions/mean_terminated_length": 378.5625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.42783620496755426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08461212366819382, + "learning_rate": 4.678845509601207e-06, + "loss": 0.01, + "num_tokens": 152050427.0, + "reward": 5.111605644226074, + "reward_std": 1.624394178390503, + "rewards/accuracy_reward/mean": 4.453125, + "rewards/accuracy_reward/std": 2.319206953048706, + "rewards/ngram_similarity_reward/mean": 0.6584810018539429, + "rewards/ngram_similarity_reward/std": 0.29018649458885193, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 462.71875, + "completions/mean_terminated_length": 462.71875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.4282837323786082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08391781896352768, + "learning_rate": 4.677987835243539e-06, + "loss": -0.0126, + "num_tokens": 152210937.0, + "reward": 2.565617084503174, + "reward_std": 1.311779260635376, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.111638069152832, + "rewards/ngram_similarity_reward/mean": 0.5499922037124634, + "rewards/ngram_similarity_reward/std": 0.3418310284614563, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 444.328125, + "completions/mean_terminated_length": 444.328125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.4287312597896621, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07344631850719452, + "learning_rate": 4.677129105420387e-06, + "loss": -0.0588, + "num_tokens": 152396702.0, + "reward": 3.783191204071045, + "reward_std": 1.4082891941070557, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.5644412040710449, + "rewards/ngram_similarity_reward/std": 0.20864370465278625, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 497.796875, + "completions/mean_terminated_length": 497.796875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.42917878720071606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06695375591516495, + "learning_rate": 4.6762693206018585e-06, + "loss": -0.0034, + "num_tokens": 152605905.0, + "reward": 0.6230151653289795, + "reward_std": 1.6565158367156982, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.5605151653289795, + "rewards/ngram_similarity_reward/std": 0.36507290601730347, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 569.859375, + "completions/mean_terminated_length": 497.1639099121094, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.42962631461177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0528508760035038, + "learning_rate": 4.675408481258637e-06, + "loss": -0.0834, + "num_tokens": 152764936.0, + "reward": 4.562169075012207, + "reward_std": 0.52070552110672, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.671544075012207, + "rewards/ngram_similarity_reward/std": 0.32839202880859375, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 508.265625, + "completions/mean_terminated_length": 508.265625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.4300738420228239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06275948137044907, + "learning_rate": 4.674546587861985e-06, + "loss": 0.0134, + "num_tokens": 152953705.0, + "reward": 4.307257175445557, + "reward_std": 1.8037517070770264, + "rewards/accuracy_reward/mean": 3.515625, + "rewards/accuracy_reward/std": 2.8646292686462402, + "rewards/ngram_similarity_reward/mean": 0.7916322350502014, + "rewards/ngram_similarity_reward/std": 0.32915112376213074, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 363.96875, + "completions/mean_terminated_length": 363.96875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4305213694338778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08565182983875275, + "learning_rate": 4.67368364088374e-06, + "loss": -0.0126, + "num_tokens": 153143223.0, + "reward": 2.904425621032715, + "reward_std": 0.7966715097427368, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6856756806373596, + "rewards/ngram_similarity_reward/std": 0.2889115810394287, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 469.171875, + "completions/mean_terminated_length": 469.171875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.4309688968449317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07181283831596375, + "learning_rate": 4.6728196407963165e-06, + "loss": 0.0015, + "num_tokens": 153286130.0, + "reward": 4.65209436416626, + "reward_std": 1.5053563117980957, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6520941853523254, + "rewards/ngram_similarity_reward/std": 0.3621197044849396, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 447.671875, + "completions/mean_terminated_length": 447.671875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.4314164242559857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09691634774208069, + "learning_rate": 4.671954588072706e-06, + "loss": -0.0244, + "num_tokens": 153407661.0, + "reward": 3.785538673400879, + "reward_std": 1.360426902770996, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5355386734008789, + "rewards/ngram_similarity_reward/std": 0.19325245916843414, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 524.9375, + "completions/mean_terminated_length": 524.9375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.4318639516670396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06560735404491425, + "learning_rate": 4.671088483186478e-06, + "loss": -0.0118, + "num_tokens": 153565225.0, + "reward": 4.425906658172607, + "reward_std": 0.6114445328712463, + "rewards/accuracy_reward/mean": 3.734375, + "rewards/accuracy_reward/std": 2.969379186630249, + "rewards/ngram_similarity_reward/mean": 0.6915316581726074, + "rewards/ngram_similarity_reward/std": 0.37951141595840454, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 424.03125, + "completions/mean_terminated_length": 424.03125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.43231147907809353, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.080103300511837, + "learning_rate": 4.670221326611777e-06, + "loss": 0.0107, + "num_tokens": 153686011.0, + "reward": 4.409912109375, + "reward_std": 0.8295788168907166, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7849124073982239, + "rewards/ngram_similarity_reward/std": 0.2863464951515198, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 448.25, + "completions/mean_terminated_length": 448.25, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.43275900648914745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08570236712694168, + "learning_rate": 4.6693531188233195e-06, + "loss": 0.0356, + "num_tokens": 153882811.0, + "reward": 5.434075355529785, + "reward_std": 1.3801506757736206, + "rewards/accuracy_reward/mean": 4.734375, + "rewards/accuracy_reward/std": 2.04506516456604, + "rewards/ngram_similarity_reward/mean": 0.6996999382972717, + "rewards/ngram_similarity_reward/std": 0.3544958233833313, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 449.515625, + "completions/mean_terminated_length": 449.515625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.43320653390020136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07121328264474869, + "learning_rate": 4.668483860296405e-06, + "loss": 0.0139, + "num_tokens": 154036332.0, + "reward": 4.247105598449707, + "reward_std": 1.1307997703552246, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.7158555388450623, + "rewards/ngram_similarity_reward/std": 0.3953235149383545, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 474.28125, + "completions/mean_terminated_length": 474.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.43365406131125533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07322057336568832, + "learning_rate": 4.667613551506901e-06, + "loss": -0.0341, + "num_tokens": 154164830.0, + "reward": 3.941260576248169, + "reward_std": 0.8153356909751892, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.4100106358528137, + "rewards/ngram_similarity_reward/std": 0.2338978499174118, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 558.0625, + "completions/mean_terminated_length": 558.0625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.43410158872230925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06403569877147675, + "learning_rate": 4.666742192931252e-06, + "loss": 0.0144, + "num_tokens": 154337666.0, + "reward": 6.299419403076172, + "reward_std": 0.09389565885066986, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7994194626808167, + "rewards/ngram_similarity_reward/std": 0.17963354289531708, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 394.890625, + "completions/mean_terminated_length": 394.890625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.43454911613336317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08646032214164734, + "learning_rate": 4.665869785046481e-06, + "loss": 0.0464, + "num_tokens": 154472635.0, + "reward": 2.6723878383636475, + "reward_std": 1.3516616821289062, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4536377191543579, + "rewards/ngram_similarity_reward/std": 0.26994383335113525, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 435.046875, + "completions/mean_terminated_length": 435.046875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4349966435444171, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08001197129487991, + "learning_rate": 4.664996328330181e-06, + "loss": 0.0315, + "num_tokens": 154654654.0, + "reward": 2.80649471282959, + "reward_std": 0.6137222647666931, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.126309394836426, + "rewards/ngram_similarity_reward/mean": 0.49399474263191223, + "rewards/ngram_similarity_reward/std": 0.3272567093372345, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 455.8125, + "completions/mean_terminated_length": 455.8125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.435444170955471, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0852704867720604, + "learning_rate": 4.66412182326052e-06, + "loss": -0.0294, + "num_tokens": 154786994.0, + "reward": 3.906424045562744, + "reward_std": 1.2724534273147583, + "rewards/accuracy_reward/mean": 3.40625, + "rewards/accuracy_reward/std": 2.920745372772217, + "rewards/ngram_similarity_reward/mean": 0.5001741051673889, + "rewards/ngram_similarity_reward/std": 0.36396166682243347, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 423.453125, + "completions/mean_terminated_length": 423.453125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.43589169836652497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08183480054140091, + "learning_rate": 4.663246270316243e-06, + "loss": 0.026, + "num_tokens": 154937359.0, + "reward": 3.4259989261627197, + "reward_std": 0.7067916989326477, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6447486877441406, + "rewards/ngram_similarity_reward/std": 0.29538872838020325, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 441.4375, + "completions/mean_terminated_length": 441.4375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.4363392257775789, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07977086305618286, + "learning_rate": 4.662369669976663e-06, + "loss": -0.0062, + "num_tokens": 155121643.0, + "reward": 2.9232845306396484, + "reward_std": 0.5824289321899414, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6107844710350037, + "rewards/ngram_similarity_reward/std": 0.18239139020442963, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 419.3125, + "completions/mean_terminated_length": 419.3125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.4367867531886328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08666278421878815, + "learning_rate": 4.661492022721672e-06, + "loss": 0.0297, + "num_tokens": 155272319.0, + "reward": 1.7772903442382812, + "reward_std": 1.3866727352142334, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.496040403842926, + "rewards/ngram_similarity_reward/std": 0.1825767606496811, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 550.171875, + "completions/mean_terminated_length": 550.171875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.4372342805996867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06390845775604248, + "learning_rate": 4.660613329031733e-06, + "loss": -0.0196, + "num_tokens": 155429210.0, + "reward": 5.810588836669922, + "reward_std": 0.7339605093002319, + "rewards/accuracy_reward/mean": 5.28125, + "rewards/accuracy_reward/std": 1.227576732635498, + "rewards/ngram_similarity_reward/mean": 0.5293385982513428, + "rewards/ngram_similarity_reward/std": 0.27086836099624634, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 440.84375, + "completions/mean_terminated_length": 440.84375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.43768180801074064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07718484103679657, + "learning_rate": 4.6597335893878795e-06, + "loss": 0.0143, + "num_tokens": 155596400.0, + "reward": 4.276413917541504, + "reward_std": 0.46639174222946167, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.37016376852989197, + "rewards/ngram_similarity_reward/std": 0.30691027641296387, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 503.40625, + "completions/mean_terminated_length": 503.40625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.4381293354217946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06703373044729233, + "learning_rate": 4.6588528042717204e-06, + "loss": -0.0112, + "num_tokens": 155720618.0, + "reward": 4.702371120452881, + "reward_std": 1.5032615661621094, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.5148714184761047, + "rewards/ngram_similarity_reward/std": 0.2904004454612732, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 576.984375, + "completions/mean_terminated_length": 576.984375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.4385768628328485, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05834938958287239, + "learning_rate": 4.657970974165438e-06, + "loss": 0.0058, + "num_tokens": 155861065.0, + "reward": 3.6889820098876953, + "reward_std": 0.888426661491394, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6264820098876953, + "rewards/ngram_similarity_reward/std": 0.18658064305782318, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 473.0625, + "completions/mean_terminated_length": 473.0625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.43902439024390244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06484103947877884, + "learning_rate": 4.6570880995517835e-06, + "loss": 0.0014, + "num_tokens": 156040989.0, + "reward": 4.436846733093262, + "reward_std": 0.5330394506454468, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5305963754653931, + "rewards/ngram_similarity_reward/std": 0.3162103593349457, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 502.78125, + "completions/mean_terminated_length": 502.78125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.43947191765495636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06592470407485962, + "learning_rate": 4.656204180914082e-06, + "loss": 0.0361, + "num_tokens": 156183343.0, + "reward": 4.8697404861450195, + "reward_std": 1.4862189292907715, + "rewards/accuracy_reward/mean": 4.34375, + "rewards/accuracy_reward/std": 2.4314002990722656, + "rewards/ngram_similarity_reward/mean": 0.5259901285171509, + "rewards/ngram_similarity_reward/std": 0.2956460118293762, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 583.34375, + "completions/mean_terminated_length": 583.34375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4399194450660103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059540845453739166, + "learning_rate": 4.655319218736229e-06, + "loss": 0.0079, + "num_tokens": 156341733.0, + "reward": 3.7660622596740723, + "reward_std": 1.3947157859802246, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7973122000694275, + "rewards/ngram_similarity_reward/std": 0.28380465507507324, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 376.90625, + "completions/mean_terminated_length": 376.90625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.44036697247706424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09604540467262268, + "learning_rate": 4.654433213502691e-06, + "loss": -0.0188, + "num_tokens": 156451903.0, + "reward": 5.35115909576416, + "reward_std": 0.9664373993873596, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6949087977409363, + "rewards/ngram_similarity_reward/std": 0.3469395935535431, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 436.890625, + "completions/mean_terminated_length": 436.890625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.44081449988811816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08597588539123535, + "learning_rate": 4.653546165698508e-06, + "loss": 0.0151, + "num_tokens": 156678072.0, + "reward": 5.017669200897217, + "reward_std": 0.8949160575866699, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.4551692605018616, + "rewards/ngram_similarity_reward/std": 0.2327612191438675, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 419.03125, + "completions/mean_terminated_length": 419.03125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4412620272991721, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08380001038312912, + "learning_rate": 4.652658075809289e-06, + "loss": -0.001, + "num_tokens": 156833658.0, + "reward": 1.934064507484436, + "reward_std": 1.6516462564468384, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.46531447768211365, + "rewards/ngram_similarity_reward/std": 0.20871390402317047, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 319.515625, + "completions/mean_terminated_length": 319.515625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.441709554710226, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07714425027370453, + "learning_rate": 4.651768944321212e-06, + "loss": -0.0181, + "num_tokens": 156956747.0, + "reward": 5.239560127258301, + "reward_std": 1.2817769050598145, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.8645603656768799, + "rewards/ngram_similarity_reward/std": 0.3636215925216675, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 497.078125, + "completions/mean_terminated_length": 497.078125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.4421570821212799, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05398174375295639, + "learning_rate": 4.650878771721028e-06, + "loss": 0.0073, + "num_tokens": 157094496.0, + "reward": 4.853190898895264, + "reward_std": 0.1744401752948761, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8531908988952637, + "rewards/ngram_similarity_reward/std": 0.20298880338668823, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 389.234375, + "completions/mean_terminated_length": 389.234375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.4426046095323339, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09710188210010529, + "learning_rate": 4.649987558496056e-06, + "loss": -0.0053, + "num_tokens": 157235439.0, + "reward": 5.915307998657227, + "reward_std": 1.104044795036316, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.7903082370758057, + "rewards/ngram_similarity_reward/std": 0.3099243938922882, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 407.921875, + "completions/mean_terminated_length": 407.921875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.4430521369433878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10357832908630371, + "learning_rate": 4.649095305134186e-06, + "loss": 0.0067, + "num_tokens": 157474378.0, + "reward": 1.24918532371521, + "reward_std": 1.3662900924682617, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.71793532371521, + "rewards/ngram_similarity_reward/std": 0.2971068024635315, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 626.25, + "completions/mean_terminated_length": 626.25, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.4434996643544417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048259276896715164, + "learning_rate": 4.648202012123875e-06, + "loss": 0.0095, + "num_tokens": 157588394.0, + "reward": 4.237071990966797, + "reward_std": 0.8555949926376343, + "rewards/accuracy_reward/mean": 3.796875, + "rewards/accuracy_reward/std": 2.746886730194092, + "rewards/ngram_similarity_reward/mean": 0.4401967227458954, + "rewards/ngram_similarity_reward/std": 0.20750321447849274, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 501.796875, + "completions/mean_terminated_length": 501.796875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.44394719176549563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07116010785102844, + "learning_rate": 4.647307679954155e-06, + "loss": 0.0502, + "num_tokens": 157721405.0, + "reward": 4.5370049476623535, + "reward_std": 0.7521318793296814, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.9120049476623535, + "rewards/ngram_similarity_reward/std": 0.13152669370174408, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 391.984375, + "completions/mean_terminated_length": 391.984375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.44439471917654955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09845888614654541, + "learning_rate": 4.646412309114618e-06, + "loss": -0.0157, + "num_tokens": 157885708.0, + "reward": 4.514744758605957, + "reward_std": 0.8385021686553955, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5147448182106018, + "rewards/ngram_similarity_reward/std": 0.26765215396881104, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 446.546875, + "completions/mean_terminated_length": 446.546875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.4448422465876035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07439401745796204, + "learning_rate": 4.645515900095432e-06, + "loss": -0.0023, + "num_tokens": 158054975.0, + "reward": 3.3904080390930176, + "reward_std": 1.1648640632629395, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5154081583023071, + "rewards/ngram_similarity_reward/std": 0.28233492374420166, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 457.5625, + "completions/mean_terminated_length": 457.5625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.44528977399865743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07856795191764832, + "learning_rate": 4.6446184533873315e-06, + "loss": -0.0044, + "num_tokens": 158187299.0, + "reward": 4.359292984008789, + "reward_std": 1.1848640441894531, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7342928647994995, + "rewards/ngram_similarity_reward/std": 0.3862909972667694, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 538.0625, + "completions/mean_terminated_length": 538.0625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.44573730140971135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0661824569106102, + "learning_rate": 4.643719969481616e-06, + "loss": -0.0411, + "num_tokens": 158339079.0, + "reward": 2.278085470199585, + "reward_std": 1.0091297626495361, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 3.039423704147339, + "rewards/ngram_similarity_reward/mean": 0.5280854105949402, + "rewards/ngram_similarity_reward/std": 0.2838883399963379, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 368.09375, + "completions/mean_terminated_length": 368.09375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.44618482882076527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09019217640161514, + "learning_rate": 4.642820448870158e-06, + "loss": -0.0171, + "num_tokens": 158464269.0, + "reward": 2.937518835067749, + "reward_std": 0.8208221793174744, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.4531439542770386, + "rewards/ngram_similarity_reward/std": 0.2831995189189911, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 474.5625, + "completions/mean_terminated_length": 474.5625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.4466323562318192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08046316355466843, + "learning_rate": 4.641919892045393e-06, + "loss": -0.0133, + "num_tokens": 158589153.0, + "reward": 2.7983765602111816, + "reward_std": 0.7150464653968811, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.0860671997070312, + "rewards/ngram_similarity_reward/mean": 0.5483765006065369, + "rewards/ngram_similarity_reward/std": 0.31970739364624023, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 541.28125, + "completions/mean_terminated_length": 541.28125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.4470798836428731, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0517701581120491, + "learning_rate": 4.641018299500324e-06, + "loss": 0.0304, + "num_tokens": 158717219.0, + "reward": 4.679778099060059, + "reward_std": 0.1837746500968933, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6797779202461243, + "rewards/ngram_similarity_reward/std": 0.29137223958969116, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 435.109375, + "completions/mean_terminated_length": 435.109375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.44752741105392707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08018787950277328, + "learning_rate": 4.640115671728527e-06, + "loss": -0.0115, + "num_tokens": 158882922.0, + "reward": 4.042120456695557, + "reward_std": 1.1525325775146484, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.3233703374862671, + "rewards/ngram_similarity_reward/std": 0.1967499703168869, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 518.046875, + "completions/mean_terminated_length": 518.046875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.447974938464981, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06527865678071976, + "learning_rate": 4.639212009224135e-06, + "loss": 0.0079, + "num_tokens": 159055997.0, + "reward": 3.4171600341796875, + "reward_std": 0.47780343890190125, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8234100341796875, + "rewards/ngram_similarity_reward/std": 0.2530917525291443, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 401.171875, + "completions/mean_terminated_length": 401.171875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4484224658760349, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08057080209255219, + "learning_rate": 4.638307312481856e-06, + "loss": -0.0164, + "num_tokens": 159218520.0, + "reward": 6.217398643493652, + "reward_std": 0.13007661700248718, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7173987627029419, + "rewards/ngram_similarity_reward/std": 0.21581734716892242, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 424.640625, + "completions/mean_terminated_length": 424.640625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.4488699932870888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08946868032217026, + "learning_rate": 4.637401581996961e-06, + "loss": 0.0141, + "num_tokens": 159366369.0, + "reward": 3.845090866088867, + "reward_std": 1.2513779401779175, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6888412237167358, + "rewards/ngram_similarity_reward/std": 0.29713091254234314, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 347.984375, + "completions/mean_terminated_length": 347.984375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.44931752069814274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09654480218887329, + "learning_rate": 4.636494818265284e-06, + "loss": -0.0059, + "num_tokens": 159502880.0, + "reward": 2.747676134109497, + "reward_std": 1.8483915328979492, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7164261341094971, + "rewards/ngram_similarity_reward/std": 0.2764227092266083, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 390.171875, + "completions/mean_terminated_length": 390.171875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4497650481091967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07983547449111938, + "learning_rate": 4.63558702178323e-06, + "loss": -0.0131, + "num_tokens": 159688955.0, + "reward": 3.516596555709839, + "reward_std": 0.8062986731529236, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.547846794128418, + "rewards/ngram_similarity_reward/std": 0.21161513030529022, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 359.359375, + "completions/mean_terminated_length": 359.359375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.4502125755202506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09040001779794693, + "learning_rate": 4.634678193047765e-06, + "loss": 0.0201, + "num_tokens": 159850962.0, + "reward": 4.5764336585998535, + "reward_std": 1.0174980163574219, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7639338970184326, + "rewards/ngram_similarity_reward/std": 0.2882556617259979, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 407.59375, + "completions/mean_terminated_length": 407.59375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.45066010293130454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08074507862329483, + "learning_rate": 4.633768332556424e-06, + "loss": -0.0025, + "num_tokens": 159981528.0, + "reward": 3.310504674911499, + "reward_std": 1.0501964092254639, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6230047941207886, + "rewards/ngram_similarity_reward/std": 0.3178018629550934, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 416.125, + "completions/mean_terminated_length": 390.2222595214844, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.45110763034235846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10865113884210587, + "learning_rate": 4.632857440807303e-06, + "loss": -0.0083, + "num_tokens": 160158640.0, + "reward": 0.19361451268196106, + "reward_std": 0.8938025832176208, + "rewards/accuracy_reward/mean": -0.328125, + "rewards/accuracy_reward/std": 1.0624125003814697, + "rewards/ngram_similarity_reward/mean": 0.5217394828796387, + "rewards/ngram_similarity_reward/std": 0.2538130581378937, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 372.3125, + "completions/mean_terminated_length": 372.3125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.4515551577534124, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08638826757669449, + "learning_rate": 4.631945518299064e-06, + "loss": 0.0259, + "num_tokens": 160397092.0, + "reward": 4.829348087310791, + "reward_std": 0.5159898400306702, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.9230982065200806, + "rewards/ngram_similarity_reward/std": 0.20585164427757263, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 399.828125, + "completions/mean_terminated_length": 399.828125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.45200268516446634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08075409382581711, + "learning_rate": 4.631032565530935e-06, + "loss": 0.013, + "num_tokens": 160570809.0, + "reward": 1.565451979637146, + "reward_std": 0.09725593030452728, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5654520988464355, + "rewards/ngram_similarity_reward/std": 0.22560539841651917, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 398.09375, + "completions/mean_terminated_length": 398.09375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.45245021257552026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13738077878952026, + "learning_rate": 4.630118583002706e-06, + "loss": -0.0013, + "num_tokens": 160764959.0, + "reward": 3.276353359222412, + "reward_std": 1.9264037609100342, + "rewards/accuracy_reward/mean": 2.734375, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.5419784188270569, + "rewards/ngram_similarity_reward/std": 0.3680952787399292, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 657.671875, + "completions/mean_terminated_length": 657.671875, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.4528977399865742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05747548118233681, + "learning_rate": 4.629203571214732e-06, + "loss": 0.0019, + "num_tokens": 160941930.0, + "reward": 4.90988302230835, + "reward_std": 0.7813419699668884, + "rewards/accuracy_reward/mean": 4.265625, + "rewards/accuracy_reward/std": 2.467195510864258, + "rewards/ngram_similarity_reward/mean": 0.6442579030990601, + "rewards/ngram_similarity_reward/std": 0.20877647399902344, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 470.1875, + "completions/mean_terminated_length": 470.1875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.4533452673976281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09367240965366364, + "learning_rate": 4.628287530667929e-06, + "loss": 0.0205, + "num_tokens": 161089654.0, + "reward": 0.8358583450317383, + "reward_std": 0.8505191206932068, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.49210840463638306, + "rewards/ngram_similarity_reward/std": 0.35597845911979675, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 513.03125, + "completions/mean_terminated_length": 513.03125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.453792794808682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07150578498840332, + "learning_rate": 4.627370461863779e-06, + "loss": 0.0288, + "num_tokens": 161266152.0, + "reward": 4.312531471252441, + "reward_std": 0.986059844493866, + "rewards/accuracy_reward/mean": 3.515625, + "rewards/accuracy_reward/std": 2.8646292686462402, + "rewards/ngram_similarity_reward/mean": 0.7969064712524414, + "rewards/ngram_similarity_reward/std": 0.3095705509185791, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 401.265625, + "completions/mean_terminated_length": 401.265625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.454240322219736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08253169804811478, + "learning_rate": 4.626452365304327e-06, + "loss": 0.0023, + "num_tokens": 161433225.0, + "reward": 6.062775611877441, + "reward_std": 0.7393642663955688, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.844025731086731, + "rewards/ngram_similarity_reward/std": 0.2485669106245041, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 434.984375, + "completions/mean_terminated_length": 434.984375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.4546878496307899, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10803146660327911, + "learning_rate": 4.625533241492177e-06, + "loss": 0.0029, + "num_tokens": 161669704.0, + "reward": 3.029890537261963, + "reward_std": 2.1167008876800537, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.34239059686660767, + "rewards/ngram_similarity_reward/std": 0.14699290692806244, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 407.703125, + "completions/mean_terminated_length": 407.703125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4551353770418438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08884494006633759, + "learning_rate": 4.6246130909305e-06, + "loss": -0.0355, + "num_tokens": 161873925.0, + "reward": 4.0044450759887695, + "reward_std": 1.6529027223587036, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.785695493221283, + "rewards/ngram_similarity_reward/std": 0.3590853810310364, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 477.09375, + "completions/mean_terminated_length": 477.09375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.45558290445289773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06873611360788345, + "learning_rate": 4.623691914123025e-06, + "loss": -0.0072, + "num_tokens": 162011323.0, + "reward": 4.906051158905029, + "reward_std": 0.0931473970413208, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9060510396957397, + "rewards/ngram_similarity_reward/std": 0.19354379177093506, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 330.484375, + "completions/mean_terminated_length": 330.484375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.45603043186395165, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08067482709884644, + "learning_rate": 4.622769711574047e-06, + "loss": -0.0173, + "num_tokens": 162141162.0, + "reward": 3.4138293266296387, + "reward_std": 0.5554684996604919, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 1.0232044458389282, + "rewards/ngram_similarity_reward/std": 0.22352083027362823, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 368.375, + "completions/mean_terminated_length": 368.375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.4564779592750056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08370555937290192, + "learning_rate": 4.621846483788418e-06, + "loss": 0.0242, + "num_tokens": 162288434.0, + "reward": 4.566494464874268, + "reward_std": 0.437887579202652, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6602445840835571, + "rewards/ngram_similarity_reward/std": 0.2169688194990158, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 442.390625, + "completions/mean_terminated_length": 442.390625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.45692548668605953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08730597794055939, + "learning_rate": 4.620922231271555e-06, + "loss": 0.0725, + "num_tokens": 162469691.0, + "reward": 2.3871564865112305, + "reward_std": 0.8901073336601257, + "rewards/accuracy_reward/mean": 1.9375, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.44965660572052, + "rewards/ngram_similarity_reward/std": 0.21127961575984955, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 503.8125, + "completions/mean_terminated_length": 503.8125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.45737301409711345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07264433056116104, + "learning_rate": 4.619996954529436e-06, + "loss": -0.0433, + "num_tokens": 162668127.0, + "reward": 3.299483299255371, + "reward_std": 0.41848915815353394, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7057333588600159, + "rewards/ngram_similarity_reward/std": 0.21536371111869812, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 467.796875, + "completions/mean_terminated_length": 467.796875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.45782054150816737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0833204984664917, + "learning_rate": 4.619070654068595e-06, + "loss": -0.0056, + "num_tokens": 162825442.0, + "reward": 4.313222885131836, + "reward_std": 0.6884230375289917, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5007230043411255, + "rewards/ngram_similarity_reward/std": 0.2955898642539978, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 463.15625, + "completions/mean_terminated_length": 463.15625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.4582680689192213, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07459675520658493, + "learning_rate": 4.618143330396132e-06, + "loss": 0.0033, + "num_tokens": 162976252.0, + "reward": 2.2040796279907227, + "reward_std": 0.8834795355796814, + "rewards/accuracy_reward/mean": 1.640625, + "rewards/accuracy_reward/std": 2.91611385345459, + "rewards/ngram_similarity_reward/mean": 0.5634545683860779, + "rewards/ngram_similarity_reward/std": 0.24133865535259247, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 470.9375, + "completions/mean_terminated_length": 470.9375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.45871559633027525, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07935074716806412, + "learning_rate": 4.617214984019704e-06, + "loss": 0.0048, + "num_tokens": 163170760.0, + "reward": 3.609093189239502, + "reward_std": 0.8524174690246582, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.35909339785575867, + "rewards/ngram_similarity_reward/std": 0.259185254573822, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 512.203125, + "completions/mean_terminated_length": 487.825439453125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.45916312374132917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07026367634534836, + "learning_rate": 4.616285615447528e-06, + "loss": 0.029, + "num_tokens": 163342117.0, + "reward": 4.603669166564941, + "reward_std": 1.6760941743850708, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.6192945241928101, + "rewards/ngram_similarity_reward/std": 0.3728182911872864, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 412.0625, + "completions/mean_terminated_length": 412.0625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4596106511523831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08737986534833908, + "learning_rate": 4.615355225188383e-06, + "loss": -0.0044, + "num_tokens": 163482857.0, + "reward": 3.2743639945983887, + "reward_std": 0.7320421934127808, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.3993642330169678, + "rewards/ngram_similarity_reward/std": 0.23473213613033295, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 527.640625, + "completions/mean_terminated_length": 527.640625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.460058178563437, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05836676061153412, + "learning_rate": 4.6144238137516055e-06, + "loss": 0.0058, + "num_tokens": 163624178.0, + "reward": 4.261211395263672, + "reward_std": 1.2127914428710938, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6362113952636719, + "rewards/ngram_similarity_reward/std": 0.21961015462875366, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 481.46875, + "completions/mean_terminated_length": 481.46875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.4605057059744909, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08901166915893555, + "learning_rate": 4.613491381647089e-06, + "loss": -0.0366, + "num_tokens": 163788480.0, + "reward": 2.892777919769287, + "reward_std": 2.786778450012207, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.5021529197692871, + "rewards/ngram_similarity_reward/std": 0.19329185783863068, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 411.125, + "completions/mean_terminated_length": 411.125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.4609532333855449, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08067741245031357, + "learning_rate": 4.61255792938529e-06, + "loss": 0.0262, + "num_tokens": 163947400.0, + "reward": 3.001347064971924, + "reward_std": 2.00323486328125, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7825968861579895, + "rewards/ngram_similarity_reward/std": 0.2769053876399994, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 455.21875, + "completions/mean_terminated_length": 455.21875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.4614007607965988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06692025810480118, + "learning_rate": 4.611623457477221e-06, + "loss": 0.0132, + "num_tokens": 164085014.0, + "reward": 3.226724863052368, + "reward_std": 0.6595999598503113, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4454747140407562, + "rewards/ngram_similarity_reward/std": 0.2540774643421173, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 358.171875, + "completions/mean_terminated_length": 358.171875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.4618482882076527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1052207350730896, + "learning_rate": 4.610687966434451e-06, + "loss": -0.0117, + "num_tokens": 164198529.0, + "reward": 4.967963218688965, + "reward_std": 1.3966630697250366, + "rewards/accuracy_reward/mean": 4.234375, + "rewards/accuracy_reward/std": 2.662152051925659, + "rewards/ngram_similarity_reward/mean": 0.7335888147354126, + "rewards/ngram_similarity_reward/std": 0.434805691242218, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 384.359375, + "completions/mean_terminated_length": 384.359375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.46229581561870664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10399851948022842, + "learning_rate": 4.609751456769112e-06, + "loss": 0.0094, + "num_tokens": 164369320.0, + "reward": 3.617398262023926, + "reward_std": 1.3301640748977661, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.4611482620239258, + "rewards/ngram_similarity_reward/std": 0.3033144772052765, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 361.09375, + "completions/mean_terminated_length": 361.09375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.46274334302976056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09007919579744339, + "learning_rate": 4.60881392899389e-06, + "loss": 0.0161, + "num_tokens": 164494574.0, + "reward": 4.622521877288818, + "reward_std": 0.22130389511585236, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.622521698474884, + "rewards/ngram_similarity_reward/std": 0.32046768069267273, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 468.03125, + "completions/mean_terminated_length": 468.03125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.4631908704408145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09070774167776108, + "learning_rate": 4.607875383622028e-06, + "loss": 0.0035, + "num_tokens": 164724912.0, + "reward": 2.7548575401306152, + "reward_std": 1.8196587562561035, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7236075401306152, + "rewards/ngram_similarity_reward/std": 0.2541177272796631, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 438.015625, + "completions/mean_terminated_length": 438.015625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.46363839785186844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07595244795084, + "learning_rate": 4.606935821167327e-06, + "loss": 0.0049, + "num_tokens": 164883409.0, + "reward": 6.129752159118652, + "reward_std": 0.4484296143054962, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.7235023975372314, + "rewards/ngram_similarity_reward/std": 0.14641357958316803, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 472.3125, + "completions/mean_terminated_length": 472.3125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.46408592526292236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06311800330877304, + "learning_rate": 4.605995242144146e-06, + "loss": -0.0237, + "num_tokens": 165045765.0, + "reward": 4.421064853668213, + "reward_std": 2.2462971210479736, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7960647940635681, + "rewards/ngram_similarity_reward/std": 0.3977976143360138, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 466.0, + "completions/mean_terminated_length": 466.0, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.4645334526739763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07536507397890091, + "learning_rate": 4.605053647067399e-06, + "loss": 0.0244, + "num_tokens": 165210757.0, + "reward": 2.9640872478485107, + "reward_std": 0.7854170799255371, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7453370690345764, + "rewards/ngram_similarity_reward/std": 0.2581217586994171, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 409.75, + "completions/mean_terminated_length": 409.75, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.4649809800850302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06313635408878326, + "learning_rate": 4.6041110364525565e-06, + "loss": 0.0, + "num_tokens": 165397909.0, + "reward": 6.239960670471191, + "reward_std": 0.9716970920562744, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.9274606704711914, + "rewards/ngram_similarity_reward/std": 0.2883267104625702, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 492.453125, + "completions/mean_terminated_length": 492.453125, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.4654285074960841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07633720338344574, + "learning_rate": 4.603167410815645e-06, + "loss": -0.0282, + "num_tokens": 165553186.0, + "reward": 6.175039291381836, + "reward_std": 0.20186586678028107, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6750392913818359, + "rewards/ngram_similarity_reward/std": 0.30154407024383545, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 391.875, + "completions/mean_terminated_length": 391.875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.4658760349071381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07251864671707153, + "learning_rate": 4.602222770673246e-06, + "loss": 0.0544, + "num_tokens": 165677898.0, + "reward": 4.065640926361084, + "reward_std": 0.8149410486221313, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5343913435935974, + "rewards/ngram_similarity_reward/std": 0.28121185302734375, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 435.578125, + "completions/mean_terminated_length": 409.9841613769531, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.466323562318192, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06468340009450912, + "learning_rate": 4.601277116542498e-06, + "loss": 0.0504, + "num_tokens": 165820575.0, + "reward": 3.5892698764801025, + "reward_std": 0.6493726372718811, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.8080199956893921, + "rewards/ngram_similarity_reward/std": 0.3213406801223755, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 489.15625, + "completions/mean_terminated_length": 464.4127197265625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4667710897292459, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07755690813064575, + "learning_rate": 4.600330448941094e-06, + "loss": -0.0507, + "num_tokens": 166048185.0, + "reward": 3.6401398181915283, + "reward_std": 1.5206663608551025, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7651398777961731, + "rewards/ngram_similarity_reward/std": 0.3352813422679901, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 448.046875, + "completions/mean_terminated_length": 448.046875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.46721861714029983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08614441007375717, + "learning_rate": 4.599382768387282e-06, + "loss": 0.0319, + "num_tokens": 166185436.0, + "reward": 6.143418788909912, + "reward_std": 0.17175956070423126, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6434186697006226, + "rewards/ngram_similarity_reward/std": 0.3139803409576416, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 534.34375, + "completions/mean_terminated_length": 534.34375, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.46766614455135375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0726943388581276, + "learning_rate": 4.598434075399862e-06, + "loss": 0.0147, + "num_tokens": 166341106.0, + "reward": 1.8407868146896362, + "reward_std": 0.6296216249465942, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6532865762710571, + "rewards/ngram_similarity_reward/std": 0.19341208040714264, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 397.578125, + "completions/mean_terminated_length": 397.578125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.4681136719624077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07925833016633987, + "learning_rate": 4.597484370498193e-06, + "loss": 0.032, + "num_tokens": 166481751.0, + "reward": 6.3092122077941895, + "reward_std": 0.528394341468811, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.9029619693756104, + "rewards/ngram_similarity_reward/std": 0.269043505191803, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 593.984375, + "completions/mean_terminated_length": 593.984375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.46856119937346163, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06135343015193939, + "learning_rate": 4.596533654202183e-06, + "loss": -0.016, + "num_tokens": 166653334.0, + "reward": 4.515833377838135, + "reward_std": 0.46059608459472656, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6095834374427795, + "rewards/ngram_similarity_reward/std": 0.35220563411712646, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 435.40625, + "completions/mean_terminated_length": 435.40625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.46900872678451555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08945198357105255, + "learning_rate": 4.595581927032296e-06, + "loss": -0.0074, + "num_tokens": 166823584.0, + "reward": 4.23256254196167, + "reward_std": 0.5420059561729431, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.42006272077560425, + "rewards/ngram_similarity_reward/std": 0.21719275414943695, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 568.578125, + "completions/mean_terminated_length": 568.578125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.46945625419556947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07008595019578934, + "learning_rate": 4.594629189509552e-06, + "loss": 0.022, + "num_tokens": 167063989.0, + "reward": 1.4306988716125488, + "reward_std": 0.5476556420326233, + "rewards/accuracy_reward/mean": 1.046875, + "rewards/accuracy_reward/std": 2.7076005935668945, + "rewards/ngram_similarity_reward/mean": 0.3838239908218384, + "rewards/ngram_similarity_reward/std": 0.24899296462535858, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 427.15625, + "completions/mean_terminated_length": 427.15625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.4699037816066234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07404383271932602, + "learning_rate": 4.59367544215552e-06, + "loss": 0.0081, + "num_tokens": 167206015.0, + "reward": 2.6730198860168457, + "reward_std": 0.8190189599990845, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5480199456214905, + "rewards/ngram_similarity_reward/std": 0.30111372470855713, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 516.421875, + "completions/mean_terminated_length": 516.421875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.47035130901767735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06461696326732635, + "learning_rate": 4.5927206854923214e-06, + "loss": -0.0275, + "num_tokens": 167358298.0, + "reward": 4.086188793182373, + "reward_std": 1.0312371253967285, + "rewards/accuracy_reward/mean": 3.390625, + "rewards/accuracy_reward/std": 2.944552183151245, + "rewards/ngram_similarity_reward/mean": 0.6955640316009521, + "rewards/ngram_similarity_reward/std": 0.29851576685905457, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 423.765625, + "completions/mean_terminated_length": 423.765625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.47079883642873127, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0901014432311058, + "learning_rate": 4.591764920042635e-06, + "loss": 0.0026, + "num_tokens": 167517691.0, + "reward": 4.31773567199707, + "reward_std": 0.7875552177429199, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6927354335784912, + "rewards/ngram_similarity_reward/std": 0.29080554842948914, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 440.171875, + "completions/mean_terminated_length": 440.171875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.4712463638397852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06958474963903427, + "learning_rate": 4.590808146329687e-06, + "loss": -0.0019, + "num_tokens": 167686438.0, + "reward": 4.321617603302002, + "reward_std": 0.8348760008811951, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6966177225112915, + "rewards/ngram_similarity_reward/std": 0.2306501567363739, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 398.9375, + "completions/mean_terminated_length": 398.9375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4716938912508391, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08572255074977875, + "learning_rate": 4.589850364877258e-06, + "loss": 0.0102, + "num_tokens": 167821554.0, + "reward": 5.629925727844238, + "reward_std": 1.23585844039917, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.7861757874488831, + "rewards/ngram_similarity_reward/std": 0.3273070156574249, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 373.59375, + "completions/mean_terminated_length": 373.59375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.472141418661893, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07060826569795609, + "learning_rate": 4.588891576209682e-06, + "loss": -0.0275, + "num_tokens": 167970728.0, + "reward": 3.7303578853607178, + "reward_std": 1.4070768356323242, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6678580641746521, + "rewards/ngram_similarity_reward/std": 0.2841942608356476, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 380.171875, + "completions/mean_terminated_length": 380.171875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.472588946072947, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08772917836904526, + "learning_rate": 4.587931780851838e-06, + "loss": -0.0164, + "num_tokens": 168108131.0, + "reward": 4.362936019897461, + "reward_std": 0.8165090084075928, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.8316863775253296, + "rewards/ngram_similarity_reward/std": 0.3584800958633423, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 395.703125, + "completions/mean_terminated_length": 395.703125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.4730364734840009, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07608242332935333, + "learning_rate": 4.586970979329163e-06, + "loss": 0.003, + "num_tokens": 168278864.0, + "reward": 5.808627128601074, + "reward_std": 0.8873310685157776, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.7773770689964294, + "rewards/ngram_similarity_reward/std": 0.38519734144210815, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 423.8125, + "completions/mean_terminated_length": 423.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.4734840008950548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0858198031783104, + "learning_rate": 4.586009172167642e-06, + "loss": 0.025, + "num_tokens": 168403908.0, + "reward": 4.685708522796631, + "reward_std": 0.6626394987106323, + "rewards/accuracy_reward/mean": 4.171875, + "rewards/accuracy_reward/std": 2.5326733589172363, + "rewards/ngram_similarity_reward/mean": 0.5138335227966309, + "rewards/ngram_similarity_reward/std": 0.2579622268676758, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 491.0625, + "completions/mean_terminated_length": 491.0625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.47393152830610874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05656769126653671, + "learning_rate": 4.58504635989381e-06, + "loss": -0.0338, + "num_tokens": 168586472.0, + "reward": 3.1912267208099365, + "reward_std": 2.096855401992798, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6912267804145813, + "rewards/ngram_similarity_reward/std": 0.4442880153656006, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 562.1875, + "completions/mean_terminated_length": 562.1875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.47437905571716266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06605823338031769, + "learning_rate": 4.584082543034751e-06, + "loss": 0.0002, + "num_tokens": 168722388.0, + "reward": 6.094749450683594, + "reward_std": 0.7265222072601318, + "rewards/accuracy_reward/mean": 5.28125, + "rewards/accuracy_reward/std": 1.227576732635498, + "rewards/ngram_similarity_reward/mean": 0.8134993314743042, + "rewards/ngram_similarity_reward/std": 0.23808826506137848, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 376.84375, + "completions/mean_terminated_length": 376.84375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.47482658312821663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08780424296855927, + "learning_rate": 4.583117722118104e-06, + "loss": 0.0191, + "num_tokens": 168874474.0, + "reward": 5.824024677276611, + "reward_std": 1.1161339282989502, + "rewards/accuracy_reward/mean": 5.203125, + "rewards/accuracy_reward/std": 1.3531819581985474, + "rewards/ngram_similarity_reward/mean": 0.6208996772766113, + "rewards/ngram_similarity_reward/std": 0.3165114223957062, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 340.25, + "completions/mean_terminated_length": 340.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.47527411053927054, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09583883732557297, + "learning_rate": 4.582151897672054e-06, + "loss": 0.018, + "num_tokens": 169064650.0, + "reward": 4.0373735427856445, + "reward_std": 1.3963215351104736, + "rewards/accuracy_reward/mean": 3.296875, + "rewards/accuracy_reward/std": 2.97171688079834, + "rewards/ngram_similarity_reward/mean": 0.7404987812042236, + "rewards/ngram_similarity_reward/std": 0.3559059500694275, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 442.640625, + "completions/mean_terminated_length": 442.640625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.47572163795032446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0804290696978569, + "learning_rate": 4.581185070225335e-06, + "loss": -0.0033, + "num_tokens": 169238163.0, + "reward": 1.472453236579895, + "reward_std": 0.11882346868515015, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.47245335578918457, + "rewards/ngram_similarity_reward/std": 0.15722110867500305, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 403.5625, + "completions/mean_terminated_length": 403.5625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.4761691653613784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1044817566871643, + "learning_rate": 4.5802172403072295e-06, + "loss": 0.0141, + "num_tokens": 169367575.0, + "reward": 2.712646722793579, + "reward_std": 1.088694453239441, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.4938966631889343, + "rewards/ngram_similarity_reward/std": 0.24072708189487457, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 403.59375, + "completions/mean_terminated_length": 403.59375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.4766166927724323, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1042836606502533, + "learning_rate": 4.579248408447573e-06, + "loss": 0.0357, + "num_tokens": 169584109.0, + "reward": 4.310004234313965, + "reward_std": 1.4007518291473389, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.4193789064884186, + "rewards/ngram_similarity_reward/std": 0.2808838486671448, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 462.578125, + "completions/mean_terminated_length": 462.578125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.47706422018348627, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06781932711601257, + "learning_rate": 4.578278575176745e-06, + "loss": -0.0018, + "num_tokens": 169706882.0, + "reward": 3.504133701324463, + "reward_std": 0.7482330799102783, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.722883939743042, + "rewards/ngram_similarity_reward/std": 0.3692457675933838, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 452.9375, + "completions/mean_terminated_length": 452.9375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.4775117475945402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1026446670293808, + "learning_rate": 4.577307741025676e-06, + "loss": -0.04, + "num_tokens": 169957998.0, + "reward": -0.06518572568893433, + "reward_std": 0.12394097447395325, + "rewards/accuracy_reward/mean": -0.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.4348142743110657, + "rewards/ngram_similarity_reward/std": 0.1759442389011383, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 387.375, + "completions/mean_terminated_length": 387.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.4779592750055941, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08201418817043304, + "learning_rate": 4.5763359065258424e-06, + "loss": -0.0178, + "num_tokens": 170062390.0, + "reward": 4.6426544189453125, + "reward_std": 0.1418628841638565, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6426544189453125, + "rewards/ngram_similarity_reward/std": 0.25741419196128845, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 518.0, + "completions/mean_terminated_length": 518.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.478406802416648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07468228787183762, + "learning_rate": 4.57536307220927e-06, + "loss": 0.0201, + "num_tokens": 170227590.0, + "reward": 1.8692747354507446, + "reward_std": 1.0762724876403809, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.494274765253067, + "rewards/ngram_similarity_reward/std": 0.24930661916732788, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 437.890625, + "completions/mean_terminated_length": 437.890625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.47885432982770193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10245736688375473, + "learning_rate": 4.574389238608531e-06, + "loss": 0.0663, + "num_tokens": 170398463.0, + "reward": 2.890944480895996, + "reward_std": 0.4334186911582947, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.4846946597099304, + "rewards/ngram_similarity_reward/std": 0.1906774491071701, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 403.921875, + "completions/mean_terminated_length": 403.921875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.47930185723875585, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07545072585344315, + "learning_rate": 4.573414406256743e-06, + "loss": 0.0008, + "num_tokens": 170536826.0, + "reward": 5.844933986663818, + "reward_std": 0.5657603740692139, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.5324341654777527, + "rewards/ngram_similarity_reward/std": 0.27465152740478516, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 476.65625, + "completions/mean_terminated_length": 451.7143249511719, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4797493846498098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08169378340244293, + "learning_rate": 4.572438575687576e-06, + "loss": -0.0067, + "num_tokens": 170660932.0, + "reward": 3.226865768432617, + "reward_std": 0.9055585861206055, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5393656492233276, + "rewards/ngram_similarity_reward/std": 0.2904561460018158, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 468.84375, + "completions/mean_terminated_length": 468.84375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.48019691206086373, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06678974628448486, + "learning_rate": 4.571461747435239e-06, + "loss": 0.0046, + "num_tokens": 170784650.0, + "reward": 4.711894989013672, + "reward_std": 0.4199894070625305, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.8056451678276062, + "rewards/ngram_similarity_reward/std": 0.27729812264442444, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 455.140625, + "completions/mean_terminated_length": 455.140625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.48064443947191765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0764194130897522, + "learning_rate": 4.570483922034493e-06, + "loss": -0.0194, + "num_tokens": 170965043.0, + "reward": 3.971428155899048, + "reward_std": 0.8636301755905151, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5339281558990479, + "rewards/ngram_similarity_reward/std": 0.31490033864974976, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 592.734375, + "completions/mean_terminated_length": 592.734375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.48109196688297157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07808054238557816, + "learning_rate": 4.569505100020642e-06, + "loss": 0.0146, + "num_tokens": 171111330.0, + "reward": 1.527955174446106, + "reward_std": 0.13413190841674805, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.543580174446106, + "rewards/ngram_similarity_reward/std": 0.12489572167396545, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 454.515625, + "completions/mean_terminated_length": 454.515625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.4815394942940255, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09512680768966675, + "learning_rate": 4.568525281929536e-06, + "loss": 0.0293, + "num_tokens": 171286947.0, + "reward": 5.5437164306640625, + "reward_std": 1.2956647872924805, + "rewards/accuracy_reward/mean": 4.921875, + "rewards/accuracy_reward/std": 1.8153201341629028, + "rewards/ngram_similarity_reward/mean": 0.6218414306640625, + "rewards/ngram_similarity_reward/std": 0.3024308979511261, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 468.671875, + "completions/mean_terminated_length": 468.671875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.48198702170507945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06600500643253326, + "learning_rate": 4.567544468297571e-06, + "loss": -0.0094, + "num_tokens": 171419294.0, + "reward": 3.1191608905792236, + "reward_std": 0.180575892329216, + "rewards/accuracy_reward/mean": 2.453125, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.6660360097885132, + "rewards/ngram_similarity_reward/std": 0.20528137683868408, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 485.046875, + "completions/mean_terminated_length": 485.046875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.48243454911613337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08958762884140015, + "learning_rate": 4.5665626596616875e-06, + "loss": 0.0076, + "num_tokens": 171635025.0, + "reward": 2.6711831092834473, + "reward_std": 0.5186856985092163, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.28055813908576965, + "rewards/ngram_similarity_reward/std": 0.17530040442943573, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 505.46875, + "completions/mean_terminated_length": 505.46875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.4828820765271873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10286291688680649, + "learning_rate": 4.565579856559371e-06, + "loss": -0.0058, + "num_tokens": 171867247.0, + "reward": 2.890352725982666, + "reward_std": 1.205926775932312, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.1477742195129395, + "rewards/ngram_similarity_reward/mean": 0.4997277557849884, + "rewards/ngram_similarity_reward/std": 0.30446815490722656, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 431.453125, + "completions/mean_terminated_length": 431.453125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.4833296039382412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08253061771392822, + "learning_rate": 4.564596059528651e-06, + "loss": -0.0061, + "num_tokens": 172012204.0, + "reward": 3.1154181957244873, + "reward_std": 0.16891822218894958, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6154181361198425, + "rewards/ngram_similarity_reward/std": 0.3033756911754608, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 438.65625, + "completions/mean_terminated_length": 438.65625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.4837771313492951, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07555454969406128, + "learning_rate": 4.563611269108101e-06, + "loss": -0.0071, + "num_tokens": 172151270.0, + "reward": 6.350188255310059, + "reward_std": 0.07179703563451767, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8501883745193481, + "rewards/ngram_similarity_reward/std": 0.4043208360671997, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 434.296875, + "completions/mean_terminated_length": 434.296875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.4842246587603491, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0908069908618927, + "learning_rate": 4.562625485836839e-06, + "loss": 0.0665, + "num_tokens": 172310953.0, + "reward": 4.73192024230957, + "reward_std": 1.1997625827789307, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7319202423095703, + "rewards/ngram_similarity_reward/std": 0.36777517199516296, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 536.03125, + "completions/mean_terminated_length": 536.03125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.484672186171403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06916855275630951, + "learning_rate": 4.561638710254526e-06, + "loss": 0.0375, + "num_tokens": 172481963.0, + "reward": 3.339456558227539, + "reward_std": 0.7031046748161316, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5582062602043152, + "rewards/ngram_similarity_reward/std": 0.2942054569721222, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 415.1875, + "completions/mean_terminated_length": 415.1875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4851197135824569, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09040789306163788, + "learning_rate": 4.560650942901367e-06, + "loss": 0.0317, + "num_tokens": 172630167.0, + "reward": 3.6213231086730957, + "reward_std": 1.5545053482055664, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7463233470916748, + "rewards/ngram_similarity_reward/std": 0.2703729271888733, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 454.953125, + "completions/mean_terminated_length": 454.953125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.48556724099351084, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07023082673549652, + "learning_rate": 4.55966218431811e-06, + "loss": 0.0116, + "num_tokens": 172749300.0, + "reward": 3.944469928741455, + "reward_std": 0.8143153190612793, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6007199287414551, + "rewards/ngram_similarity_reward/std": 0.2684047520160675, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 465.9375, + "completions/mean_terminated_length": 465.9375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.48601476840456476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08287008106708527, + "learning_rate": 4.558672435046042e-06, + "loss": -0.001, + "num_tokens": 172885648.0, + "reward": 2.8535852432250977, + "reward_std": 0.753909170627594, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6348351240158081, + "rewards/ngram_similarity_reward/std": 0.33657437562942505, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 443.75, + "completions/mean_terminated_length": 443.75, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.48646229581561873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09232994168996811, + "learning_rate": 4.557681695626998e-06, + "loss": 0.0107, + "num_tokens": 173050448.0, + "reward": 2.7262744903564453, + "reward_std": 1.1046934127807617, + "rewards/accuracy_reward/mean": 2.265625, + "rewards/accuracy_reward/std": 3.0692648887634277, + "rewards/ngram_similarity_reward/mean": 0.46064963936805725, + "rewards/ngram_similarity_reward/std": 0.1888895034790039, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 419.734375, + "completions/mean_terminated_length": 419.734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.48690982322667264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09107498824596405, + "learning_rate": 4.556689966603353e-06, + "loss": 0.0156, + "num_tokens": 173227503.0, + "reward": 4.152169704437256, + "reward_std": 0.8628009557723999, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6209197044372559, + "rewards/ngram_similarity_reward/std": 0.23462265729904175, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 429.109375, + "completions/mean_terminated_length": 429.109375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.48735735063772656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08090570569038391, + "learning_rate": 4.55569724851802e-06, + "loss": 0.0017, + "num_tokens": 173365062.0, + "reward": 3.8782854080200195, + "reward_std": 1.4964256286621094, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.7220356464385986, + "rewards/ngram_similarity_reward/std": 0.2286946028470993, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 414.78125, + "completions/mean_terminated_length": 414.78125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.4878048780487805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09512459486722946, + "learning_rate": 4.55470354191446e-06, + "loss": -0.0028, + "num_tokens": 173487208.0, + "reward": 4.085986614227295, + "reward_std": 1.3344218730926514, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.7422366142272949, + "rewards/ngram_similarity_reward/std": 0.3242091238498688, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 450.234375, + "completions/mean_terminated_length": 450.234375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.4882524054598344, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08115733414888382, + "learning_rate": 4.55370884733667e-06, + "loss": 0.0069, + "num_tokens": 173732951.0, + "reward": 3.8544161319732666, + "reward_std": 1.337068796157837, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6981660723686218, + "rewards/ngram_similarity_reward/std": 0.30572065711021423, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 389.890625, + "completions/mean_terminated_length": 389.890625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.48869993287088836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09762948006391525, + "learning_rate": 4.55271316532919e-06, + "loss": 0.0124, + "num_tokens": 173914096.0, + "reward": 2.8449859619140625, + "reward_std": 2.1369941234588623, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7199859619140625, + "rewards/ngram_similarity_reward/std": 0.38775435090065, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 416.4375, + "completions/mean_terminated_length": 416.4375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.4891474602819423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08745718747377396, + "learning_rate": 4.5517164964371e-06, + "loss": -0.0236, + "num_tokens": 174162316.0, + "reward": 3.639711380004883, + "reward_std": 1.6092393398284912, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6709613800048828, + "rewards/ngram_similarity_reward/std": 0.2689666748046875, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 409.0, + "completions/mean_terminated_length": 409.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.4895949876929962, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09560834616422653, + "learning_rate": 4.55071884120602e-06, + "loss": -0.0177, + "num_tokens": 174374860.0, + "reward": 3.9378693103790283, + "reward_std": 1.4928762912750244, + "rewards/accuracy_reward/mean": 3.3125, + "rewards/accuracy_reward/std": 2.948634386062622, + "rewards/ngram_similarity_reward/mean": 0.6253694295883179, + "rewards/ngram_similarity_reward/std": 0.2162037044763565, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 386.078125, + "completions/mean_terminated_length": 386.078125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4900425151040501, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09615163505077362, + "learning_rate": 4.549720200182112e-06, + "loss": -0.0073, + "num_tokens": 174522305.0, + "reward": 3.2679896354675293, + "reward_std": 0.21264588832855225, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7679896354675293, + "rewards/ngram_similarity_reward/std": 0.29604002833366394, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 533.375, + "completions/mean_terminated_length": 533.375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.49049004251510403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06156305968761444, + "learning_rate": 4.548720573912074e-06, + "loss": 0.0026, + "num_tokens": 174673961.0, + "reward": 3.1978039741516113, + "reward_std": 0.1528598964214325, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6978040337562561, + "rewards/ngram_similarity_reward/std": 0.2940730154514313, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 505.359375, + "completions/mean_terminated_length": 505.359375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.490937569926158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08782241493463516, + "learning_rate": 4.547719962943148e-06, + "loss": -0.0012, + "num_tokens": 174853888.0, + "reward": 2.947543144226074, + "reward_std": 0.18983094394207, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.1477742195129395, + "rewards/ngram_similarity_reward/mean": 0.5569181442260742, + "rewards/ngram_similarity_reward/std": 0.24300679564476013, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 415.203125, + "completions/mean_terminated_length": 415.203125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.4913850973372119, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09604466706514359, + "learning_rate": 4.5467183678231105e-06, + "loss": 0.0163, + "num_tokens": 174995565.0, + "reward": 6.056362152099609, + "reward_std": 0.5539984703063965, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6501119136810303, + "rewards/ngram_similarity_reward/std": 0.2691671550273895, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 434.75, + "completions/mean_terminated_length": 434.75, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.49183262474826583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0859757587313652, + "learning_rate": 4.545715789100279e-06, + "loss": -0.0052, + "num_tokens": 175108797.0, + "reward": 2.9753494262695312, + "reward_std": 0.16992181539535522, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.49097421765327454, + "rewards/ngram_similarity_reward/std": 0.13519920408725739, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 559.046875, + "completions/mean_terminated_length": 535.4127197265625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.49228015215931975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07323068380355835, + "learning_rate": 4.544712227323511e-06, + "loss": 0.004, + "num_tokens": 175291264.0, + "reward": 4.822459697723389, + "reward_std": 0.23598459362983704, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8224596977233887, + "rewards/ngram_similarity_reward/std": 0.28161683678627014, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 548.15625, + "completions/mean_terminated_length": 499.774169921875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.49272767957037367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061277225613594055, + "learning_rate": 4.543707683042199e-06, + "loss": 0.0882, + "num_tokens": 175450298.0, + "reward": 4.542122840881348, + "reward_std": 0.42735737562179565, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6358727216720581, + "rewards/ngram_similarity_reward/std": 0.20485639572143555, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 430.890625, + "completions/mean_terminated_length": 430.890625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.49317520698142764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09584963321685791, + "learning_rate": 4.542702156806273e-06, + "loss": -0.0052, + "num_tokens": 175616515.0, + "reward": 1.153327226638794, + "reward_std": 1.185215950012207, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 2.381934404373169, + "rewards/ngram_similarity_reward/mean": 0.5595772862434387, + "rewards/ngram_similarity_reward/std": 0.367759108543396, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 420.671875, + "completions/mean_terminated_length": 420.671875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.49362273439248155, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08076344430446625, + "learning_rate": 4.5416956491662055e-06, + "loss": 0.0055, + "num_tokens": 175744814.0, + "reward": 3.275195360183716, + "reward_std": 0.12107278406620026, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7751953601837158, + "rewards/ngram_similarity_reward/std": 0.236673966050148, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 438.390625, + "completions/mean_terminated_length": 438.390625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.49407026180353547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0785026103258133, + "learning_rate": 4.540688160673002e-06, + "loss": 0.005, + "num_tokens": 175897671.0, + "reward": 3.3170793056488037, + "reward_std": 0.9156270623207092, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5358291864395142, + "rewards/ngram_similarity_reward/std": 0.2915536165237427, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 573.75, + "completions/mean_terminated_length": 550.3492431640625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.4945177892145894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0661405697464943, + "learning_rate": 4.5396796918782055e-06, + "loss": 0.0107, + "num_tokens": 176075527.0, + "reward": 4.647940635681152, + "reward_std": 0.15474610030651093, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6479406356811523, + "rewards/ngram_similarity_reward/std": 0.2756750285625458, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 551.375, + "completions/mean_terminated_length": 551.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4949653166256433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06709405779838562, + "learning_rate": 4.538670243333897e-06, + "loss": 0.0234, + "num_tokens": 176289183.0, + "reward": 4.5249834060668945, + "reward_std": 0.40416163206100464, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.4312337338924408, + "rewards/ngram_similarity_reward/std": 0.2670634090900421, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 423.359375, + "completions/mean_terminated_length": 423.359375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.4954128440366973, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0584019236266613, + "learning_rate": 4.537659815592693e-06, + "loss": -0.0232, + "num_tokens": 176453782.0, + "reward": 4.692909240722656, + "reward_std": 0.0988813266158104, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6929092407226562, + "rewards/ngram_similarity_reward/std": 0.2770628333091736, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 410.421875, + "completions/mean_terminated_length": 410.421875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.4958603714477512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07954328507184982, + "learning_rate": 4.536648409207746e-06, + "loss": 0.025, + "num_tokens": 176600049.0, + "reward": 6.233366012573242, + "reward_std": 0.09150787442922592, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7333660125732422, + "rewards/ngram_similarity_reward/std": 0.11544132232666016, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 488.65625, + "completions/mean_terminated_length": 488.65625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.4963078988588051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08080743253231049, + "learning_rate": 4.535636024732745e-06, + "loss": 0.0072, + "num_tokens": 176777147.0, + "reward": 3.5629160404205322, + "reward_std": 1.4889570474624634, + "rewards/accuracy_reward/mean": 3.046875, + "rewards/accuracy_reward/std": 2.991680145263672, + "rewards/ngram_similarity_reward/mean": 0.5160409212112427, + "rewards/ngram_similarity_reward/std": 0.2812080681324005, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 560.234375, + "completions/mean_terminated_length": 560.234375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.496755426269859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06918640434741974, + "learning_rate": 4.534622662721912e-06, + "loss": -0.0248, + "num_tokens": 176935706.0, + "reward": 1.5414713621139526, + "reward_std": 0.12708443403244019, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5414713025093079, + "rewards/ngram_similarity_reward/std": 0.30945712327957153, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 360.796875, + "completions/mean_terminated_length": 360.796875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.49720295368091294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0921863541007042, + "learning_rate": 4.533608323730008e-06, + "loss": 0.0021, + "num_tokens": 177075405.0, + "reward": 4.40880823135376, + "reward_std": 0.9826715588569641, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6900581121444702, + "rewards/ngram_similarity_reward/std": 0.2976328730583191, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 407.046875, + "completions/mean_terminated_length": 407.046875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.49765048109196686, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07950065284967422, + "learning_rate": 4.532593008312326e-06, + "loss": 0.0296, + "num_tokens": 177204672.0, + "reward": 5.49090051651001, + "reward_std": 1.4403270483016968, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.740900456905365, + "rewards/ngram_similarity_reward/std": 0.2728201150894165, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 347.59375, + "completions/mean_terminated_length": 347.59375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.49809800850302083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08854083716869354, + "learning_rate": 4.5315767170246945e-06, + "loss": 0.0007, + "num_tokens": 177334950.0, + "reward": 4.268105506896973, + "reward_std": 1.4788068532943726, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.8306055068969727, + "rewards/ngram_similarity_reward/std": 0.2775154411792755, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 423.921875, + "completions/mean_terminated_length": 423.921875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.49854553591407474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10035360604524612, + "learning_rate": 4.530559450423477e-06, + "loss": 0.0388, + "num_tokens": 177478081.0, + "reward": 1.4108541011810303, + "reward_std": 0.08538447320461273, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4108540415763855, + "rewards/ngram_similarity_reward/std": 0.25501975417137146, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 539.90625, + "completions/mean_terminated_length": 539.90625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.49899306332512866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05983530730009079, + "learning_rate": 4.529541209065568e-06, + "loss": 0.0214, + "num_tokens": 177640107.0, + "reward": 4.336306571960449, + "reward_std": 0.9542237520217896, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6175564527511597, + "rewards/ngram_similarity_reward/std": 0.19533227384090424, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 420.296875, + "completions/mean_terminated_length": 420.296875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.4994405907361826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07837483286857605, + "learning_rate": 4.5285219935084e-06, + "loss": 0.0108, + "num_tokens": 177827518.0, + "reward": 5.884796142578125, + "reward_std": 1.0983819961547852, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.7597965002059937, + "rewards/ngram_similarity_reward/std": 0.3037159740924835, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 502.203125, + "completions/mean_terminated_length": 502.203125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.4998881181472365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09346076101064682, + "learning_rate": 4.527501804309935e-06, + "loss": 0.0285, + "num_tokens": 178005147.0, + "reward": 5.347973823547363, + "reward_std": 1.6126937866210938, + "rewards/accuracy_reward/mean": 4.53125, + "rewards/accuracy_reward/std": 2.27455735206604, + "rewards/ngram_similarity_reward/mean": 0.8167234659194946, + "rewards/ngram_similarity_reward/std": 0.27641963958740234, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 444.9375, + "completions/mean_terminated_length": 444.9375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.5003356455582905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.073221355676651, + "learning_rate": 4.52648064202867e-06, + "loss": 0.0217, + "num_tokens": 178157671.0, + "reward": 4.819570541381836, + "reward_std": 0.08227217942476273, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8195701837539673, + "rewards/ngram_similarity_reward/std": 0.18116185069084167, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 399.359375, + "completions/mean_terminated_length": 399.359375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.5007831729693444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09368530660867691, + "learning_rate": 4.525458507223633e-06, + "loss": 0.0076, + "num_tokens": 178299662.0, + "reward": 2.5764033794403076, + "reward_std": 1.840775728225708, + "rewards/accuracy_reward/mean": 1.9375, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6389033198356628, + "rewards/ngram_similarity_reward/std": 0.21850425004959106, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 400.46875, + "completions/mean_terminated_length": 400.46875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5012307003803983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0986308604478836, + "learning_rate": 4.524435400454388e-06, + "loss": -0.0369, + "num_tokens": 178455596.0, + "reward": 6.323049545288086, + "reward_std": 0.1640501320362091, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8230493068695068, + "rewards/ngram_similarity_reward/std": 0.21891455352306366, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 528.78125, + "completions/mean_terminated_length": 528.78125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5016782277914522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05695182830095291, + "learning_rate": 4.523411322281027e-06, + "loss": -0.0004, + "num_tokens": 178566734.0, + "reward": 6.216606616973877, + "reward_std": 0.48135465383529663, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.8103565573692322, + "rewards/ngram_similarity_reward/std": 0.1939639449119568, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 468.90625, + "completions/mean_terminated_length": 468.90625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.5021257552025061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.085639089345932, + "learning_rate": 4.5223862732641775e-06, + "loss": -0.0116, + "num_tokens": 178714232.0, + "reward": 3.6673331260681152, + "reward_std": 1.353724718093872, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6048333048820496, + "rewards/ngram_similarity_reward/std": 0.2771512567996979, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 375.109375, + "completions/mean_terminated_length": 375.109375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.50257328261356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0942516103386879, + "learning_rate": 4.5213602539649935e-06, + "loss": -0.013, + "num_tokens": 178839855.0, + "reward": 4.5549468994140625, + "reward_std": 0.5895836353302002, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.664322018623352, + "rewards/ngram_similarity_reward/std": 0.35421836376190186, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 587.4375, + "completions/mean_terminated_length": 587.4375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.503020810024614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06759040802717209, + "learning_rate": 4.520333264945164e-06, + "loss": 0.0033, + "num_tokens": 178983515.0, + "reward": 4.4897613525390625, + "reward_std": 0.7100332975387573, + "rewards/accuracy_reward/mean": 3.78125, + "rewards/accuracy_reward/std": 2.7744226455688477, + "rewards/ngram_similarity_reward/mean": 0.7085116505622864, + "rewards/ngram_similarity_reward/std": 0.2417541742324829, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 365.578125, + "completions/mean_terminated_length": 365.578125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.503468337435668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12146230787038803, + "learning_rate": 4.519305306766911e-06, + "loss": -0.0168, + "num_tokens": 179108272.0, + "reward": 4.922126770019531, + "reward_std": 1.7761967182159424, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.6408767104148865, + "rewards/ngram_similarity_reward/std": 0.30786970257759094, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 467.796875, + "completions/mean_terminated_length": 467.796875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5039158648467219, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08122233301401138, + "learning_rate": 4.51827637999298e-06, + "loss": -0.0069, + "num_tokens": 179257155.0, + "reward": 6.2950897216796875, + "reward_std": 0.21259811520576477, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7950894236564636, + "rewards/ngram_similarity_reward/std": 0.2753046751022339, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 371.546875, + "completions/mean_terminated_length": 371.546875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.5043633922577758, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0807737484574318, + "learning_rate": 4.517246485186653e-06, + "loss": 0.0073, + "num_tokens": 179423318.0, + "reward": 4.422578811645508, + "reward_std": 0.6542232036590576, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7038288116455078, + "rewards/ngram_similarity_reward/std": 0.3005114197731018, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 433.390625, + "completions/mean_terminated_length": 433.390625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.5048109196688297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07679495215415955, + "learning_rate": 4.51621562291174e-06, + "loss": -0.0012, + "num_tokens": 179544383.0, + "reward": 4.917806148529053, + "reward_std": 0.4657190442085266, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.8240560293197632, + "rewards/ngram_similarity_reward/std": 0.24326196312904358, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 388.765625, + "completions/mean_terminated_length": 388.765625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.5052584470798837, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10028428584337234, + "learning_rate": 4.515183793732579e-06, + "loss": -0.0079, + "num_tokens": 179676352.0, + "reward": 3.615325450897217, + "reward_std": 1.9288501739501953, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7403252124786377, + "rewards/ngram_similarity_reward/std": 0.18498782813549042, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 455.203125, + "completions/mean_terminated_length": 455.203125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.5057059744909376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08782827854156494, + "learning_rate": 4.5141509982140395e-06, + "loss": -0.0153, + "num_tokens": 179826701.0, + "reward": 5.653952598571777, + "reward_std": 1.243302822113037, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.7164527773857117, + "rewards/ngram_similarity_reward/std": 0.3032922148704529, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 421.46875, + "completions/mean_terminated_length": 421.46875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5061535019019915, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08586135506629944, + "learning_rate": 4.5131172369215205e-06, + "loss": -0.0009, + "num_tokens": 179978059.0, + "reward": 4.517760276794434, + "reward_std": 0.7564865350723267, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7990100979804993, + "rewards/ngram_similarity_reward/std": 0.22309666872024536, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 472.421875, + "completions/mean_terminated_length": 472.421875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5066010293130454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07734288275241852, + "learning_rate": 4.512082510420946e-06, + "loss": 0.0058, + "num_tokens": 180163606.0, + "reward": 4.513777732849121, + "reward_std": 0.5919719934463501, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7012777328491211, + "rewards/ngram_similarity_reward/std": 0.39876317977905273, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 615.953125, + "completions/mean_terminated_length": 615.953125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5070485567240993, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07883348315954208, + "learning_rate": 4.511046819278773e-06, + "loss": 0.0073, + "num_tokens": 180363987.0, + "reward": 2.52891206741333, + "reward_std": 0.8345606923103333, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.4976623058319092, + "rewards/ngram_similarity_reward/std": 0.2290460467338562, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 522.390625, + "completions/mean_terminated_length": 522.390625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.5074960841351532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06173047423362732, + "learning_rate": 4.510010164061984e-06, + "loss": 0.0179, + "num_tokens": 180516812.0, + "reward": 6.1605224609375, + "reward_std": 0.0839911550283432, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6605223417282104, + "rewards/ngram_similarity_reward/std": 0.3330098092556, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 480.90625, + "completions/mean_terminated_length": 480.90625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.5079436115462072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09174945950508118, + "learning_rate": 4.508972545338089e-06, + "loss": 0.0102, + "num_tokens": 180700966.0, + "reward": 4.181850433349609, + "reward_std": 1.16425359249115, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6506001949310303, + "rewards/ngram_similarity_reward/std": 0.2055261731147766, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 566.65625, + "completions/mean_terminated_length": 566.65625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.5083911389572612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05755320191383362, + "learning_rate": 4.507933963675128e-06, + "loss": 0.0011, + "num_tokens": 180878528.0, + "reward": 6.443551063537598, + "reward_std": 0.08974266052246094, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9435515403747559, + "rewards/ngram_similarity_reward/std": 0.1593705117702484, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 393.953125, + "completions/mean_terminated_length": 393.953125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5088386663683151, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06603922694921494, + "learning_rate": 4.506894419641663e-06, + "loss": 0.0243, + "num_tokens": 181015997.0, + "reward": 2.74050235748291, + "reward_std": 1.3518166542053223, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6155022978782654, + "rewards/ngram_similarity_reward/std": 0.3120533525943756, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 538.90625, + "completions/mean_terminated_length": 538.90625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.509286193779369, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06870218366384506, + "learning_rate": 4.505853913806789e-06, + "loss": 0.0007, + "num_tokens": 181179383.0, + "reward": 4.546904563903809, + "reward_std": 0.1394781917333603, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5469047427177429, + "rewards/ngram_similarity_reward/std": 0.24873505532741547, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 421.125, + "completions/mean_terminated_length": 421.125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.5097337211904229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09319427609443665, + "learning_rate": 4.504812446740124e-06, + "loss": 0.052, + "num_tokens": 181322175.0, + "reward": 3.070396900177002, + "reward_std": 0.22405299544334412, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.570397138595581, + "rewards/ngram_similarity_reward/std": 0.3630915582180023, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 500.453125, + "completions/mean_terminated_length": 500.453125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.5101812486014768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08894863724708557, + "learning_rate": 4.5037700190118125e-06, + "loss": 0.0099, + "num_tokens": 181466556.0, + "reward": 3.2207350730895996, + "reward_std": 0.19931933283805847, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7207349538803101, + "rewards/ngram_similarity_reward/std": 0.22353129088878632, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 452.328125, + "completions/mean_terminated_length": 452.328125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5106287760125308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0830058678984642, + "learning_rate": 4.502726631192526e-06, + "loss": -0.0097, + "num_tokens": 181617409.0, + "reward": 3.030059814453125, + "reward_std": 0.12081623822450638, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5300598740577698, + "rewards/ngram_similarity_reward/std": 0.19034965336322784, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 506.5625, + "completions/mean_terminated_length": 506.5625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.5110763034235847, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07904507964849472, + "learning_rate": 4.501682283853461e-06, + "loss": -0.0101, + "num_tokens": 181778421.0, + "reward": 4.437538146972656, + "reward_std": 0.523172914981842, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.5469129085540771, + "rewards/ngram_similarity_reward/std": 0.2477530837059021, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 490.296875, + "completions/mean_terminated_length": 490.296875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.5115238308346386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0692552924156189, + "learning_rate": 4.500636977566339e-06, + "loss": 0.0159, + "num_tokens": 181923240.0, + "reward": 4.113036632537842, + "reward_std": 1.2630510330200195, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.8630364537239075, + "rewards/ngram_similarity_reward/std": 0.20219597220420837, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 415.453125, + "completions/mean_terminated_length": 415.453125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5119713582456925, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08201241493225098, + "learning_rate": 4.499590712903406e-06, + "loss": -0.0128, + "num_tokens": 182052917.0, + "reward": 4.288032531738281, + "reward_std": 0.6296678781509399, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5692823529243469, + "rewards/ngram_similarity_reward/std": 0.21475210785865784, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 482.15625, + "completions/mean_terminated_length": 482.15625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.5124188856567464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08425358682870865, + "learning_rate": 4.498543490437435e-06, + "loss": -0.0208, + "num_tokens": 182215071.0, + "reward": 6.372078895568848, + "reward_std": 0.20720459520816803, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8720792531967163, + "rewards/ngram_similarity_reward/std": 0.24663090705871582, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 337.328125, + "completions/mean_terminated_length": 337.328125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5128664130678005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12950652837753296, + "learning_rate": 4.49749531074172e-06, + "loss": 0.0082, + "num_tokens": 182326564.0, + "reward": 5.764260292053223, + "reward_std": 1.266313076019287, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.7330105304718018, + "rewards/ngram_similarity_reward/std": 0.3188089430332184, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 418.328125, + "completions/mean_terminated_length": 418.328125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5133139404788544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09356052428483963, + "learning_rate": 4.496446174390082e-06, + "loss": -0.005, + "num_tokens": 182469689.0, + "reward": 3.608661413192749, + "reward_std": 1.2372500896453857, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.5461612939834595, + "rewards/ngram_similarity_reward/std": 0.14150479435920715, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 465.40625, + "completions/mean_terminated_length": 465.40625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.5137614678899083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.089523546397686, + "learning_rate": 4.495396081956864e-06, + "loss": -0.0015, + "num_tokens": 182633267.0, + "reward": 2.098345994949341, + "reward_std": 1.1912450790405273, + "rewards/accuracy_reward/mean": 1.296875, + "rewards/accuracy_reward/std": 2.9823806285858154, + "rewards/ngram_similarity_reward/mean": 0.8014709949493408, + "rewards/ngram_similarity_reward/std": 0.26014891266822815, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 337.75, + "completions/mean_terminated_length": 337.75, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.5142089953009622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11801254749298096, + "learning_rate": 4.494345034016932e-06, + "loss": -0.0032, + "num_tokens": 182752707.0, + "reward": 3.136491060256958, + "reward_std": 0.16078060865402222, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.6521161794662476, + "rewards/ngram_similarity_reward/std": 0.3107413053512573, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 438.640625, + "completions/mean_terminated_length": 438.640625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5146565227120161, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07768989354372025, + "learning_rate": 4.4932930311456774e-06, + "loss": 0.0026, + "num_tokens": 182898876.0, + "reward": 3.6199331283569336, + "reward_std": 0.8400613069534302, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6511832475662231, + "rewards/ngram_similarity_reward/std": 0.43442097306251526, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 405.0, + "completions/mean_terminated_length": 405.0, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.51510405012307, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08477354794740677, + "learning_rate": 4.492240073919013e-06, + "loss": 0.0245, + "num_tokens": 183055804.0, + "reward": 6.137720584869385, + "reward_std": 0.46785545349121094, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.7314706444740295, + "rewards/ngram_similarity_reward/std": 0.3978707492351532, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 351.796875, + "completions/mean_terminated_length": 351.796875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.515551577534124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11964279413223267, + "learning_rate": 4.4911861629133724e-06, + "loss": -0.0126, + "num_tokens": 183288719.0, + "reward": 3.1458687782287598, + "reward_std": 0.5388427972793579, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7396188974380493, + "rewards/ngram_similarity_reward/std": 0.3106095492839813, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 354.421875, + "completions/mean_terminated_length": 354.421875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5159991049451779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09884260594844818, + "learning_rate": 4.490131298705714e-06, + "loss": 0.0228, + "num_tokens": 183470058.0, + "reward": 4.703275203704834, + "reward_std": 0.062054891139268875, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7032753229141235, + "rewards/ngram_similarity_reward/std": 0.3125001788139343, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 385.53125, + "completions/mean_terminated_length": 385.53125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.5164466323562318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09258336573839188, + "learning_rate": 4.489075481873517e-06, + "loss": -0.0091, + "num_tokens": 183725836.0, + "reward": 4.709859848022461, + "reward_std": 1.5861510038375854, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.70986008644104, + "rewards/ngram_similarity_reward/std": 0.3363460302352905, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 486.6875, + "completions/mean_terminated_length": 486.6875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.5168941597672857, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0801980122923851, + "learning_rate": 4.488018712994782e-06, + "loss": -0.0039, + "num_tokens": 183890904.0, + "reward": 3.1528306007385254, + "reward_std": 0.1401413083076477, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.6684555411338806, + "rewards/ngram_similarity_reward/std": 0.372976154088974, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 512.796875, + "completions/mean_terminated_length": 512.796875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5173416871783397, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06421882659196854, + "learning_rate": 4.48696099264803e-06, + "loss": 0.0141, + "num_tokens": 184026987.0, + "reward": 4.849231719970703, + "reward_std": 0.05383665859699249, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8492318391799927, + "rewards/ngram_similarity_reward/std": 0.4074471592903137, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 507.953125, + "completions/mean_terminated_length": 507.953125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5177892145893936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06870316714048386, + "learning_rate": 4.485902321412304e-06, + "loss": -0.006, + "num_tokens": 184177672.0, + "reward": 5.312700271606445, + "reward_std": 0.9201359152793884, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.5627005100250244, + "rewards/ngram_similarity_reward/std": 0.2898489534854889, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 467.71875, + "completions/mean_terminated_length": 467.71875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5182367420004476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06772363185882568, + "learning_rate": 4.484842699867168e-06, + "loss": -0.0277, + "num_tokens": 184329974.0, + "reward": 4.685681343078613, + "reward_std": 0.20069976150989532, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6856814622879028, + "rewards/ngram_similarity_reward/std": 0.26604440808296204, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 421.234375, + "completions/mean_terminated_length": 421.234375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5186842694115015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08867233991622925, + "learning_rate": 4.483782128592706e-06, + "loss": -0.0217, + "num_tokens": 184509573.0, + "reward": 6.380373001098633, + "reward_std": 0.4640117585659027, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.9741232395172119, + "rewards/ngram_similarity_reward/std": 0.22101424634456635, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 437.1875, + "completions/mean_terminated_length": 437.1875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5191317968225554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08081317692995071, + "learning_rate": 4.48272060816952e-06, + "loss": 0.0263, + "num_tokens": 184663793.0, + "reward": 3.2799012660980225, + "reward_std": 0.060639895498752594, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7799011468887329, + "rewards/ngram_similarity_reward/std": 0.3067109286785126, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 444.9375, + "completions/mean_terminated_length": 444.9375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5195793242336093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06213630363345146, + "learning_rate": 4.481658139178734e-06, + "loss": 0.0039, + "num_tokens": 184800813.0, + "reward": 6.081175804138184, + "reward_std": 0.09067673981189728, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5811758637428284, + "rewards/ngram_similarity_reward/std": 0.3027797341346741, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 562.46875, + "completions/mean_terminated_length": 562.46875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.5200268516446632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058607082813978195, + "learning_rate": 4.4805947222019895e-06, + "loss": -0.0046, + "num_tokens": 184989755.0, + "reward": 5.9834794998168945, + "reward_std": 0.7814930081367493, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.8584795594215393, + "rewards/ngram_similarity_reward/std": 0.22565433382987976, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 440.78125, + "completions/mean_terminated_length": 440.78125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5204743790557171, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07914085686206818, + "learning_rate": 4.479530357821448e-06, + "loss": 0.0066, + "num_tokens": 185155181.0, + "reward": 5.609979629516602, + "reward_std": 1.271763801574707, + "rewards/accuracy_reward/mean": 4.828125, + "rewards/accuracy_reward/std": 1.9359153509140015, + "rewards/ngram_similarity_reward/mean": 0.7818544507026672, + "rewards/ngram_similarity_reward/std": 0.1954008787870407, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 522.46875, + "completions/mean_terminated_length": 522.46875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.5209219064667711, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06199885904788971, + "learning_rate": 4.47846504661979e-06, + "loss": 0.0148, + "num_tokens": 185303163.0, + "reward": 4.539876937866211, + "reward_std": 0.41415005922317505, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6336269378662109, + "rewards/ngram_similarity_reward/std": 0.2983126640319824, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 482.046875, + "completions/mean_terminated_length": 482.046875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.521369433877825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06533315032720566, + "learning_rate": 4.477398789180214e-06, + "loss": -0.0054, + "num_tokens": 185441102.0, + "reward": 3.6075239181518555, + "reward_std": 0.7264617681503296, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7325237989425659, + "rewards/ngram_similarity_reward/std": 0.2967837154865265, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 518.390625, + "completions/mean_terminated_length": 518.390625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.5218169612888789, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08033981174230576, + "learning_rate": 4.476331586086435e-06, + "loss": -0.0108, + "num_tokens": 185581527.0, + "reward": 2.005021810531616, + "reward_std": 0.960408091545105, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.723771870136261, + "rewards/ngram_similarity_reward/std": 0.2440297156572342, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 471.796875, + "completions/mean_terminated_length": 471.796875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5222644886999329, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06338745355606079, + "learning_rate": 4.475263437922689e-06, + "loss": 0.0028, + "num_tokens": 185712426.0, + "reward": 4.866727828979492, + "reward_std": 0.10598120093345642, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8667280673980713, + "rewards/ngram_similarity_reward/std": 0.3441309630870819, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 645.953125, + "completions/mean_terminated_length": 645.953125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.5227120161109868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05194732919335365, + "learning_rate": 4.474194345273726e-06, + "loss": 0.0358, + "num_tokens": 185883015.0, + "reward": 6.271397590637207, + "reward_std": 0.13656187057495117, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7713972330093384, + "rewards/ngram_similarity_reward/std": 0.20546936988830566, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 388.6875, + "completions/mean_terminated_length": 388.6875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.5231595435220407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08884132653474808, + "learning_rate": 4.473124308724814e-06, + "loss": 0.0119, + "num_tokens": 185998451.0, + "reward": 3.39813232421875, + "reward_std": 0.6454095244407654, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.7106325626373291, + "rewards/ngram_similarity_reward/std": 0.32445278763771057, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 305.453125, + "completions/mean_terminated_length": 305.453125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5236070709330947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11478908360004425, + "learning_rate": 4.472053328861738e-06, + "loss": 0.0363, + "num_tokens": 186162144.0, + "reward": 5.454303741455078, + "reward_std": 1.8677128553390503, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.9855537414550781, + "rewards/ngram_similarity_reward/std": 0.2933935523033142, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 374.828125, + "completions/mean_terminated_length": 374.828125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.5240545983441486, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09934691339731216, + "learning_rate": 4.470981406270802e-06, + "loss": 0.0094, + "num_tokens": 186365653.0, + "reward": 3.188791513442993, + "reward_std": 0.1223050057888031, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.7044165134429932, + "rewards/ngram_similarity_reward/std": 0.1886785477399826, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 456.9375, + "completions/mean_terminated_length": 456.9375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5245021257552025, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09197183698415756, + "learning_rate": 4.469908541538821e-06, + "loss": 0.0019, + "num_tokens": 186516833.0, + "reward": 3.5278964042663574, + "reward_std": 0.8360381722450256, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.4653964638710022, + "rewards/ngram_similarity_reward/std": 0.18946166336536407, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 477.09375, + "completions/mean_terminated_length": 477.09375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.5249496531662564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08766616135835648, + "learning_rate": 4.468834735253129e-06, + "loss": -0.0272, + "num_tokens": 186673159.0, + "reward": 3.9475269317626953, + "reward_std": 0.9023805856704712, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6037769317626953, + "rewards/ngram_similarity_reward/std": 0.3598421514034271, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 360.390625, + "completions/mean_terminated_length": 360.390625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.5253971805773103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11231087893247604, + "learning_rate": 4.467759988001576e-06, + "loss": 0.0126, + "num_tokens": 186914208.0, + "reward": 1.7884948253631592, + "reward_std": 0.534112274646759, + "rewards/accuracy_reward/mean": 1.03125, + "rewards/accuracy_reward/std": 2.7195281982421875, + "rewards/ngram_similarity_reward/mean": 0.7572449445724487, + "rewards/ngram_similarity_reward/std": 0.28806519508361816, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 473.125, + "completions/mean_terminated_length": 473.125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.5258447079883642, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08042744547128677, + "learning_rate": 4.466684300372524e-06, + "loss": 0.0121, + "num_tokens": 187085416.0, + "reward": 3.117652177810669, + "reward_std": 0.814605712890625, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6176521182060242, + "rewards/ngram_similarity_reward/std": 0.17957745492458344, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 357.75, + "completions/mean_terminated_length": 357.75, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5262922353994182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.138007253408432, + "learning_rate": 4.465607672954855e-06, + "loss": -0.0453, + "num_tokens": 187231064.0, + "reward": 5.352939605712891, + "reward_std": 0.8770514726638794, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.8841900825500488, + "rewards/ngram_similarity_reward/std": 0.2505876123905182, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 338.390625, + "completions/mean_terminated_length": 338.390625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5267397628104722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11972807347774506, + "learning_rate": 4.464530106337959e-06, + "loss": 0.0243, + "num_tokens": 187372497.0, + "reward": 3.768209218978882, + "reward_std": 0.9440657496452332, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6119591593742371, + "rewards/ngram_similarity_reward/std": 0.2602013051509857, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 432.234375, + "completions/mean_terminated_length": 432.234375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.5271872902215261, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11133215576410294, + "learning_rate": 4.4634516011117455e-06, + "loss": -0.0083, + "num_tokens": 187599568.0, + "reward": 2.9685022830963135, + "reward_std": 1.5151430368423462, + "rewards/accuracy_reward/mean": 2.203125, + "rewards/accuracy_reward/std": 3.0272817611694336, + "rewards/ngram_similarity_reward/mean": 0.7653775215148926, + "rewards/ngram_similarity_reward/std": 0.40268921852111816, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 434.78125, + "completions/mean_terminated_length": 434.78125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.52763481763258, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06760997325181961, + "learning_rate": 4.4623721578666345e-06, + "loss": -0.0061, + "num_tokens": 187845202.0, + "reward": 2.0149641036987305, + "reward_std": 0.9120714664459229, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7337141633033752, + "rewards/ngram_similarity_reward/std": 0.2726683020591736, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 449.578125, + "completions/mean_terminated_length": 449.578125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.5280823450436339, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0716506615281105, + "learning_rate": 4.461291777193562e-06, + "loss": 0.0071, + "num_tokens": 188014775.0, + "reward": 5.391098976135254, + "reward_std": 1.4299428462982178, + "rewards/accuracy_reward/mean": 4.640625, + "rewards/accuracy_reward/std": 2.1445181369781494, + "rewards/ngram_similarity_reward/mean": 0.7504733800888062, + "rewards/ngram_similarity_reward/std": 0.2910078167915344, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 500.984375, + "completions/mean_terminated_length": 500.984375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.5285298724546879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07876599580049515, + "learning_rate": 4.460210459683975e-06, + "loss": 0.0322, + "num_tokens": 188172854.0, + "reward": 4.6322832107543945, + "reward_std": 0.08452863991260529, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6322831511497498, + "rewards/ngram_similarity_reward/std": 0.3113572597503662, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 431.34375, + "completions/mean_terminated_length": 431.34375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5289773998657418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08243060857057571, + "learning_rate": 4.459128205929835e-06, + "loss": 0.0084, + "num_tokens": 188320764.0, + "reward": 4.336956977844238, + "reward_std": 0.7198574542999268, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6182069778442383, + "rewards/ngram_similarity_reward/std": 0.19275131821632385, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 435.265625, + "completions/mean_terminated_length": 435.265625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5294249272767957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08188221603631973, + "learning_rate": 4.458045016523615e-06, + "loss": -0.0147, + "num_tokens": 188481293.0, + "reward": 4.523244857788086, + "reward_std": 1.700342059135437, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6169949173927307, + "rewards/ngram_similarity_reward/std": 0.325880229473114, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 408.3125, + "completions/mean_terminated_length": 408.3125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.5298724546878496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12688559293746948, + "learning_rate": 4.4569608920582994e-06, + "loss": -0.0039, + "num_tokens": 188771185.0, + "reward": 2.5959887504577637, + "reward_std": 0.7658644914627075, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.4866139888763428, + "rewards/ngram_similarity_reward/std": 0.2558864653110504, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 577.03125, + "completions/mean_terminated_length": 577.03125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.5303199820989035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06306634098291397, + "learning_rate": 4.455875833127388e-06, + "loss": 0.0149, + "num_tokens": 188898899.0, + "reward": 3.541544198989868, + "reward_std": 0.8483132123947144, + "rewards/accuracy_reward/mean": 2.859375, + "rewards/accuracy_reward/std": 3.0203921794891357, + "rewards/ngram_similarity_reward/mean": 0.6821693181991577, + "rewards/ngram_similarity_reward/std": 0.26785826683044434, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 402.6875, + "completions/mean_terminated_length": 402.6875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.5307675095099574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10755161941051483, + "learning_rate": 4.4547898403248885e-06, + "loss": 0.0222, + "num_tokens": 189054527.0, + "reward": 4.52208948135376, + "reward_std": 0.13962656259536743, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.522089421749115, + "rewards/ngram_similarity_reward/std": 0.29837271571159363, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 504.546875, + "completions/mean_terminated_length": 504.546875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5312150369210115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06728096306324005, + "learning_rate": 4.4537029142453215e-06, + "loss": -0.0118, + "num_tokens": 189207106.0, + "reward": 4.069512367248535, + "reward_std": 0.899407148361206, + "rewards/accuracy_reward/mean": 3.453125, + "rewards/accuracy_reward/std": 2.962354898452759, + "rewards/ngram_similarity_reward/mean": 0.6163874864578247, + "rewards/ngram_similarity_reward/std": 0.2319907248020172, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 355.28125, + "completions/mean_terminated_length": 355.28125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5316625643320654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09989286214113235, + "learning_rate": 4.452615055483719e-06, + "loss": 0.0002, + "num_tokens": 189361076.0, + "reward": 6.271515369415283, + "reward_std": 0.5231516361236572, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.8652653694152832, + "rewards/ngram_similarity_reward/std": 0.3173908591270447, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 402.515625, + "completions/mean_terminated_length": 402.515625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5321100917431193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0916324183344841, + "learning_rate": 4.451526264635622e-06, + "loss": -0.0148, + "num_tokens": 189512661.0, + "reward": 4.111636638641357, + "reward_std": 0.8803005814552307, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.7678864598274231, + "rewards/ngram_similarity_reward/std": 0.2462681084871292, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 432.78125, + "completions/mean_terminated_length": 432.78125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5325576191541732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10155756026506424, + "learning_rate": 4.450436542297082e-06, + "loss": 0.0177, + "num_tokens": 189640967.0, + "reward": 1.6175291538238525, + "reward_std": 0.1283179223537445, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6175291538238525, + "rewards/ngram_similarity_reward/std": 0.1763124018907547, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 691.109375, + "completions/mean_terminated_length": 497.26788330078125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.5330051465652271, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07862675935029984, + "learning_rate": 4.4493458890646615e-06, + "loss": -0.1386, + "num_tokens": 189758030.0, + "reward": 3.4345579147338867, + "reward_std": 1.5686781406402588, + "rewards/accuracy_reward/mean": 2.953125, + "rewards/accuracy_reward/std": 3.0075550079345703, + "rewards/ngram_similarity_reward/mean": 0.4814329743385315, + "rewards/ngram_similarity_reward/std": 0.26114898920059204, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 439.953125, + "completions/mean_terminated_length": 439.953125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.533452673976281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10595672577619553, + "learning_rate": 4.448254305535432e-06, + "loss": -0.0085, + "num_tokens": 189893019.0, + "reward": 3.101775884628296, + "reward_std": 0.18303251266479492, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.6174007654190063, + "rewards/ngram_similarity_reward/std": 0.2428828775882721, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 377.53125, + "completions/mean_terminated_length": 377.53125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.533900201387335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09084273874759674, + "learning_rate": 4.447161792306976e-06, + "loss": -0.0022, + "num_tokens": 190032973.0, + "reward": 2.9964699745178223, + "reward_std": 0.48650163412094116, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.41834527254104614, + "rewards/ngram_similarity_reward/std": 0.40859779715538025, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 445.21875, + "completions/mean_terminated_length": 445.21875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.5343477287983889, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07660823315382004, + "learning_rate": 4.446068349977381e-06, + "loss": -0.0193, + "num_tokens": 190167883.0, + "reward": 4.553478240966797, + "reward_std": 0.8184431195259094, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.850353479385376, + "rewards/ngram_similarity_reward/std": 0.3576010465621948, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 419.4375, + "completions/mean_terminated_length": 419.4375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5347952562094428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12094959616661072, + "learning_rate": 4.444973979145247e-06, + "loss": -0.0241, + "num_tokens": 190289575.0, + "reward": 3.085411548614502, + "reward_std": 1.3554508686065674, + "rewards/accuracy_reward/mean": 2.4375, + "rewards/accuracy_reward/std": 3.095695972442627, + "rewards/ngram_similarity_reward/mean": 0.6479116678237915, + "rewards/ngram_similarity_reward/std": 0.3733079135417938, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 428.1875, + "completions/mean_terminated_length": 428.1875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.5352427836204967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07159439474344254, + "learning_rate": 4.443878680409681e-06, + "loss": -0.0079, + "num_tokens": 190427731.0, + "reward": 6.272378921508789, + "reward_std": 0.09958252310752869, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7723792195320129, + "rewards/ngram_similarity_reward/std": 0.3313034176826477, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 387.4375, + "completions/mean_terminated_length": 387.4375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5356903110315507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08593352138996124, + "learning_rate": 4.442782454370296e-06, + "loss": 0.0077, + "num_tokens": 190591567.0, + "reward": 4.706838130950928, + "reward_std": 0.5320311188697815, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.8005880117416382, + "rewards/ngram_similarity_reward/std": 0.26705488562583923, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 459.296875, + "completions/mean_terminated_length": 459.296875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.5361378384426047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09371272474527359, + "learning_rate": 4.441685301627216e-06, + "loss": 0.0152, + "num_tokens": 190817410.0, + "reward": 3.785367488861084, + "reward_std": 0.9774179458618164, + "rewards/accuracy_reward/mean": 3.453125, + "rewards/accuracy_reward/std": 2.962354898452759, + "rewards/ngram_similarity_reward/mean": 0.33224231004714966, + "rewards/ngram_similarity_reward/std": 0.2916002869606018, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 383.078125, + "completions/mean_terminated_length": 383.078125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.5365853658536586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10788188129663467, + "learning_rate": 4.440587222781071e-06, + "loss": 0.0254, + "num_tokens": 190947559.0, + "reward": 1.6308579444885254, + "reward_std": 1.6703917980194092, + "rewards/accuracy_reward/mean": 1.140625, + "rewards/accuracy_reward/std": 2.7566208839416504, + "rewards/ngram_similarity_reward/mean": 0.4902329742908478, + "rewards/ngram_similarity_reward/std": 0.21183845400810242, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 408.375, + "completions/mean_terminated_length": 408.375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5370328932647125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09294694662094116, + "learning_rate": 4.439488218432996e-06, + "loss": 0.0038, + "num_tokens": 191123359.0, + "reward": 3.347154140472412, + "reward_std": 1.6056389808654785, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5659040212631226, + "rewards/ngram_similarity_reward/std": 0.32600826025009155, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 438.234375, + "completions/mean_terminated_length": 438.234375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.5374804206757664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09265422821044922, + "learning_rate": 4.438388289184637e-06, + "loss": -0.0324, + "num_tokens": 191257998.0, + "reward": 3.1129822731018066, + "reward_std": 1.1772977113723755, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.519232451915741, + "rewards/ngram_similarity_reward/std": 0.20867078006267548, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 504.6875, + "completions/mean_terminated_length": 504.6875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5379279480868203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07943686842918396, + "learning_rate": 4.437287435638141e-06, + "loss": -0.02, + "num_tokens": 191419034.0, + "reward": 1.963949203491211, + "reward_std": 1.6095166206359863, + "rewards/accuracy_reward/mean": 1.359375, + "rewards/accuracy_reward/std": 2.816432476043701, + "rewards/ngram_similarity_reward/mean": 0.6045742034912109, + "rewards/ngram_similarity_reward/std": 0.3855489194393158, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 341.03125, + "completions/mean_terminated_length": 341.03125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5383754754978742, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1066887304186821, + "learning_rate": 4.436185658396165e-06, + "loss": -0.0023, + "num_tokens": 191526300.0, + "reward": 4.599543571472168, + "reward_std": 1.4790633916854858, + "rewards/accuracy_reward/mean": 3.875, + "rewards/accuracy_reward/std": 2.7284510135650635, + "rewards/ngram_similarity_reward/mean": 0.7245436906814575, + "rewards/ngram_similarity_reward/std": 0.1518775224685669, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 427.9375, + "completions/mean_terminated_length": 427.9375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.5388230029089282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1014091968536377, + "learning_rate": 4.435082958061871e-06, + "loss": -0.0068, + "num_tokens": 191684808.0, + "reward": 0.0338769257068634, + "reward_std": 0.5324877500534058, + "rewards/accuracy_reward/mean": -0.453125, + "rewards/accuracy_reward/std": 0.7853760123252869, + "rewards/ngram_similarity_reward/mean": 0.4870019257068634, + "rewards/ngram_similarity_reward/std": 0.18385639786720276, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 513.0, + "completions/mean_terminated_length": 513.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.5392705303199821, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07513110339641571, + "learning_rate": 4.433979335238925e-06, + "loss": -0.0052, + "num_tokens": 191822568.0, + "reward": 4.481563568115234, + "reward_std": 0.7649232149124146, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.8565635681152344, + "rewards/ngram_similarity_reward/std": 0.19029684364795685, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 517.5, + "completions/mean_terminated_length": 517.5, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.539718057731036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08555588126182556, + "learning_rate": 4.4328747905314985e-06, + "loss": -0.0155, + "num_tokens": 192039816.0, + "reward": 1.8265221118927002, + "reward_std": 2.752342700958252, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5452721118927002, + "rewards/ngram_similarity_reward/std": 0.3277539610862732, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 418.625, + "completions/mean_terminated_length": 418.625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.5401655851420899, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09938278049230576, + "learning_rate": 4.431769324544268e-06, + "loss": 0.0287, + "num_tokens": 192202608.0, + "reward": 4.789680004119873, + "reward_std": 0.07458257675170898, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7896800637245178, + "rewards/ngram_similarity_reward/std": 0.32243624329566956, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 358.78125, + "completions/mean_terminated_length": 358.78125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.5406131125531439, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10705152899026871, + "learning_rate": 4.430662937882415e-06, + "loss": -0.0476, + "num_tokens": 192384818.0, + "reward": 4.78361177444458, + "reward_std": 1.1920162439346313, + "rewards/accuracy_reward/mean": 4.359375, + "rewards/accuracy_reward/std": 2.3962087631225586, + "rewards/ngram_similarity_reward/mean": 0.4242364764213562, + "rewards/ngram_similarity_reward/std": 0.24500861763954163, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 416.859375, + "completions/mean_terminated_length": 416.859375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5410606399641978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10445639491081238, + "learning_rate": 4.429555631151624e-06, + "loss": 0.0699, + "num_tokens": 192601529.0, + "reward": 3.170267105102539, + "reward_std": 0.23001646995544434, + "rewards/accuracy_reward/mean": 2.359375, + "rewards/accuracy_reward/std": 3.1816298961639404, + "rewards/ngram_similarity_reward/mean": 0.8108919858932495, + "rewards/ngram_similarity_reward/std": 0.39939984679222107, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 366.109375, + "completions/mean_terminated_length": 366.109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.5415081673752518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09566810727119446, + "learning_rate": 4.428447404958084e-06, + "loss": 0.0063, + "num_tokens": 192781024.0, + "reward": 2.16746187210083, + "reward_std": 2.0318212509155273, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6987118124961853, + "rewards/ngram_similarity_reward/std": 0.3588160574436188, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 349.296875, + "completions/mean_terminated_length": 349.296875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5419556947863057, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09934712946414948, + "learning_rate": 4.427338259908485e-06, + "loss": 0.0101, + "num_tokens": 192888531.0, + "reward": 5.944144248962402, + "reward_std": 0.8069709539413452, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.6316438913345337, + "rewards/ngram_similarity_reward/std": 0.28903961181640625, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 379.828125, + "completions/mean_terminated_length": 379.828125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5424032221973596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10533181577920914, + "learning_rate": 4.426228196610024e-06, + "loss": -0.0253, + "num_tokens": 192991032.0, + "reward": 4.857659339904785, + "reward_std": 0.11671176552772522, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8576596975326538, + "rewards/ngram_similarity_reward/std": 0.32358139753341675, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 446.375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.5428507496084135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0953093022108078, + "learning_rate": 4.4251172156703974e-06, + "loss": -0.0166, + "num_tokens": 193172656.0, + "reward": 2.717423915863037, + "reward_std": 2.219616174697876, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5924241542816162, + "rewards/ngram_similarity_reward/std": 0.3202495872974396, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 398.171875, + "completions/mean_terminated_length": 398.171875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.5432982770194674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12990568578243256, + "learning_rate": 4.424005317697805e-06, + "loss": -0.0166, + "num_tokens": 193324651.0, + "reward": 4.243120193481445, + "reward_std": 0.8481252789497375, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6181201934814453, + "rewards/ngram_similarity_reward/std": 0.2649337351322174, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 402.25, + "completions/mean_terminated_length": 402.25, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.5437458044305213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10022781044244766, + "learning_rate": 4.422892503300949e-06, + "loss": -0.0005, + "num_tokens": 193469899.0, + "reward": 4.512398719787598, + "reward_std": 0.8765596747398376, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.8873988389968872, + "rewards/ngram_similarity_reward/std": 0.29553747177124023, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 493.53125, + "completions/mean_terminated_length": 493.53125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5441933318415753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09071893244981766, + "learning_rate": 4.421778773089035e-06, + "loss": 0.0228, + "num_tokens": 193605133.0, + "reward": 4.1777238845825195, + "reward_std": 2.1853396892547607, + "rewards/accuracy_reward/mean": 3.59375, + "rewards/accuracy_reward/std": 2.854785919189453, + "rewards/ngram_similarity_reward/mean": 0.5839738249778748, + "rewards/ngram_similarity_reward/std": 0.24924445152282715, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 353.8125, + "completions/mean_terminated_length": 353.8125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.5446408592526292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10845791548490524, + "learning_rate": 4.420664127671764e-06, + "loss": 0.0341, + "num_tokens": 193749793.0, + "reward": 5.829488754272461, + "reward_std": 0.9526141881942749, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.9857387542724609, + "rewards/ngram_similarity_reward/std": 0.2537689805030823, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 466.78125, + "completions/mean_terminated_length": 466.78125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5450883866636832, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07989437133073807, + "learning_rate": 4.419548567659344e-06, + "loss": 0.0066, + "num_tokens": 193930371.0, + "reward": 2.572287082672119, + "reward_std": 0.8746790885925293, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.7285372018814087, + "rewards/ngram_similarity_reward/std": 0.2515488862991333, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 487.046875, + "completions/mean_terminated_length": 487.046875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.5455359140747371, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08332957327365875, + "learning_rate": 4.418432093662483e-06, + "loss": 0.0294, + "num_tokens": 194075942.0, + "reward": 4.446735382080078, + "reward_std": 0.46390363574028015, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5404852032661438, + "rewards/ngram_similarity_reward/std": 0.28095483779907227, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 412.859375, + "completions/mean_terminated_length": 412.859375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.545983441485791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0877496600151062, + "learning_rate": 4.417314706292386e-06, + "loss": 0.019, + "num_tokens": 194225917.0, + "reward": 5.413695812225342, + "reward_std": 1.3082340955734253, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.8511958122253418, + "rewards/ngram_similarity_reward/std": 0.26645204424858093, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 380.0625, + "completions/mean_terminated_length": 380.0625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.546430968896845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12315753102302551, + "learning_rate": 4.416196406160762e-06, + "loss": 0.061, + "num_tokens": 194423985.0, + "reward": 2.7438230514526367, + "reward_std": 1.4954211711883545, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.618823230266571, + "rewards/ngram_similarity_reward/std": 0.3517220914363861, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 432.234375, + "completions/mean_terminated_length": 432.234375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5468784963078989, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16123153269290924, + "learning_rate": 4.415077193879816e-06, + "loss": 0.0049, + "num_tokens": 194566000.0, + "reward": 3.785012722015381, + "reward_std": 1.0094263553619385, + "rewards/accuracy_reward/mean": 3.109375, + "rewards/accuracy_reward/std": 3.125000238418579, + "rewards/ngram_similarity_reward/mean": 0.6756376624107361, + "rewards/ngram_similarity_reward/std": 0.3560906946659088, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5473260237189528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09261993318796158, + "learning_rate": 4.413957070062256e-06, + "loss": -0.0121, + "num_tokens": 194725432.0, + "reward": 6.400691986083984, + "reward_std": 0.8437039256095886, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 1.0881922245025635, + "rewards/ngram_similarity_reward/std": 0.16068026423454285, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 407.578125, + "completions/mean_terminated_length": 407.578125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.5477735511300067, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11597134172916412, + "learning_rate": 4.4128360353212846e-06, + "loss": -0.0016, + "num_tokens": 194847341.0, + "reward": 4.762581825256348, + "reward_std": 0.218172088265419, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.7782068252563477, + "rewards/ngram_similarity_reward/std": 0.40134501457214355, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 491.21875, + "completions/mean_terminated_length": 466.5079650878906, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.5482210785410606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09415656328201294, + "learning_rate": 4.411714090270606e-06, + "loss": 0.0172, + "num_tokens": 195059947.0, + "reward": 3.096059799194336, + "reward_std": 0.42152872681617737, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6898097991943359, + "rewards/ngram_similarity_reward/std": 0.22004714608192444, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 459.6875, + "completions/mean_terminated_length": 459.6875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5486686059521145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08568942546844482, + "learning_rate": 4.4105912355244255e-06, + "loss": 0.01, + "num_tokens": 195212551.0, + "reward": 6.1095170974731445, + "reward_std": 0.8938249349594116, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.7970174551010132, + "rewards/ngram_similarity_reward/std": 0.36820217967033386, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 428.046875, + "completions/mean_terminated_length": 428.046875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5491161333631684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09603255242109299, + "learning_rate": 4.40946747169744e-06, + "loss": 0.0154, + "num_tokens": 195347834.0, + "reward": 2.0748066902160645, + "reward_std": 1.199176549911499, + "rewards/accuracy_reward/mean": 1.640625, + "rewards/accuracy_reward/std": 2.91611385345459, + "rewards/ngram_similarity_reward/mean": 0.4341817796230316, + "rewards/ngram_similarity_reward/std": 0.22849342226982117, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 398.203125, + "completions/mean_terminated_length": 398.203125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5495636607742225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09439033269882202, + "learning_rate": 4.4083427994048484e-06, + "loss": 0.0072, + "num_tokens": 195524167.0, + "reward": 6.176176071166992, + "reward_std": 0.10318027436733246, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6761763095855713, + "rewards/ngram_similarity_reward/std": 0.2377578467130661, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 504.359375, + "completions/mean_terminated_length": 504.359375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.5500111881852764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07317329943180084, + "learning_rate": 4.407217219262347e-06, + "loss": -0.0342, + "num_tokens": 195671966.0, + "reward": 3.126587390899658, + "reward_std": 0.4775499105453491, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7203375101089478, + "rewards/ngram_similarity_reward/std": 0.19649796187877655, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 428.515625, + "completions/mean_terminated_length": 428.515625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5504587155963303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08198384940624237, + "learning_rate": 4.406090731886125e-06, + "loss": 0.0426, + "num_tokens": 195794463.0, + "reward": 4.950111389160156, + "reward_std": 0.13821549713611603, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9501111507415771, + "rewards/ngram_similarity_reward/std": 0.2974853515625, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 420.203125, + "completions/mean_terminated_length": 420.203125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.5509062430073842, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09043321013450623, + "learning_rate": 4.404963337892874e-06, + "loss": 0.005, + "num_tokens": 195923756.0, + "reward": 2.980551242828369, + "reward_std": 0.0410948246717453, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.4805510640144348, + "rewards/ngram_similarity_reward/std": 0.2743230164051056, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 350.15625, + "completions/mean_terminated_length": 350.15625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.5513537704184381, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1214212104678154, + "learning_rate": 4.403835037899778e-06, + "loss": -0.0058, + "num_tokens": 196088710.0, + "reward": 3.1802761554718018, + "reward_std": 0.18114234507083893, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6802761554718018, + "rewards/ngram_similarity_reward/std": 0.26738229393959045, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 383.6875, + "completions/mean_terminated_length": 383.6875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.551801297829492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11884015798568726, + "learning_rate": 4.4027058325245186e-06, + "loss": 0.0295, + "num_tokens": 196264658.0, + "reward": 3.2437744140625, + "reward_std": 1.6648763418197632, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5562744140625, + "rewards/ngram_similarity_reward/std": 0.2828984260559082, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 450.328125, + "completions/mean_terminated_length": 450.328125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.552248825240546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08260753005743027, + "learning_rate": 4.401575722385272e-06, + "loss": -0.0038, + "num_tokens": 196394135.0, + "reward": 5.003075122833252, + "reward_std": 1.365034580230713, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.4405752122402191, + "rewards/ngram_similarity_reward/std": 0.19160227477550507, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 462.46875, + "completions/mean_terminated_length": 462.46875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.5526963526515999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08738621324300766, + "learning_rate": 4.400444708100712e-06, + "loss": -0.0076, + "num_tokens": 196559973.0, + "reward": 2.6876659393310547, + "reward_std": 1.1295281648635864, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5626658201217651, + "rewards/ngram_similarity_reward/std": 0.11666838079690933, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5531438800626538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11991672962903976, + "learning_rate": 4.399312790290002e-06, + "loss": 0.0102, + "num_tokens": 196731933.0, + "reward": 5.695017337799072, + "reward_std": 1.147326946258545, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.6637676954269409, + "rewards/ngram_similarity_reward/std": 0.3793351352214813, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 386.046875, + "completions/mean_terminated_length": 386.046875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5535914074737077, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12412123382091522, + "learning_rate": 4.398179969572807e-06, + "loss": 0.0123, + "num_tokens": 196894464.0, + "reward": 1.0227073431015015, + "reward_std": 0.8533003330230713, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 2.2060859203338623, + "rewards/ngram_similarity_reward/mean": 0.6008323431015015, + "rewards/ngram_similarity_reward/std": 0.287748783826828, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 469.078125, + "completions/mean_terminated_length": 469.078125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5540389348847617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07732565701007843, + "learning_rate": 4.397046246569281e-06, + "loss": 0.0105, + "num_tokens": 197067893.0, + "reward": 1.4631134271621704, + "reward_std": 0.08608405292034149, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.46311336755752563, + "rewards/ngram_similarity_reward/std": 0.13780225813388824, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 479.53125, + "completions/mean_terminated_length": 479.53125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.5544864622958157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07165555655956268, + "learning_rate": 4.395911621900076e-06, + "loss": 0.0031, + "num_tokens": 197229799.0, + "reward": 4.127664566040039, + "reward_std": 2.0247697830200195, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5964146852493286, + "rewards/ngram_similarity_reward/std": 0.3093799948692322, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 439.234375, + "completions/mean_terminated_length": 439.234375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5549339897068696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10605686157941818, + "learning_rate": 4.394776096186334e-06, + "loss": 0.0, + "num_tokens": 197377766.0, + "reward": 4.677614688873291, + "reward_std": 0.205747589468956, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.693239688873291, + "rewards/ngram_similarity_reward/std": 0.40632468461990356, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 427.984375, + "completions/mean_terminated_length": 427.984375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.5553815171179235, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09386388212442398, + "learning_rate": 4.393639670049692e-06, + "loss": 0.0178, + "num_tokens": 197547941.0, + "reward": 3.404207229614258, + "reward_std": 1.5193042755126953, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.435457319021225, + "rewards/ngram_similarity_reward/std": 0.2644595503807068, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 465.765625, + "completions/mean_terminated_length": 465.765625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5558290445289774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07347415387630463, + "learning_rate": 4.392502344112279e-06, + "loss": -0.0266, + "num_tokens": 197685126.0, + "reward": 3.5573441982269287, + "reward_std": 1.2206635475158691, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.49484407901763916, + "rewards/ngram_similarity_reward/std": 0.29545167088508606, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 442.4375, + "completions/mean_terminated_length": 442.4375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5562765719400313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09946972131729126, + "learning_rate": 4.391364118996719e-06, + "loss": 0.0076, + "num_tokens": 197872834.0, + "reward": 1.5781934261322021, + "reward_std": 0.5496147871017456, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.3906933069229126, + "rewards/ngram_similarity_reward/std": 0.4072954058647156, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 325.6875, + "completions/mean_terminated_length": 325.6875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5567240993510852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07552611082792282, + "learning_rate": 4.390224995326126e-06, + "loss": -0.0015, + "num_tokens": 198019518.0, + "reward": 3.113879919052124, + "reward_std": 0.4030926525592804, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5201297998428345, + "rewards/ngram_similarity_reward/std": 0.3127535283565521, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 618.53125, + "completions/mean_terminated_length": 618.53125, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.5571716267621392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05382458493113518, + "learning_rate": 4.389084973724106e-06, + "loss": -0.0017, + "num_tokens": 198208240.0, + "reward": 4.324518203735352, + "reward_std": 0.7830394506454468, + "rewards/accuracy_reward/mean": 3.609375, + "rewards/accuracy_reward/std": 2.829084634780884, + "rewards/ngram_similarity_reward/mean": 0.7151432633399963, + "rewards/ngram_similarity_reward/std": 0.22197884321212769, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 516.671875, + "completions/mean_terminated_length": 492.3651123046875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.5576191541731931, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07948078960180283, + "learning_rate": 4.3879440548147575e-06, + "loss": -0.0031, + "num_tokens": 198337131.0, + "reward": 4.668152809143066, + "reward_std": 0.5126365423202515, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7619028687477112, + "rewards/ngram_similarity_reward/std": 0.23275285959243774, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 463.5625, + "completions/mean_terminated_length": 463.5625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.558066681584247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12463536858558655, + "learning_rate": 4.386802239222669e-06, + "loss": 0.0054, + "num_tokens": 198543423.0, + "reward": 4.298855304718018, + "reward_std": 0.7150664925575256, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.5957306623458862, + "rewards/ngram_similarity_reward/std": 0.4176064431667328, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 478.5625, + "completions/mean_terminated_length": 478.5625, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.5585142089953009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07589740306138992, + "learning_rate": 4.385659527572922e-06, + "loss": -0.0055, + "num_tokens": 198674355.0, + "reward": 5.731651782989502, + "reward_std": 0.9091787934303284, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.8879019021987915, + "rewards/ngram_similarity_reward/std": 0.17611658573150635, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 530.96875, + "completions/mean_terminated_length": 530.96875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.5589617364063549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07664234191179276, + "learning_rate": 4.384515920491086e-06, + "loss": 0.0182, + "num_tokens": 198791345.0, + "reward": 2.9477784633636475, + "reward_std": 0.7595309019088745, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.8227784633636475, + "rewards/ngram_similarity_reward/std": 0.264209121465683, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 502.703125, + "completions/mean_terminated_length": 502.703125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5594092638174089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08277595043182373, + "learning_rate": 4.383371418603222e-06, + "loss": -0.0294, + "num_tokens": 198948110.0, + "reward": 3.1016769409179688, + "reward_std": 0.1552654504776001, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6016767024993896, + "rewards/ngram_similarity_reward/std": 0.300509512424469, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 395.53125, + "completions/mean_terminated_length": 395.53125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5598567912284628, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07156224548816681, + "learning_rate": 4.382226022535882e-06, + "loss": 0.0104, + "num_tokens": 199110688.0, + "reward": 4.769443035125732, + "reward_std": 0.07713472843170166, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7694433927536011, + "rewards/ngram_similarity_reward/std": 0.4159128963947296, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 409.390625, + "completions/mean_terminated_length": 409.390625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5603043186395167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08550208061933517, + "learning_rate": 4.381079732916104e-06, + "loss": -0.0078, + "num_tokens": 199290969.0, + "reward": 3.241121768951416, + "reward_std": 0.6393508911132812, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5536216497421265, + "rewards/ngram_similarity_reward/std": 0.28667500615119934, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 450.578125, + "completions/mean_terminated_length": 450.578125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5607518460505706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09212508797645569, + "learning_rate": 4.3799325503714205e-06, + "loss": -0.0084, + "num_tokens": 199500894.0, + "reward": 2.061561346054077, + "reward_std": 0.8142240643501282, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.5928114652633667, + "rewards/ngram_similarity_reward/std": 0.2968214154243469, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 355.953125, + "completions/mean_terminated_length": 355.953125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5611993734616245, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08225823193788528, + "learning_rate": 4.378784475529847e-06, + "loss": -0.0251, + "num_tokens": 199671707.0, + "reward": 6.1100172996521, + "reward_std": 0.10467161983251572, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6100171804428101, + "rewards/ngram_similarity_reward/std": 0.2915937900543213, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 479.15625, + "completions/mean_terminated_length": 479.15625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5616469008726784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0778251439332962, + "learning_rate": 4.377635509019891e-06, + "loss": -0.0155, + "num_tokens": 199827045.0, + "reward": 3.1580843925476074, + "reward_std": 0.5416929125785828, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.564334511756897, + "rewards/ngram_similarity_reward/std": 0.20748886466026306, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 419.9375, + "completions/mean_terminated_length": 419.9375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.5620944282837324, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08607825636863708, + "learning_rate": 4.376485651470549e-06, + "loss": -0.0038, + "num_tokens": 199985249.0, + "reward": 1.464486837387085, + "reward_std": 0.8320564031600952, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.48011189699172974, + "rewards/ngram_similarity_reward/std": 0.2833652198314667, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 428.390625, + "completions/mean_terminated_length": 428.390625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5625419556947863, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09921771287918091, + "learning_rate": 4.375334903511302e-06, + "loss": -0.0273, + "num_tokens": 200126858.0, + "reward": 3.4128971099853516, + "reward_std": 0.6406792402267456, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.7253968715667725, + "rewards/ngram_similarity_reward/std": 0.21139651536941528, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 529.0625, + "completions/mean_terminated_length": 529.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.5629894831058402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0691845715045929, + "learning_rate": 4.37418326577212e-06, + "loss": 0.0244, + "num_tokens": 200282302.0, + "reward": 3.6800453662872314, + "reward_std": 0.8345037698745728, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7112956047058105, + "rewards/ngram_similarity_reward/std": 0.25884538888931274, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 460.84375, + "completions/mean_terminated_length": 460.84375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.5634370105168942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09184509515762329, + "learning_rate": 4.37303073888346e-06, + "loss": 0.0102, + "num_tokens": 200438996.0, + "reward": 4.765527725219727, + "reward_std": 0.5454859733581543, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.6717779636383057, + "rewards/ngram_similarity_reward/std": 0.3333250880241394, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 413.65625, + "completions/mean_terminated_length": 413.65625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.5638845379279481, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1030343621969223, + "learning_rate": 4.3718773234762684e-06, + "loss": -0.0029, + "num_tokens": 200573614.0, + "reward": 3.8357348442077637, + "reward_std": 1.9523077011108398, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6794849634170532, + "rewards/ngram_similarity_reward/std": 0.3307085335254669, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 396.015625, + "completions/mean_terminated_length": 396.015625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.564332065339002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11136715114116669, + "learning_rate": 4.370723020181973e-06, + "loss": -0.0062, + "num_tokens": 200701919.0, + "reward": 4.722345352172852, + "reward_std": 1.2175238132476807, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.6285953521728516, + "rewards/ngram_similarity_reward/std": 0.27477124333381653, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 601.625, + "completions/mean_terminated_length": 601.625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.564779592750056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06975237280130386, + "learning_rate": 4.369567829632491e-06, + "loss": 0.0124, + "num_tokens": 200867223.0, + "reward": 3.798196792602539, + "reward_std": 1.3255693912506104, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.54819655418396, + "rewards/ngram_similarity_reward/std": 0.23060840368270874, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 409.546875, + "completions/mean_terminated_length": 409.546875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5652271201611099, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11291968077421188, + "learning_rate": 4.368411752460226e-06, + "loss": 0.0079, + "num_tokens": 200974810.0, + "reward": 1.9527816772460938, + "reward_std": 2.1675050258636475, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6715317964553833, + "rewards/ngram_similarity_reward/std": 0.31274405121803284, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 370.84375, + "completions/mean_terminated_length": 370.84375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5656746475721638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.117449551820755, + "learning_rate": 4.367254789298064e-06, + "loss": -0.0587, + "num_tokens": 201134016.0, + "reward": 4.690761089324951, + "reward_std": 0.18102560937404633, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6907616257667542, + "rewards/ngram_similarity_reward/std": 0.31563398241996765, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 358.828125, + "completions/mean_terminated_length": 358.828125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5661221749832177, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09254030138254166, + "learning_rate": 4.366096940779378e-06, + "loss": 0.0127, + "num_tokens": 201380165.0, + "reward": 4.939835548400879, + "reward_std": 0.05255156755447388, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9398356080055237, + "rewards/ngram_similarity_reward/std": 0.26915013790130615, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 371.234375, + "completions/mean_terminated_length": 371.234375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5665697023942716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12018410861492157, + "learning_rate": 4.364938207538025e-06, + "loss": 0.0074, + "num_tokens": 201576292.0, + "reward": 3.246954917907715, + "reward_std": 1.396032452583313, + "rewards/accuracy_reward/mean": 2.4375, + "rewards/accuracy_reward/std": 3.095695972442627, + "rewards/ngram_similarity_reward/mean": 0.8094548583030701, + "rewards/ngram_similarity_reward/std": 0.3154228627681732, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 455.75, + "completions/mean_terminated_length": 455.75, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.5670172298053255, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08729907870292664, + "learning_rate": 4.3637785902083465e-06, + "loss": 0.0213, + "num_tokens": 201734020.0, + "reward": 1.7822763919830322, + "reward_std": 1.6111629009246826, + "rewards/accuracy_reward/mean": 1.265625, + "rewards/accuracy_reward/std": 2.775986671447754, + "rewards/ngram_similarity_reward/mean": 0.5166513919830322, + "rewards/ngram_similarity_reward/std": 0.30705127120018005, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 390.90625, + "completions/mean_terminated_length": 390.90625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5674647572163795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10130637139081955, + "learning_rate": 4.362618089425169e-06, + "loss": 0.0195, + "num_tokens": 201891838.0, + "reward": 3.8338756561279297, + "reward_std": 2.096841335296631, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5838757753372192, + "rewards/ngram_similarity_reward/std": 0.34362757205963135, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 464.875, + "completions/mean_terminated_length": 439.7460632324219, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5679122846274335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08763626962900162, + "learning_rate": 4.361456705823802e-06, + "loss": -0.0274, + "num_tokens": 202039766.0, + "reward": 5.238713264465332, + "reward_std": 1.1649223566055298, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.8637134432792664, + "rewards/ngram_similarity_reward/std": 0.2816106379032135, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 418.171875, + "completions/mean_terminated_length": 418.171875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.5683598120384874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11126846075057983, + "learning_rate": 4.3602944400400364e-06, + "loss": -0.0053, + "num_tokens": 202186593.0, + "reward": 2.743793487548828, + "reward_std": 2.2492334842681885, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7125436067581177, + "rewards/ngram_similarity_reward/std": 0.23575741052627563, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1637.0, + "completions/mean_length": 489.609375, + "completions/mean_terminated_length": 439.33868408203125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5688073394495413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06754752993583679, + "learning_rate": 4.359131292710149e-06, + "loss": -0.0835, + "num_tokens": 202316040.0, + "reward": 5.6284284591674805, + "reward_std": 1.3737437725067139, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.7846779227256775, + "rewards/ngram_similarity_reward/std": 0.31788432598114014, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 481.921875, + "completions/mean_terminated_length": 481.921875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.5692548668605952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08389648795127869, + "learning_rate": 4.357967264470898e-06, + "loss": -0.0086, + "num_tokens": 202488723.0, + "reward": 3.3738982677459717, + "reward_std": 0.7010356187820435, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4988984763622284, + "rewards/ngram_similarity_reward/std": 0.2545229494571686, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 620.859375, + "completions/mean_terminated_length": 574.8225708007812, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5697023942716491, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09216287732124329, + "learning_rate": 4.356802355959524e-06, + "loss": 0.0526, + "num_tokens": 202655034.0, + "reward": 3.811068534851074, + "reward_std": 1.2754812240600586, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.8423184752464294, + "rewards/ngram_similarity_reward/std": 0.2445010542869568, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 513.625, + "completions/mean_terminated_length": 513.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5701499216827031, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08699966967105865, + "learning_rate": 4.355636567813747e-06, + "loss": 0.006, + "num_tokens": 202846946.0, + "reward": 4.7357497215271, + "reward_std": 0.1327037811279297, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7357498407363892, + "rewards/ngram_similarity_reward/std": 0.3276141285896301, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 401.890625, + "completions/mean_terminated_length": 401.890625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.570597449093757, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09865225851535797, + "learning_rate": 4.354469900671773e-06, + "loss": -0.0169, + "num_tokens": 202977371.0, + "reward": 3.6046531200408936, + "reward_std": 0.8405933976173401, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6359032392501831, + "rewards/ngram_similarity_reward/std": 0.2587908208370209, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 418.8125, + "completions/mean_terminated_length": 418.8125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5710449765048109, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10138774663209915, + "learning_rate": 4.353302355172286e-06, + "loss": -0.0067, + "num_tokens": 203110127.0, + "reward": 3.164700984954834, + "reward_std": 0.06457286328077316, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6647011041641235, + "rewards/ngram_similarity_reward/std": 0.3551769256591797, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 444.203125, + "completions/mean_terminated_length": 444.203125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.5714925039158648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0864931121468544, + "learning_rate": 4.3521339319544526e-06, + "loss": 0.0059, + "num_tokens": 203257500.0, + "reward": 5.6036834716796875, + "reward_std": 1.960383415222168, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.759933352470398, + "rewards/ngram_similarity_reward/std": 0.28998276591300964, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 408.625, + "completions/mean_terminated_length": 408.625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5719400313269187, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08241990953683853, + "learning_rate": 4.350964631657918e-06, + "loss": 0.0027, + "num_tokens": 203417636.0, + "reward": 3.7263097763061523, + "reward_std": 0.8650047183036804, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.5700598955154419, + "rewards/ngram_similarity_reward/std": 0.46347489953041077, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 390.75, + "completions/mean_terminated_length": 390.75, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5723875587379726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09796450287103653, + "learning_rate": 4.349794454922811e-06, + "loss": 0.0355, + "num_tokens": 203556820.0, + "reward": 4.256396293640137, + "reward_std": 1.2533926963806152, + "rewards/accuracy_reward/mean": 3.59375, + "rewards/accuracy_reward/std": 2.854785919189453, + "rewards/ngram_similarity_reward/mean": 0.6626464128494263, + "rewards/ngram_similarity_reward/std": 0.2611728310585022, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 435.828125, + "completions/mean_terminated_length": 417.920654296875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5728350861490267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20437754690647125, + "learning_rate": 4.348623402389735e-06, + "loss": 0.0418, + "num_tokens": 203834909.0, + "reward": 2.4070346355438232, + "reward_std": 0.9201688766479492, + "rewards/accuracy_reward/mean": 1.984375, + "rewards/accuracy_reward/std": 3.03415584564209, + "rewards/ngram_similarity_reward/mean": 0.42265960574150085, + "rewards/ngram_similarity_reward/std": 0.29591092467308044, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 480.421875, + "completions/mean_terminated_length": 480.421875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.5732826135600806, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0753888338804245, + "learning_rate": 4.347451474699777e-06, + "loss": -0.0334, + "num_tokens": 203976904.0, + "reward": 2.8891162872314453, + "reward_std": 0.1184445470571518, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.40474116802215576, + "rewards/ngram_similarity_reward/std": 0.23125404119491577, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 498.015625, + "completions/mean_terminated_length": 498.015625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.5737301409711345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07478698343038559, + "learning_rate": 4.346278672494504e-06, + "loss": 0.0139, + "num_tokens": 204172025.0, + "reward": 2.332071542739868, + "reward_std": 1.2610900402069092, + "rewards/accuracy_reward/mean": 1.546875, + "rewards/accuracy_reward/std": 2.886364698410034, + "rewards/ngram_similarity_reward/mean": 0.7851964235305786, + "rewards/ngram_similarity_reward/std": 0.16327303647994995, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 459.890625, + "completions/mean_terminated_length": 459.890625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.5741776683821884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08617988973855972, + "learning_rate": 4.345104996415955e-06, + "loss": 0.0302, + "num_tokens": 204340898.0, + "reward": 4.557013511657715, + "reward_std": 0.5276709794998169, + "rewards/accuracy_reward/mean": 3.875, + "rewards/accuracy_reward/std": 2.7284510135650635, + "rewards/ngram_similarity_reward/mean": 0.682013750076294, + "rewards/ngram_similarity_reward/std": 0.2925887703895569, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 396.453125, + "completions/mean_terminated_length": 396.453125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.5746251957932423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08058377355337143, + "learning_rate": 4.343930447106656e-06, + "loss": 0.0124, + "num_tokens": 204519631.0, + "reward": 1.8502486944198608, + "reward_std": 0.8469141721725464, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6627487540245056, + "rewards/ngram_similarity_reward/std": 0.1869189441204071, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 407.5, + "completions/mean_terminated_length": 407.5, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.5750727232042963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10437867790460587, + "learning_rate": 4.342755025209604e-06, + "loss": 0.0234, + "num_tokens": 204655599.0, + "reward": 5.386307239532471, + "reward_std": 2.221829652786255, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.8238072991371155, + "rewards/ngram_similarity_reward/std": 0.26526692509651184, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 564.3125, + "completions/mean_terminated_length": 564.3125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.5755202506153502, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.066917784512043, + "learning_rate": 4.34157873136828e-06, + "loss": 0.0032, + "num_tokens": 204823587.0, + "reward": 6.086606979370117, + "reward_std": 0.7912037372589111, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.9616069793701172, + "rewards/ngram_similarity_reward/std": 0.18773657083511353, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 481.75, + "completions/mean_terminated_length": 481.75, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5759677780264041, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08909545838832855, + "learning_rate": 4.340401566226636e-06, + "loss": -0.0426, + "num_tokens": 204978611.0, + "reward": 4.891693115234375, + "reward_std": 0.5363629460334778, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.797943115234375, + "rewards/ngram_similarity_reward/std": 0.23069414496421814, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 521.484375, + "completions/mean_terminated_length": 521.484375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.576415305437458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08944880217313766, + "learning_rate": 4.339223530429107e-06, + "loss": 0.0096, + "num_tokens": 205223346.0, + "reward": 4.646847248077393, + "reward_std": 0.12239819020032883, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.646847128868103, + "rewards/ngram_similarity_reward/std": 0.3760032057762146, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 693.90625, + "completions/mean_terminated_length": 500.46429443359375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.5768628328485119, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12657584249973297, + "learning_rate": 4.338044624620599e-06, + "loss": -0.0352, + "num_tokens": 205432556.0, + "reward": 1.4632389545440674, + "reward_std": 1.4085140228271484, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 2.455153465270996, + "rewards/ngram_similarity_reward/mean": 0.7757389545440674, + "rewards/ngram_similarity_reward/std": 0.2819943130016327, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 449.234375, + "completions/mean_terminated_length": 397.6612854003906, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.577310360259566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11718279868364334, + "learning_rate": 4.336864849446499e-06, + "loss": 0.0201, + "num_tokens": 205583355.0, + "reward": 3.979405641555786, + "reward_std": 1.3052562475204468, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6356555223464966, + "rewards/ngram_similarity_reward/std": 0.35236215591430664, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 575.4375, + "completions/mean_terminated_length": 527.9354858398438, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5777578876706199, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0954417958855629, + "learning_rate": 4.335684205552666e-06, + "loss": 0.0487, + "num_tokens": 205836455.0, + "reward": 0.9510928988456726, + "reward_std": 1.331400752067566, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 2.3426685333251953, + "rewards/ngram_similarity_reward/mean": 0.5135928988456726, + "rewards/ngram_similarity_reward/std": 0.3271593749523163, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 344.09375, + "completions/mean_terminated_length": 344.09375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.5782054150816738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11226638406515121, + "learning_rate": 4.334502693585438e-06, + "loss": -0.011, + "num_tokens": 206006973.0, + "reward": 3.1843786239624023, + "reward_std": 0.1548232138156891, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6843785643577576, + "rewards/ngram_similarity_reward/std": 0.288402259349823, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 375.921875, + "completions/mean_terminated_length": 375.921875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5786529424927277, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07860208302736282, + "learning_rate": 4.333320314191625e-06, + "loss": 0.0095, + "num_tokens": 206148088.0, + "reward": 4.348507881164551, + "reward_std": 0.7970369458198547, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.8172580599784851, + "rewards/ngram_similarity_reward/std": 0.1973295956850052, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 436.125, + "completions/mean_terminated_length": 436.125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.5791004699037816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09916546195745468, + "learning_rate": 4.332137068018517e-06, + "loss": -0.0098, + "num_tokens": 206294736.0, + "reward": 5.092933177947998, + "reward_std": 1.3503336906433105, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.8116834163665771, + "rewards/ngram_similarity_reward/std": 0.3395226299762726, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 548.0, + "completions/mean_terminated_length": 548.0, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5795479973148355, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07296004146337509, + "learning_rate": 4.330952955713871e-06, + "loss": -0.0303, + "num_tokens": 206444656.0, + "reward": 6.197272300720215, + "reward_std": 0.13062140345573425, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6972724795341492, + "rewards/ngram_similarity_reward/std": 0.31922462582588196, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 397.390625, + "completions/mean_terminated_length": 397.390625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5799955247258894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09220290929079056, + "learning_rate": 4.329767977925926e-06, + "loss": 0.0017, + "num_tokens": 206595369.0, + "reward": 5.39586067199707, + "reward_std": 1.393269658088684, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.645860493183136, + "rewards/ngram_similarity_reward/std": 0.1155499666929245, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 432.25, + "completions/mean_terminated_length": 432.25, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5804430521369434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08327258378267288, + "learning_rate": 4.328582135303387e-06, + "loss": -0.0105, + "num_tokens": 206748617.0, + "reward": 3.1711244583129883, + "reward_std": 0.4597923755645752, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5773743391036987, + "rewards/ngram_similarity_reward/std": 0.30644235014915466, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 500.75, + "completions/mean_terminated_length": 500.75, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5808905795479973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07713520526885986, + "learning_rate": 4.327395428495441e-06, + "loss": -0.0364, + "num_tokens": 206921913.0, + "reward": 1.6427359580993652, + "reward_std": 0.09815908223390579, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6427358984947205, + "rewards/ngram_similarity_reward/std": 0.23064683377742767, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 510.515625, + "completions/mean_terminated_length": 510.515625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.5813381069590512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06505846977233887, + "learning_rate": 4.326207858151739e-06, + "loss": 0.0012, + "num_tokens": 207084122.0, + "reward": 4.891460418701172, + "reward_std": 0.6402475237846375, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.703960657119751, + "rewards/ngram_similarity_reward/std": 0.29561981558799744, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 479.375, + "completions/mean_terminated_length": 454.4762268066406, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.5817856343701052, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09155668318271637, + "learning_rate": 4.325019424922412e-06, + "loss": -0.0135, + "num_tokens": 207233314.0, + "reward": 3.167853355407715, + "reward_std": 0.10784564912319183, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6678533554077148, + "rewards/ngram_similarity_reward/std": 0.25225144624710083, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 443.890625, + "completions/mean_terminated_length": 443.890625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5822331617811591, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0908183827996254, + "learning_rate": 4.323830129458061e-06, + "loss": 0.024, + "num_tokens": 207518395.0, + "reward": 4.62649393081665, + "reward_std": 2.101778030395508, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.9077439308166504, + "rewards/ngram_similarity_reward/std": 0.28142043948173523, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 446.359375, + "completions/mean_terminated_length": 446.359375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.582680689192213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09624571353197098, + "learning_rate": 4.322639972409759e-06, + "loss": 0.0042, + "num_tokens": 207667986.0, + "reward": 4.585222244262695, + "reward_std": 0.47039100527763367, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6789719462394714, + "rewards/ngram_similarity_reward/std": 0.23788464069366455, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 324.828125, + "completions/mean_terminated_length": 324.828125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.583128216603267, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10973282158374786, + "learning_rate": 4.321448954429048e-06, + "loss": -0.0066, + "num_tokens": 207798759.0, + "reward": 4.582745552062988, + "reward_std": 0.039319053292274475, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5827455520629883, + "rewards/ngram_similarity_reward/std": 0.1293172389268875, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 522.328125, + "completions/mean_terminated_length": 522.328125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5835757440143209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09233658760786057, + "learning_rate": 4.320257076167945e-06, + "loss": -0.0145, + "num_tokens": 207954252.0, + "reward": 2.652021646499634, + "reward_std": 2.0199222564697266, + "rewards/accuracy_reward/mean": 2.1875, + "rewards/accuracy_reward/std": 3.043989896774292, + "rewards/ngram_similarity_reward/mean": 0.4645217955112457, + "rewards/ngram_similarity_reward/std": 0.389131098985672, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 506.390625, + "completions/mean_terminated_length": 506.390625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.5840232714253748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0873550996184349, + "learning_rate": 4.319064338278937e-06, + "loss": 0.0418, + "num_tokens": 208083957.0, + "reward": 1.8698256015777588, + "reward_std": 2.0062408447265625, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6823254823684692, + "rewards/ngram_similarity_reward/std": 0.21194741129875183, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 403.28125, + "completions/mean_terminated_length": 403.28125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.5844707988364287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09757047891616821, + "learning_rate": 4.317870741414981e-06, + "loss": 0.0312, + "num_tokens": 208239943.0, + "reward": 2.9031214714050293, + "reward_std": 0.4686318039894104, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.4968714416027069, + "rewards/ngram_similarity_reward/std": 0.21332430839538574, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1324.0, + "completions/max_terminated_length": 1324.0, + "completions/mean_length": 412.046875, + "completions/mean_terminated_length": 412.046875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5849183262474826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12136556953191757, + "learning_rate": 4.3166762862295055e-06, + "loss": 0.0032, + "num_tokens": 208373370.0, + "reward": 3.7895092964172363, + "reward_std": 0.8431544303894043, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5395092368125916, + "rewards/ngram_similarity_reward/std": 0.2623805105686188, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 369.65625, + "completions/mean_terminated_length": 369.65625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.5853658536585366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11417142301797867, + "learning_rate": 4.315480973376406e-06, + "loss": 0.0012, + "num_tokens": 208533684.0, + "reward": 3.0894782543182373, + "reward_std": 1.7220485210418701, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5894783735275269, + "rewards/ngram_similarity_reward/std": 0.32322585582733154, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 423.84375, + "completions/mean_terminated_length": 423.84375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5858133810695905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0967361107468605, + "learning_rate": 4.314284803510051e-06, + "loss": -0.0074, + "num_tokens": 208704058.0, + "reward": 6.2280683517456055, + "reward_std": 0.5649706125259399, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.837443470954895, + "rewards/ngram_similarity_reward/std": 0.19172249734401703, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 474.171875, + "completions/mean_terminated_length": 474.171875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.5862609084806445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14397962391376495, + "learning_rate": 4.313087777285275e-06, + "loss": 0.0034, + "num_tokens": 208902981.0, + "reward": 2.9218571186065674, + "reward_std": 0.44614848494529724, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5156069993972778, + "rewards/ngram_similarity_reward/std": 0.34570419788360596, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 382.453125, + "completions/mean_terminated_length": 382.453125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.5867084358916984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1117420345544815, + "learning_rate": 4.311889895357385e-06, + "loss": -0.0281, + "num_tokens": 209042322.0, + "reward": 3.054821491241455, + "reward_std": 0.15742333233356476, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5548213720321655, + "rewards/ngram_similarity_reward/std": 0.35419711470603943, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 614.03125, + "completions/mean_terminated_length": 614.03125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5871559633027523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13993960618972778, + "learning_rate": 4.310691158382153e-06, + "loss": -0.0063, + "num_tokens": 209275700.0, + "reward": 3.5995821952819824, + "reward_std": 0.7862210273742676, + "rewards/accuracy_reward/mean": 2.84375, + "rewards/accuracy_reward/std": 3.0405657291412354, + "rewards/ngram_similarity_reward/mean": 0.7558322548866272, + "rewards/ngram_similarity_reward/std": 0.3166537582874298, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 452.140625, + "completions/mean_terminated_length": 452.140625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.5876034907138062, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08446729928255081, + "learning_rate": 4.3094915670158205e-06, + "loss": 0.003, + "num_tokens": 209448637.0, + "reward": 3.330838203430176, + "reward_std": 0.4240117073059082, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7370883226394653, + "rewards/ngram_similarity_reward/std": 0.304819792509079, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 432.84375, + "completions/mean_terminated_length": 432.84375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5880510181248602, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04957375302910805, + "learning_rate": 4.308291121915097e-06, + "loss": -0.0012, + "num_tokens": 209582851.0, + "reward": 4.887948989868164, + "reward_std": 0.016795890405774117, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8879486918449402, + "rewards/ngram_similarity_reward/std": 0.27080172300338745, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 617.921875, + "completions/mean_terminated_length": 617.921875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.5884985455359141, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07341840118169785, + "learning_rate": 4.307089823737158e-06, + "loss": -0.0015, + "num_tokens": 209758686.0, + "reward": 1.6291905641555786, + "reward_std": 0.14190228283405304, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.6448154449462891, + "rewards/ngram_similarity_reward/std": 0.18305432796478271, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 477.125, + "completions/mean_terminated_length": 477.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.588946072946968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10296616703271866, + "learning_rate": 4.30588767313965e-06, + "loss": -0.0146, + "num_tokens": 209898118.0, + "reward": 4.714685440063477, + "reward_std": 0.08106479048728943, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.714685320854187, + "rewards/ngram_similarity_reward/std": 0.306520938873291, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 487.75, + "completions/mean_terminated_length": 437.4193420410156, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5893936003580219, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08141181617975235, + "learning_rate": 4.304684670780679e-06, + "loss": -0.0747, + "num_tokens": 210028694.0, + "reward": 4.732090950012207, + "reward_std": 1.1350460052490234, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.4508410394191742, + "rewards/ngram_similarity_reward/std": 0.41943320631980896, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 468.109375, + "completions/mean_terminated_length": 468.109375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.5898411277690758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08218703418970108, + "learning_rate": 4.303480817318824e-06, + "loss": -0.0047, + "num_tokens": 210170541.0, + "reward": 3.772031784057617, + "reward_std": 1.4102468490600586, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6157817244529724, + "rewards/ngram_similarity_reward/std": 0.36207425594329834, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 448.953125, + "completions/mean_terminated_length": 448.953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5902886551801297, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0812632218003273, + "learning_rate": 4.302276113413127e-06, + "loss": 0.0048, + "num_tokens": 210345034.0, + "reward": 3.6715292930603027, + "reward_std": 0.7349720597267151, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7965295910835266, + "rewards/ngram_similarity_reward/std": 0.3704771399497986, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 413.8125, + "completions/mean_terminated_length": 413.8125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5907361825911837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08711085468530655, + "learning_rate": 4.301070559723097e-06, + "loss": 0.0093, + "num_tokens": 210501582.0, + "reward": 4.417667388916016, + "reward_std": 0.8771721720695496, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.8864171504974365, + "rewards/ngram_similarity_reward/std": 0.3051711618900299, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 428.40625, + "completions/mean_terminated_length": 428.40625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.5911837100022377, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09898845851421356, + "learning_rate": 4.2998641569087055e-06, + "loss": -0.0295, + "num_tokens": 210654440.0, + "reward": 3.3565735816955566, + "reward_std": 0.10630813241004944, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.8565737009048462, + "rewards/ngram_similarity_reward/std": 0.16159357130527496, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 419.75, + "completions/mean_terminated_length": 419.75, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5916312374132916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09474169462919235, + "learning_rate": 4.2986569056303914e-06, + "loss": 0.0152, + "num_tokens": 210808600.0, + "reward": 1.5890698432922363, + "reward_std": 0.4605858623981476, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.6828198432922363, + "rewards/ngram_similarity_reward/std": 0.1803705096244812, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 448.8125, + "completions/mean_terminated_length": 448.8125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.5920787648243455, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10569164901971817, + "learning_rate": 4.297448806549057e-06, + "loss": 0.0042, + "num_tokens": 210951836.0, + "reward": 4.156012058258057, + "reward_std": 0.843189537525177, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6247619986534119, + "rewards/ngram_similarity_reward/std": 0.3260897397994995, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 510.09375, + "completions/mean_terminated_length": 510.09375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.5925262922353994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06643950194120407, + "learning_rate": 4.2962398603260685e-06, + "loss": 0.0263, + "num_tokens": 211098034.0, + "reward": 6.3764495849609375, + "reward_std": 0.10241048783063889, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8764500021934509, + "rewards/ngram_similarity_reward/std": 0.1761569082736969, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 478.046875, + "completions/mean_terminated_length": 478.046875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.5929738196464533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08487922698259354, + "learning_rate": 4.295030067623258e-06, + "loss": -0.0116, + "num_tokens": 211261285.0, + "reward": 4.322232723236084, + "reward_std": 1.0263280868530273, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6972328424453735, + "rewards/ngram_similarity_reward/std": 0.24187816679477692, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 502.90625, + "completions/mean_terminated_length": 502.90625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.5934213470575073, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07794208824634552, + "learning_rate": 4.293819429102917e-06, + "loss": 0.0161, + "num_tokens": 211430095.0, + "reward": 4.888006210327148, + "reward_std": 0.5880323648452759, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.7005060911178589, + "rewards/ngram_similarity_reward/std": 0.26562538743019104, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 556.421875, + "completions/mean_terminated_length": 508.3064270019531, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.5938688744685612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09490156918764114, + "learning_rate": 4.2926079454278055e-06, + "loss": -0.0211, + "num_tokens": 211566714.0, + "reward": 2.9497318267822266, + "reward_std": 2.1695144176483154, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.126309394836426, + "rewards/ngram_similarity_reward/mean": 0.6372320652008057, + "rewards/ngram_similarity_reward/std": 0.3465478718280792, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 439.9375, + "completions/mean_terminated_length": 439.9375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5943164018796151, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09296488761901855, + "learning_rate": 4.29139561726114e-06, + "loss": -0.0155, + "num_tokens": 211746230.0, + "reward": 0.4452609121799469, + "reward_std": 0.621183454990387, + "rewards/accuracy_reward/mean": -0.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.6640108823776245, + "rewards/ngram_similarity_reward/std": 0.26765748858451843, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 563.703125, + "completions/mean_terminated_length": 563.703125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.594763929290669, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10059570521116257, + "learning_rate": 4.2901824452666025e-06, + "loss": -0.0006, + "num_tokens": 211888707.0, + "reward": 3.1512491703033447, + "reward_std": 0.14171501994132996, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6512490510940552, + "rewards/ngram_similarity_reward/std": 0.23141607642173767, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 463.609375, + "completions/mean_terminated_length": 463.609375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.5952114567017229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09031175076961517, + "learning_rate": 4.288968430108339e-06, + "loss": 0.0001, + "num_tokens": 212059626.0, + "reward": 4.033574104309082, + "reward_std": 0.9780128002166748, + "rewards/accuracy_reward/mean": 3.46875, + "rewards/accuracy_reward/std": 3.157097101211548, + "rewards/ngram_similarity_reward/mean": 0.5648245811462402, + "rewards/ngram_similarity_reward/std": 0.3023799657821655, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 517.96875, + "completions/mean_terminated_length": 517.96875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.595658984112777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07907724380493164, + "learning_rate": 4.287753572450953e-06, + "loss": 0.0591, + "num_tokens": 212239240.0, + "reward": 4.20205020904541, + "reward_std": 0.9947676658630371, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.48330050706863403, + "rewards/ngram_similarity_reward/std": 0.2970724403858185, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 565.203125, + "completions/mean_terminated_length": 517.3709716796875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.5961065115238309, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11148945987224579, + "learning_rate": 4.286537872959513e-06, + "loss": -0.0255, + "num_tokens": 212376005.0, + "reward": 5.483572959899902, + "reward_std": 0.9000318050384521, + "rewards/accuracy_reward/mean": 4.828125, + "rewards/accuracy_reward/std": 1.9359153509140015, + "rewards/ngram_similarity_reward/mean": 0.6554478406906128, + "rewards/ngram_similarity_reward/std": 0.3480670750141144, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 446.21875, + "completions/mean_terminated_length": 446.21875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.5965540389348848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09494276344776154, + "learning_rate": 4.285321332299544e-06, + "loss": -0.0225, + "num_tokens": 212503747.0, + "reward": 3.0399563312530518, + "reward_std": 1.0980381965637207, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5399561524391174, + "rewards/ngram_similarity_reward/std": 0.1877039521932602, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 402.125, + "completions/mean_terminated_length": 402.125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5970015663459387, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10471401363611221, + "learning_rate": 4.284103951137036e-06, + "loss": -0.0133, + "num_tokens": 212656683.0, + "reward": 6.392208099365234, + "reward_std": 0.1683715581893921, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.892208456993103, + "rewards/ngram_similarity_reward/std": 0.23434390127658844, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 474.921875, + "completions/mean_terminated_length": 449.952392578125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.5974490937569926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09312959760427475, + "learning_rate": 4.2828857301384355e-06, + "loss": 0.0336, + "num_tokens": 212838998.0, + "reward": 3.833740711212158, + "reward_std": 2.504103183746338, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5837405920028687, + "rewards/ngram_similarity_reward/std": 0.33722445368766785, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 403.84375, + "completions/mean_terminated_length": 403.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.5978966211680465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10815006494522095, + "learning_rate": 4.281666669970652e-06, + "loss": -0.0152, + "num_tokens": 213023052.0, + "reward": 4.467321395874023, + "reward_std": 0.10678227245807648, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.46732163429260254, + "rewards/ngram_similarity_reward/std": 0.24562959372997284, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 484.578125, + "completions/mean_terminated_length": 434.1451416015625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.5983441485791005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10885749757289886, + "learning_rate": 4.280446771301051e-06, + "loss": -0.0005, + "num_tokens": 213146865.0, + "reward": 4.570272445678711, + "reward_std": 0.7078070640563965, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.85152268409729, + "rewards/ngram_similarity_reward/std": 0.2377517968416214, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 495.953125, + "completions/mean_terminated_length": 495.953125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5987916759901544, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06886874884366989, + "learning_rate": 4.279226034797459e-06, + "loss": -0.0594, + "num_tokens": 213301934.0, + "reward": 3.3245933055877686, + "reward_std": 0.5592687726020813, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6370932459831238, + "rewards/ngram_similarity_reward/std": 0.2646704316139221, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 430.671875, + "completions/mean_terminated_length": 430.671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5992392034012083, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07862003147602081, + "learning_rate": 4.278004461128163e-06, + "loss": -0.0119, + "num_tokens": 213425081.0, + "reward": 4.431199550628662, + "reward_std": 0.7673516869544983, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.8061997890472412, + "rewards/ngram_similarity_reward/std": 0.307853102684021, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 459.03125, + "completions/mean_terminated_length": 459.03125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.5996867308122622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07831700146198273, + "learning_rate": 4.276782050961905e-06, + "loss": -0.0355, + "num_tokens": 213607291.0, + "reward": 6.349282741546631, + "reward_std": 0.14734327793121338, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8492826223373413, + "rewards/ngram_similarity_reward/std": 0.25453436374664307, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 403.359375, + "completions/mean_terminated_length": 403.359375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.6001342582233162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08558699488639832, + "learning_rate": 4.2755588049678845e-06, + "loss": -0.0252, + "num_tokens": 213735954.0, + "reward": 4.715620994567871, + "reward_std": 0.10598556697368622, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7156206369400024, + "rewards/ngram_similarity_reward/std": 0.1637507677078247, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 562.46875, + "completions/mean_terminated_length": 562.46875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.6005817856343701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06608100980520248, + "learning_rate": 4.274334723815763e-06, + "loss": 0.005, + "num_tokens": 213871072.0, + "reward": 5.264054775238037, + "reward_std": 1.5914874076843262, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.7953047156333923, + "rewards/ngram_similarity_reward/std": 0.29335033893585205, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 523.109375, + "completions/mean_terminated_length": 523.109375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.6010293130454241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07100894302129745, + "learning_rate": 4.273109808175655e-06, + "loss": 0.008, + "num_tokens": 214082583.0, + "reward": 1.8311067819595337, + "reward_std": 0.6609352231025696, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6436067819595337, + "rewards/ngram_similarity_reward/std": 0.268633633852005, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 576.75, + "completions/mean_terminated_length": 576.75, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.601476840456478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09211838990449905, + "learning_rate": 4.271884058718133e-06, + "loss": 0.0255, + "num_tokens": 214230647.0, + "reward": 2.031980276107788, + "reward_std": 1.8648428916931152, + "rewards/accuracy_reward/mean": 1.53125, + "rewards/accuracy_reward/std": 3.0130341053009033, + "rewards/ngram_similarity_reward/mean": 0.5007302761077881, + "rewards/ngram_similarity_reward/std": 0.1377912163734436, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 573.265625, + "completions/mean_terminated_length": 549.857177734375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.6019243678675319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1130412220954895, + "learning_rate": 4.270657476114227e-06, + "loss": 0.0084, + "num_tokens": 214439080.0, + "reward": 3.2013514041900635, + "reward_std": 0.5560939311981201, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6076014041900635, + "rewards/ngram_similarity_reward/std": 0.3042972981929779, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 434.265625, + "completions/mean_terminated_length": 434.265625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.6023718952785858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07032214850187302, + "learning_rate": 4.269430061035423e-06, + "loss": -0.0116, + "num_tokens": 214577033.0, + "reward": 6.2450032234191895, + "reward_std": 0.14689236879348755, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7450035810470581, + "rewards/ngram_similarity_reward/std": 0.4236237108707428, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 494.703125, + "completions/mean_terminated_length": 494.703125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6028194226896397, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07221122086048126, + "learning_rate": 4.26820181415366e-06, + "loss": 0.0114, + "num_tokens": 214700678.0, + "reward": 3.9844794273376465, + "reward_std": 1.3448883295059204, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.640729546546936, + "rewards/ngram_similarity_reward/std": 0.4196074306964874, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 438.375, + "completions/mean_terminated_length": 438.375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6032669501006936, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07972569018602371, + "learning_rate": 4.266972736141337e-06, + "loss": -0.0108, + "num_tokens": 214825182.0, + "reward": 3.4212043285369873, + "reward_std": 0.5146507620811462, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8274544477462769, + "rewards/ngram_similarity_reward/std": 0.31825268268585205, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 434.1875, + "completions/mean_terminated_length": 434.1875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.6037144775117476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10139903426170349, + "learning_rate": 4.2657428276713025e-06, + "loss": -0.0588, + "num_tokens": 214964810.0, + "reward": 4.542989730834961, + "reward_std": 1.9339182376861572, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.9179896116256714, + "rewards/ngram_similarity_reward/std": 0.23786461353302002, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 466.921875, + "completions/mean_terminated_length": 466.921875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.6041620049228015, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08900474011898041, + "learning_rate": 4.264512089416864e-06, + "loss": 0.0183, + "num_tokens": 215158261.0, + "reward": 3.9474077224731445, + "reward_std": 0.9407780170440674, + "rewards/accuracy_reward/mean": 3.140625, + "rewards/accuracy_reward/std": 2.9727182388305664, + "rewards/ngram_similarity_reward/mean": 0.8067828416824341, + "rewards/ngram_similarity_reward/std": 0.3917195498943329, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 496.171875, + "completions/mean_terminated_length": 496.171875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.6046095323338555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08319786190986633, + "learning_rate": 4.263280522051784e-06, + "loss": 0.0029, + "num_tokens": 215324656.0, + "reward": 4.686700344085693, + "reward_std": 0.5952214598655701, + "rewards/accuracy_reward/mean": 4.171875, + "rewards/accuracy_reward/std": 2.5326733589172363, + "rewards/ngram_similarity_reward/mean": 0.5148252248764038, + "rewards/ngram_similarity_reward/std": 0.3073686361312866, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 459.1875, + "completions/mean_terminated_length": 459.1875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6050570597449094, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0919172465801239, + "learning_rate": 4.262048126250274e-06, + "loss": -0.0199, + "num_tokens": 215483900.0, + "reward": 5.952759742736816, + "reward_std": 0.4562895894050598, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.5465095639228821, + "rewards/ngram_similarity_reward/std": 0.1557082086801529, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 386.296875, + "completions/mean_terminated_length": 386.296875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6055045871559633, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10624445974826813, + "learning_rate": 4.260814902687001e-06, + "loss": 0.0154, + "num_tokens": 215644111.0, + "reward": 4.7336578369140625, + "reward_std": 0.19047802686691284, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7336578369140625, + "rewards/ngram_similarity_reward/std": 0.38012969493865967, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 453.0625, + "completions/mean_terminated_length": 453.0625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6059521145670173, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09647325426340103, + "learning_rate": 4.259580852037089e-06, + "loss": 0.0303, + "num_tokens": 215809331.0, + "reward": 3.023860216140747, + "reward_std": 0.9079327583312988, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6176100969314575, + "rewards/ngram_similarity_reward/std": 0.2777595520019531, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 421.6875, + "completions/mean_terminated_length": 421.6875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.6063996419780712, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0940016433596611, + "learning_rate": 4.258345974976111e-06, + "loss": 0.0039, + "num_tokens": 215965631.0, + "reward": 3.2174174785614014, + "reward_std": 0.8066022992134094, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7174174189567566, + "rewards/ngram_similarity_reward/std": 0.2216615527868271, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 398.921875, + "completions/mean_terminated_length": 398.921875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.6068471693891251, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09868673235177994, + "learning_rate": 4.257110272180091e-06, + "loss": -0.0161, + "num_tokens": 216138090.0, + "reward": 4.321117401123047, + "reward_std": 0.7568840980529785, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6961173415184021, + "rewards/ngram_similarity_reward/std": 0.2414599359035492, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 480.140625, + "completions/mean_terminated_length": 480.140625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.607294696800179, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08821909129619598, + "learning_rate": 4.255873744325509e-06, + "loss": -0.0002, + "num_tokens": 216324867.0, + "reward": 6.164969444274902, + "reward_std": 0.12031973153352737, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6649699211120605, + "rewards/ngram_similarity_reward/std": 0.20555303990840912, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 628.125, + "completions/mean_terminated_length": 481.2413635253906, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.6077422242112329, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14948788285255432, + "learning_rate": 4.254636392089293e-06, + "loss": -0.1383, + "num_tokens": 216485579.0, + "reward": 4.539266586303711, + "reward_std": 1.2547316551208496, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.6486419439315796, + "rewards/ngram_similarity_reward/std": 0.44756120443344116, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 498.9375, + "completions/mean_terminated_length": 498.9375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.6081897516222868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07231821864843369, + "learning_rate": 4.253398216148826e-06, + "loss": -0.014, + "num_tokens": 216622135.0, + "reward": 4.751773834228516, + "reward_std": 0.11481022834777832, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7517737150192261, + "rewards/ngram_similarity_reward/std": 0.30630967020988464, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 471.796875, + "completions/mean_terminated_length": 471.796875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.6086372790333408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09079179167747498, + "learning_rate": 4.25215921718194e-06, + "loss": -0.0029, + "num_tokens": 216787658.0, + "reward": 3.768434524536133, + "reward_std": 1.1472851037979126, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6121848225593567, + "rewards/ngram_similarity_reward/std": 0.3595779836177826, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 477.421875, + "completions/mean_terminated_length": 477.421875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.6090848064443947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09562075883150101, + "learning_rate": 4.250919395866917e-06, + "loss": -0.008, + "num_tokens": 216939541.0, + "reward": 2.0378975868225098, + "reward_std": 1.4186642169952393, + "rewards/accuracy_reward/mean": 1.4375, + "rewards/accuracy_reward/std": 2.8667497634887695, + "rewards/ngram_similarity_reward/mean": 0.6003977060317993, + "rewards/ngram_similarity_reward/std": 0.17121119797229767, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 463.984375, + "completions/mean_terminated_length": 463.984375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.6095323338554487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09416069835424423, + "learning_rate": 4.249678752882488e-06, + "loss": 0.0243, + "num_tokens": 217049812.0, + "reward": 4.938436031341553, + "reward_std": 0.18189571797847748, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9384360313415527, + "rewards/ngram_similarity_reward/std": 0.2724663317203522, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 516.859375, + "completions/mean_terminated_length": 516.859375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.6099798612665026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0826227068901062, + "learning_rate": 4.24843728890784e-06, + "loss": -0.0194, + "num_tokens": 217225803.0, + "reward": 3.1972994804382324, + "reward_std": 0.837207555770874, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6972993612289429, + "rewards/ngram_similarity_reward/std": 0.298595130443573, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 391.28125, + "completions/mean_terminated_length": 391.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.6104273886775565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09286059439182281, + "learning_rate": 4.247195004622601e-06, + "loss": -0.0084, + "num_tokens": 217351901.0, + "reward": 5.607362747192383, + "reward_std": 0.8946105241775513, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.6698628067970276, + "rewards/ngram_similarity_reward/std": 0.2770031988620758, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 418.671875, + "completions/mean_terminated_length": 418.671875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.6108749160886104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09607899934053421, + "learning_rate": 4.245951900706854e-06, + "loss": -0.0446, + "num_tokens": 217496344.0, + "reward": 2.730788230895996, + "reward_std": 1.7564964294433594, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6995381712913513, + "rewards/ngram_similarity_reward/std": 0.39579570293426514, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 457.09375, + "completions/mean_terminated_length": 457.09375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6113224434996644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1026829332113266, + "learning_rate": 4.244707977841129e-06, + "loss": -0.0009, + "num_tokens": 217687294.0, + "reward": 1.8190507888793945, + "reward_std": 0.49981939792633057, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7253009080886841, + "rewards/ngram_similarity_reward/std": 0.1935221403837204, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 442.453125, + "completions/mean_terminated_length": 442.453125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.6117699709107183, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08293341845273972, + "learning_rate": 4.243463236706404e-06, + "loss": 0.0288, + "num_tokens": 217851771.0, + "reward": 3.426107883453369, + "reward_std": 0.4723675549030304, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8323577642440796, + "rewards/ngram_similarity_reward/std": 0.3235006332397461, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 446.609375, + "completions/mean_terminated_length": 446.609375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6122174983217722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13072438538074493, + "learning_rate": 4.242217677984104e-06, + "loss": 0.0042, + "num_tokens": 218055330.0, + "reward": 1.628633737564087, + "reward_std": 1.9026405811309814, + "rewards/accuracy_reward/mean": 1.078125, + "rewards/accuracy_reward/std": 2.683309316635132, + "rewards/ngram_similarity_reward/mean": 0.5505087375640869, + "rewards/ngram_similarity_reward/std": 0.19984152913093567, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 423.765625, + "completions/mean_terminated_length": 423.765625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6126650257328261, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05521805211901665, + "learning_rate": 4.2409713023561045e-06, + "loss": -0.0124, + "num_tokens": 218204051.0, + "reward": 4.729894161224365, + "reward_std": 0.12251585721969604, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7298941612243652, + "rewards/ngram_similarity_reward/std": 0.3434084355831146, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 517.515625, + "completions/mean_terminated_length": 517.515625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.61311255314388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06719180196523666, + "learning_rate": 4.239724110504725e-06, + "loss": -0.0246, + "num_tokens": 218367764.0, + "reward": 4.951564788818359, + "reward_std": 0.5328652858734131, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 1.0453150272369385, + "rewards/ngram_similarity_reward/std": 0.22503575682640076, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 430.796875, + "completions/mean_terminated_length": 430.796875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6135600805549339, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07877121865749359, + "learning_rate": 4.238476103112734e-06, + "loss": 0.0225, + "num_tokens": 218514135.0, + "reward": 3.2725768089294434, + "reward_std": 0.972164511680603, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8663267493247986, + "rewards/ngram_similarity_reward/std": 0.17302057147026062, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 524.265625, + "completions/mean_terminated_length": 524.265625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.614007607965988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09043522924184799, + "learning_rate": 4.237227280863345e-06, + "loss": -0.0094, + "num_tokens": 218713976.0, + "reward": 5.182313919067383, + "reward_std": 0.796884298324585, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.807313859462738, + "rewards/ngram_similarity_reward/std": 0.25411897897720337, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 584.484375, + "completions/mean_terminated_length": 537.274169921875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.6144551353770419, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08146406710147858, + "learning_rate": 4.235977644440219e-06, + "loss": -0.0147, + "num_tokens": 218857367.0, + "reward": 5.981154441833496, + "reward_std": 0.5761600732803345, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.6686544418334961, + "rewards/ngram_similarity_reward/std": 0.3249327838420868, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 542.3125, + "completions/mean_terminated_length": 542.3125, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.6149026627880958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0895610898733139, + "learning_rate": 4.234727194527462e-06, + "loss": 0.0409, + "num_tokens": 218992123.0, + "reward": 4.557622909545898, + "reward_std": 0.7898719906806946, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.557623028755188, + "rewards/ngram_similarity_reward/std": 0.3491251766681671, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 494.109375, + "completions/mean_terminated_length": 494.109375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6153501901991497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08254073560237885, + "learning_rate": 4.233475931809626e-06, + "loss": 0.0243, + "num_tokens": 219199634.0, + "reward": 5.252430438995361, + "reward_std": 0.8898434638977051, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.6899309158325195, + "rewards/ngram_similarity_reward/std": 0.3919968903064728, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 486.015625, + "completions/mean_terminated_length": 486.015625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6157977176102036, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0924331396818161, + "learning_rate": 4.232223856971705e-06, + "loss": 0.0518, + "num_tokens": 219412211.0, + "reward": 1.6161731481552124, + "reward_std": 0.0947863757610321, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6161731481552124, + "rewards/ngram_similarity_reward/std": 0.30199921131134033, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1607.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 478.03125, + "completions/mean_terminated_length": 478.03125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6162452450212575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09547276049852371, + "learning_rate": 4.230970970699143e-06, + "loss": -0.0305, + "num_tokens": 219547125.0, + "reward": 3.140580177307129, + "reward_std": 0.0856446921825409, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6405801177024841, + "rewards/ngram_similarity_reward/std": 0.2989759147167206, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 584.90625, + "completions/mean_terminated_length": 584.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.6166927724323115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07841598987579346, + "learning_rate": 4.229717273677823e-06, + "loss": -0.0298, + "num_tokens": 219699087.0, + "reward": 3.326608657836914, + "reward_std": 0.7123034596443176, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.4516085982322693, + "rewards/ngram_similarity_reward/std": 0.28527435660362244, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 530.78125, + "completions/mean_terminated_length": 530.78125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6171402998433654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08135497570037842, + "learning_rate": 4.228462766594075e-06, + "loss": -0.0011, + "num_tokens": 219858593.0, + "reward": 3.2081475257873535, + "reward_std": 0.4310663640499115, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6143976449966431, + "rewards/ngram_similarity_reward/std": 0.29245811700820923, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 493.625, + "completions/mean_terminated_length": 493.625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.6175878272544193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1111777052283287, + "learning_rate": 4.22720745013467e-06, + "loss": 0.02, + "num_tokens": 220012825.0, + "reward": 5.311241626739502, + "reward_std": 1.3066898584365845, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.561241626739502, + "rewards/ngram_similarity_reward/std": 0.144589364528656, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 351.859375, + "completions/mean_terminated_length": 351.859375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6180353546654732, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07743772119283676, + "learning_rate": 4.225951324986826e-06, + "loss": -0.0064, + "num_tokens": 220124000.0, + "reward": 2.014042615890503, + "reward_std": 0.5598815083503723, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.8265426158905029, + "rewards/ngram_similarity_reward/std": 0.26197633147239685, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 591.046875, + "completions/mean_terminated_length": 591.046875, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.6184828820765272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09611193090677261, + "learning_rate": 4.2246943918382e-06, + "loss": -0.0242, + "num_tokens": 220326835.0, + "reward": 3.06565260887146, + "reward_std": 0.4496273994445801, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6594024896621704, + "rewards/ngram_similarity_reward/std": 0.18902292847633362, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 515.78125, + "completions/mean_terminated_length": 515.78125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.6189304094875812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07921776920557022, + "learning_rate": 4.223436651376892e-06, + "loss": 0.0166, + "num_tokens": 220566773.0, + "reward": 4.603166103363037, + "reward_std": 0.44305115938186646, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.5094163417816162, + "rewards/ngram_similarity_reward/std": 0.1663961410522461, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 462.15625, + "completions/mean_terminated_length": 462.15625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6193779368986351, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0742671862244606, + "learning_rate": 4.222178104291445e-06, + "loss": -0.0078, + "num_tokens": 220772879.0, + "reward": 4.781156539916992, + "reward_std": 0.1699088215827942, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7811561822891235, + "rewards/ngram_similarity_reward/std": 0.2659415304660797, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 487.75, + "completions/mean_terminated_length": 487.75, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.619825464309689, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08986928313970566, + "learning_rate": 4.220918751270843e-06, + "loss": -0.0167, + "num_tokens": 220899711.0, + "reward": 2.0840022563934326, + "reward_std": 1.6939761638641357, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.8965021371841431, + "rewards/ngram_similarity_reward/std": 0.20797498524188995, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 530.765625, + "completions/mean_terminated_length": 530.765625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.6202729917207429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07207165658473969, + "learning_rate": 4.219658593004512e-06, + "loss": 0.0213, + "num_tokens": 221057792.0, + "reward": 1.5318541526794434, + "reward_std": 0.8543497920036316, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.5474792718887329, + "rewards/ngram_similarity_reward/std": 0.28610652685165405, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 390.5, + "completions/mean_terminated_length": 390.5, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.6207205191317968, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09263911843299866, + "learning_rate": 4.2183976301823164e-06, + "loss": -0.0364, + "num_tokens": 221180448.0, + "reward": 2.463513135910034, + "reward_std": 0.8897101879119873, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.713513195514679, + "rewards/ngram_similarity_reward/std": 0.3683803081512451, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 441.359375, + "completions/mean_terminated_length": 441.359375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.6211680465428507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08527567982673645, + "learning_rate": 4.217135863494564e-06, + "loss": 0.0011, + "num_tokens": 221346135.0, + "reward": 6.123813152313232, + "reward_std": 0.12822559475898743, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6238132119178772, + "rewards/ngram_similarity_reward/std": 0.3540512025356293, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 607.703125, + "completions/mean_terminated_length": 607.703125, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.6216155739539047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07611112296581268, + "learning_rate": 4.215873293632003e-06, + "loss": 0.0155, + "num_tokens": 221528196.0, + "reward": 3.1610960960388184, + "reward_std": 2.0345959663391113, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.8485962152481079, + "rewards/ngram_similarity_reward/std": 0.2572100758552551, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 494.921875, + "completions/mean_terminated_length": 494.921875, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.6220631013649586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08504889160394669, + "learning_rate": 4.214609921285818e-06, + "loss": 0.01, + "num_tokens": 221653263.0, + "reward": 3.1183433532714844, + "reward_std": 0.13722378015518188, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6183434128761292, + "rewards/ngram_similarity_reward/std": 0.28606945276260376, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 545.40625, + "completions/mean_terminated_length": 545.40625, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.6225106287760125, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08638472110033035, + "learning_rate": 4.2133457471476345e-06, + "loss": 0.0115, + "num_tokens": 221812377.0, + "reward": 2.7754156589508057, + "reward_std": 1.5384128093719482, + "rewards/accuracy_reward/mean": 2.015625, + "rewards/accuracy_reward/std": 3.00260329246521, + "rewards/ngram_similarity_reward/mean": 0.7597907185554504, + "rewards/ngram_similarity_reward/std": 0.31502005457878113, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 433.03125, + "completions/mean_terminated_length": 433.03125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6229581561870664, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07527010142803192, + "learning_rate": 4.2120807719095166e-06, + "loss": 0.0026, + "num_tokens": 221934555.0, + "reward": 6.035556793212891, + "reward_std": 0.10023511946201324, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5355567336082458, + "rewards/ngram_similarity_reward/std": 0.2512805461883545, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 436.96875, + "completions/mean_terminated_length": 436.96875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.6234056835981204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09910791367292404, + "learning_rate": 4.2108149962639695e-06, + "loss": -0.0045, + "num_tokens": 222097513.0, + "reward": 6.168093204498291, + "reward_std": 0.47341105341911316, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.761843204498291, + "rewards/ngram_similarity_reward/std": 0.2521355152130127, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 494.171875, + "completions/mean_terminated_length": 494.171875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.6238532110091743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09825079888105392, + "learning_rate": 4.209548420903934e-06, + "loss": -0.0026, + "num_tokens": 222248404.0, + "reward": 3.4757986068725586, + "reward_std": 1.0716943740844727, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.5070487260818481, + "rewards/ngram_similarity_reward/std": 0.28137826919555664, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 495.625, + "completions/mean_terminated_length": 470.9841613769531, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.6243007384202283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08490008115768433, + "learning_rate": 4.208281046522788e-06, + "loss": -0.0214, + "num_tokens": 222377292.0, + "reward": 4.1201958656311035, + "reward_std": 0.8970295190811157, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.7764459848403931, + "rewards/ngram_similarity_reward/std": 0.2914271056652069, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 420.53125, + "completions/mean_terminated_length": 420.53125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.6247482658312822, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09060279279947281, + "learning_rate": 4.207012873814349e-06, + "loss": 0.0085, + "num_tokens": 222572638.0, + "reward": 4.718295574188232, + "reward_std": 0.0775412917137146, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7182955145835876, + "rewards/ngram_similarity_reward/std": 0.30478185415267944, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 392.390625, + "completions/mean_terminated_length": 392.390625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6251957932423361, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11478184163570404, + "learning_rate": 4.20574390347287e-06, + "loss": -0.0186, + "num_tokens": 222742759.0, + "reward": 4.449161529541016, + "reward_std": 0.7801280617713928, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.8241614103317261, + "rewards/ngram_similarity_reward/std": 0.4052240550518036, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 441.484375, + "completions/mean_terminated_length": 441.484375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.62564332065339, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0892636626958847, + "learning_rate": 4.2044741361930425e-06, + "loss": 0.0056, + "num_tokens": 222887830.0, + "reward": 3.9365570545196533, + "reward_std": 0.8537122011184692, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.9678070545196533, + "rewards/ngram_similarity_reward/std": 0.15813502669334412, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 476.828125, + "completions/mean_terminated_length": 476.828125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6260908480644439, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07625213265419006, + "learning_rate": 4.203203572669992e-06, + "loss": 0.012, + "num_tokens": 223017387.0, + "reward": 4.069248676300049, + "reward_std": 0.9240694046020508, + "rewards/accuracy_reward/mean": 3.453125, + "rewards/accuracy_reward/std": 2.962354898452759, + "rewards/ngram_similarity_reward/mean": 0.6161236763000488, + "rewards/ngram_similarity_reward/std": 0.24392534792423248, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 453.90625, + "completions/mean_terminated_length": 453.90625, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.6265383754754978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0871860533952713, + "learning_rate": 4.201932213599281e-06, + "loss": 0.0197, + "num_tokens": 223158677.0, + "reward": 4.026385307312012, + "reward_std": 0.9551817774772644, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6826354265213013, + "rewards/ngram_similarity_reward/std": 0.33283618092536926, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 457.34375, + "completions/mean_terminated_length": 457.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6269859028865518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08433037996292114, + "learning_rate": 4.200660059676908e-06, + "loss": -0.0166, + "num_tokens": 223337515.0, + "reward": 5.095659255981445, + "reward_std": 1.2406134605407715, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7206593751907349, + "rewards/ngram_similarity_reward/std": 0.3060435652732849, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 409.21875, + "completions/mean_terminated_length": 409.21875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.6274334302976057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10011816769838333, + "learning_rate": 4.199387111599305e-06, + "loss": -0.0469, + "num_tokens": 223519049.0, + "reward": 4.542868614196777, + "reward_std": 0.11508417129516602, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5428687334060669, + "rewards/ngram_similarity_reward/std": 0.3072071075439453, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 478.453125, + "completions/mean_terminated_length": 478.453125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.6278809577086597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06337883323431015, + "learning_rate": 4.198113370063342e-06, + "loss": -0.0013, + "num_tokens": 223674614.0, + "reward": 5.18435001373291, + "reward_std": 1.8982305526733398, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.8093503713607788, + "rewards/ngram_similarity_reward/std": 0.3255639672279358, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 490.171875, + "completions/mean_terminated_length": 439.9193420410156, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6283284851197136, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10022468119859695, + "learning_rate": 4.196838835766318e-06, + "loss": 0.0202, + "num_tokens": 223784241.0, + "reward": 2.7793374061584473, + "reward_std": 1.5091160535812378, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6543375849723816, + "rewards/ngram_similarity_reward/std": 0.2150864601135254, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 484.859375, + "completions/mean_terminated_length": 460.0476379394531, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6287760125307675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08851854503154755, + "learning_rate": 4.1955635094059725e-06, + "loss": -0.0504, + "num_tokens": 223979768.0, + "reward": 3.8104681968688965, + "reward_std": 1.2830171585083008, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.4667181074619293, + "rewards/ngram_similarity_reward/std": 0.28389641642570496, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 484.109375, + "completions/mean_terminated_length": 484.109375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.6292235399418215, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07858376204967499, + "learning_rate": 4.1942873916804736e-06, + "loss": -0.0391, + "num_tokens": 224116303.0, + "reward": 4.786311149597168, + "reward_std": 0.15001791715621948, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7863106727600098, + "rewards/ngram_similarity_reward/std": 0.24081631004810333, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 616.734375, + "completions/mean_terminated_length": 594.0159301757812, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.6296710673528754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09792505204677582, + "learning_rate": 4.193010483288424e-06, + "loss": -0.0839, + "num_tokens": 224303886.0, + "reward": 3.317302703857422, + "reward_std": 0.6904056668281555, + "rewards/accuracy_reward/mean": 2.640625, + "rewards/accuracy_reward/std": 3.075077533721924, + "rewards/ngram_similarity_reward/mean": 0.676677942276001, + "rewards/ngram_similarity_reward/std": 0.278852641582489, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 500.9375, + "completions/mean_terminated_length": 500.9375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.6301185947639293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09619695693254471, + "learning_rate": 4.191732784928862e-06, + "loss": -0.02, + "num_tokens": 224436426.0, + "reward": 3.7580008506774902, + "reward_std": 0.8680193424224854, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.6955007314682007, + "rewards/ngram_similarity_reward/std": 0.18768362700939178, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 411.078125, + "completions/mean_terminated_length": 411.078125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.6305661221749832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10975297540426254, + "learning_rate": 4.190454297301254e-06, + "loss": -0.0123, + "num_tokens": 224555967.0, + "reward": 3.212692975997925, + "reward_std": 0.2119302749633789, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.7439426779747009, + "rewards/ngram_similarity_reward/std": 0.3938526213169098, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 389.703125, + "completions/mean_terminated_length": 389.703125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.6310136495860371, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14062047004699707, + "learning_rate": 4.189175021105499e-06, + "loss": -0.0113, + "num_tokens": 224741740.0, + "reward": 5.415818214416504, + "reward_std": 0.9347876906394958, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.5720680952072144, + "rewards/ngram_similarity_reward/std": 0.4173468351364136, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 772.265625, + "completions/mean_terminated_length": 477.86541748046875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.631461176997091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13017436861991882, + "learning_rate": 4.187894957041933e-06, + "loss": -0.0176, + "num_tokens": 224883661.0, + "reward": 2.0202715396881104, + "reward_std": 0.8831048011779785, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6452715396881104, + "rewards/ngram_similarity_reward/std": 0.41355928778648376, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 643.875, + "completions/mean_terminated_length": 598.5806274414062, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.631908704408145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06196107342839241, + "learning_rate": 4.186614105811317e-06, + "loss": 0.0645, + "num_tokens": 225031029.0, + "reward": 3.5483577251434326, + "reward_std": 1.4880199432373047, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7671077251434326, + "rewards/ngram_similarity_reward/std": 0.30070042610168457, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 428.84375, + "completions/mean_terminated_length": 428.84375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.632356231819199, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07660739868879318, + "learning_rate": 4.185332468114844e-06, + "loss": 0.0095, + "num_tokens": 225170971.0, + "reward": 3.1542890071868896, + "reward_std": 0.09491442143917084, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6542891263961792, + "rewards/ngram_similarity_reward/std": 0.20330768823623657, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 493.921875, + "completions/mean_terminated_length": 493.921875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.6328037592302529, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09530206024646759, + "learning_rate": 4.184050044654142e-06, + "loss": 0.002, + "num_tokens": 225292902.0, + "reward": 4.7313737869262695, + "reward_std": 0.144403874874115, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7313735485076904, + "rewards/ngram_similarity_reward/std": 0.2395082414150238, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 420.109375, + "completions/mean_terminated_length": 420.109375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.6332512866413068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09896707534790039, + "learning_rate": 4.182766836131265e-06, + "loss": 0.0097, + "num_tokens": 225479677.0, + "reward": 6.169071674346924, + "reward_std": 0.5309245586395264, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.7784466743469238, + "rewards/ngram_similarity_reward/std": 0.2396049052476883, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 483.59375, + "completions/mean_terminated_length": 483.59375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.6336988140523607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07389920204877853, + "learning_rate": 4.181482843248697e-06, + "loss": -0.0102, + "num_tokens": 225630275.0, + "reward": 5.170687198638916, + "reward_std": 0.7861341238021851, + "rewards/accuracy_reward/mean": 4.375, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7956870794296265, + "rewards/ngram_similarity_reward/std": 0.2827925384044647, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 529.90625, + "completions/mean_terminated_length": 529.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.6341463414634146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07031376659870148, + "learning_rate": 4.180198066709354e-06, + "loss": 0.0024, + "num_tokens": 225791293.0, + "reward": 6.244063854217529, + "reward_std": 0.11442062258720398, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7440634369850159, + "rewards/ngram_similarity_reward/std": 0.34232163429260254, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 438.3125, + "completions/mean_terminated_length": 438.3125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.6345938688744686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12037988752126694, + "learning_rate": 4.178912507216577e-06, + "loss": 0.0319, + "num_tokens": 225940417.0, + "reward": 3.188974618911743, + "reward_std": 0.4819395840167999, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.7827244997024536, + "rewards/ngram_similarity_reward/std": 0.28460362553596497, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 511.234375, + "completions/mean_terminated_length": 511.234375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.6350413962855225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10589804500341415, + "learning_rate": 4.17762616547414e-06, + "loss": 0.0219, + "num_tokens": 226085168.0, + "reward": 2.5460028648376465, + "reward_std": 1.662891149520874, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.125000238418579, + "rewards/ngram_similarity_reward/mean": 0.43662798404693604, + "rewards/ngram_similarity_reward/std": 0.3201160132884979, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 457.375, + "completions/mean_terminated_length": 457.375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.6354889236965764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08328361809253693, + "learning_rate": 4.176339042186242e-06, + "loss": -0.0262, + "num_tokens": 226249320.0, + "reward": 4.569077491760254, + "reward_std": 0.9987720251083374, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.66282719373703, + "rewards/ngram_similarity_reward/std": 0.17006200551986694, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 455.21875, + "completions/mean_terminated_length": 455.21875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6359364511076303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08780533075332642, + "learning_rate": 4.175051138057512e-06, + "loss": 0.0133, + "num_tokens": 226405046.0, + "reward": 2.935864210128784, + "reward_std": 0.4295510947704315, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.34211423993110657, + "rewards/ngram_similarity_reward/std": 0.15545785427093506, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 373.9375, + "completions/mean_terminated_length": 373.9375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6363839785186842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11097009479999542, + "learning_rate": 4.173762453793004e-06, + "loss": 0.0066, + "num_tokens": 226561602.0, + "reward": 4.57109260559082, + "reward_std": 1.2318971157073975, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6648430824279785, + "rewards/ngram_similarity_reward/std": 0.3468696177005768, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 467.015625, + "completions/mean_terminated_length": 467.015625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.6368315059297383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09840057790279388, + "learning_rate": 4.172472990098201e-06, + "loss": -0.0136, + "num_tokens": 226692723.0, + "reward": 5.230722427368164, + "reward_std": 0.8436187505722046, + "rewards/accuracy_reward/mean": 4.53125, + "rewards/accuracy_reward/std": 2.27455735206604, + "rewards/ngram_similarity_reward/mean": 0.6994720697402954, + "rewards/ngram_similarity_reward/std": 0.3748303949832916, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 476.546875, + "completions/mean_terminated_length": 476.546875, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.6372790333407922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09602098912000656, + "learning_rate": 4.171182747679013e-06, + "loss": 0.0086, + "num_tokens": 226869206.0, + "reward": 4.479099273681641, + "reward_std": 0.679693341255188, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7603493332862854, + "rewards/ngram_similarity_reward/std": 0.23113702237606049, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 480.1875, + "completions/mean_terminated_length": 455.3016052246094, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.6377265607518461, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08428952097892761, + "learning_rate": 4.169891727241775e-06, + "loss": -0.047, + "num_tokens": 227013986.0, + "reward": 1.4020637273788452, + "reward_std": 0.5320973992347717, + "rewards/accuracy_reward/mean": 0.890625, + "rewards/accuracy_reward/std": 2.5734739303588867, + "rewards/ngram_similarity_reward/mean": 0.5114387273788452, + "rewards/ngram_similarity_reward/std": 0.2666884958744049, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 510.0625, + "completions/mean_terminated_length": 510.0625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6381740881629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08204381167888641, + "learning_rate": 4.168599929493249e-06, + "loss": 0.0165, + "num_tokens": 227161734.0, + "reward": 1.611677885055542, + "reward_std": 0.07850028574466705, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6116780042648315, + "rewards/ngram_similarity_reward/std": 0.27441543340682983, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 447.28125, + "completions/mean_terminated_length": 447.28125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.6386216155739539, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11197412759065628, + "learning_rate": 4.16730735514062e-06, + "loss": 0.0244, + "num_tokens": 227332968.0, + "reward": 4.392323017120361, + "reward_std": 2.07807993888855, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6735729575157166, + "rewards/ngram_similarity_reward/std": 0.31565147638320923, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 457.984375, + "completions/mean_terminated_length": 457.984375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.6390691429850078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10292767733335495, + "learning_rate": 4.166014004891504e-06, + "loss": 0.0246, + "num_tokens": 227463543.0, + "reward": 2.124846935272217, + "reward_std": 1.1881186962127686, + "rewards/accuracy_reward/mean": 1.4375, + "rewards/accuracy_reward/std": 2.8667497634887695, + "rewards/ngram_similarity_reward/mean": 0.687346875667572, + "rewards/ngram_similarity_reward/std": 0.1769050657749176, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 420.515625, + "completions/mean_terminated_length": 420.515625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6395166703960617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09785809367895126, + "learning_rate": 4.164719879453934e-06, + "loss": 0.0257, + "num_tokens": 227646712.0, + "reward": 4.452017307281494, + "reward_std": 0.5627508163452148, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6395174264907837, + "rewards/ngram_similarity_reward/std": 0.26190394163131714, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 429.484375, + "completions/mean_terminated_length": 429.484375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6399641978071157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12155099213123322, + "learning_rate": 4.163424979536373e-06, + "loss": -0.0046, + "num_tokens": 227828087.0, + "reward": 4.365470886230469, + "reward_std": 2.231382369995117, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6467206478118896, + "rewards/ngram_similarity_reward/std": 0.2218778282403946, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 441.546875, + "completions/mean_terminated_length": 441.546875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.6404117252181696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09199751913547516, + "learning_rate": 4.162129305847707e-06, + "loss": 0.0159, + "num_tokens": 227932506.0, + "reward": 4.9037675857543945, + "reward_std": 0.1362752467393875, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9037677049636841, + "rewards/ngram_similarity_reward/std": 0.22845500707626343, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 487.265625, + "completions/mean_terminated_length": 487.265625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.6408592526292235, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10971736162900925, + "learning_rate": 4.160832859097243e-06, + "loss": -0.0071, + "num_tokens": 228162699.0, + "reward": 2.998798131942749, + "reward_std": 0.5218923687934875, + "rewards/accuracy_reward/mean": 2.375, + "rewards/accuracy_reward/std": 3.057647228240967, + "rewards/ngram_similarity_reward/mean": 0.6237983703613281, + "rewards/ngram_similarity_reward/std": 0.26022130250930786, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 494.171875, + "completions/mean_terminated_length": 494.171875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6413067800402774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08202996104955673, + "learning_rate": 4.159535639994714e-06, + "loss": 0.0028, + "num_tokens": 228326134.0, + "reward": 6.2203145027160645, + "reward_std": 0.07296191155910492, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.720314621925354, + "rewards/ngram_similarity_reward/std": 0.3275502622127533, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 510.65625, + "completions/mean_terminated_length": 486.2539978027344, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6417543074513314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1214066594839096, + "learning_rate": 4.158237649250276e-06, + "loss": -0.0613, + "num_tokens": 228500368.0, + "reward": 1.6632508039474487, + "reward_std": 0.6656859517097473, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.861900806427002, + "rewards/ngram_similarity_reward/mean": 0.6632509231567383, + "rewards/ngram_similarity_reward/std": 0.3105030953884125, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 551.90625, + "completions/mean_terminated_length": 551.90625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.6422018348623854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08250430971384048, + "learning_rate": 4.1569388875745044e-06, + "loss": 0.0044, + "num_tokens": 228654202.0, + "reward": 4.726158618927002, + "reward_std": 0.8597042560577393, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7261587381362915, + "rewards/ngram_similarity_reward/std": 0.15916042029857635, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 462.015625, + "completions/mean_terminated_length": 462.015625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6426493622734393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07860218733549118, + "learning_rate": 4.1556393556784e-06, + "loss": 0.0014, + "num_tokens": 228800971.0, + "reward": 6.067918300628662, + "reward_std": 0.13865211606025696, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5679180026054382, + "rewards/ngram_similarity_reward/std": 0.2406175434589386, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 579.734375, + "completions/mean_terminated_length": 579.734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.6430968896844932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08776629716157913, + "learning_rate": 4.154339054273383e-06, + "loss": 0.0446, + "num_tokens": 228971802.0, + "reward": 3.1615724563598633, + "reward_std": 0.05697369575500488, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6615725159645081, + "rewards/ngram_similarity_reward/std": 0.36188453435897827, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 481.359375, + "completions/mean_terminated_length": 481.359375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6435444170955471, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12041721493005753, + "learning_rate": 4.153037984071296e-06, + "loss": 0.0159, + "num_tokens": 229177937.0, + "reward": 5.157722473144531, + "reward_std": 1.9307384490966797, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.5952223539352417, + "rewards/ngram_similarity_reward/std": 0.37407711148262024, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 527.296875, + "completions/mean_terminated_length": 527.296875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.643991944506601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08580014109611511, + "learning_rate": 4.151736145784402e-06, + "loss": -0.0043, + "num_tokens": 229319604.0, + "reward": 3.193204641342163, + "reward_std": 0.19513703882694244, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6932047605514526, + "rewards/ngram_similarity_reward/std": 0.3354808986186981, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 477.859375, + "completions/mean_terminated_length": 477.859375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.6444394719176549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0718073844909668, + "learning_rate": 4.150433540125385e-06, + "loss": -0.0022, + "num_tokens": 229481003.0, + "reward": 6.222909450531006, + "reward_std": 0.6646191477775574, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.9104093313217163, + "rewards/ngram_similarity_reward/std": 0.22802779078483582, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 505.671875, + "completions/mean_terminated_length": 505.671875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.6448869993287089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08394629508256912, + "learning_rate": 4.149130167807347e-06, + "loss": 0.0163, + "num_tokens": 229615846.0, + "reward": 5.157923698425293, + "reward_std": 0.9705162048339844, + "rewards/accuracy_reward/mean": 4.625, + "rewards/accuracy_reward/std": 2.3333334922790527, + "rewards/ngram_similarity_reward/mean": 0.532923698425293, + "rewards/ngram_similarity_reward/std": 0.39065343141555786, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 390.0, + "completions/mean_terminated_length": 390.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.6453345267397628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1086539626121521, + "learning_rate": 4.147826029543815e-06, + "loss": 0.0071, + "num_tokens": 229777062.0, + "reward": 4.193568706512451, + "reward_std": 1.4892494678497314, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6623190641403198, + "rewards/ngram_similarity_reward/std": 0.3748938739299774, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 595.953125, + "completions/mean_terminated_length": 595.953125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.6457820541508167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06477946043014526, + "learning_rate": 4.146521126048729e-06, + "loss": -0.0083, + "num_tokens": 229990707.0, + "reward": 4.260105609893799, + "reward_std": 1.5706150531768799, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5413554906845093, + "rewards/ngram_similarity_reward/std": 0.22571967542171478, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 361.046875, + "completions/mean_terminated_length": 361.046875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.6462295815618707, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0986800566315651, + "learning_rate": 4.145215458036451e-06, + "loss": -0.0243, + "num_tokens": 230160774.0, + "reward": 4.196253776550293, + "reward_std": 1.152472972869873, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.8525041341781616, + "rewards/ngram_similarity_reward/std": 0.22282861173152924, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 485.859375, + "completions/mean_terminated_length": 485.859375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6466771089729246, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06249188259243965, + "learning_rate": 4.1439090262217614e-06, + "loss": -0.0188, + "num_tokens": 230323053.0, + "reward": 6.313204765319824, + "reward_std": 0.754632830619812, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 1.0007052421569824, + "rewards/ngram_similarity_reward/std": 0.1831846833229065, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 455.734375, + "completions/mean_terminated_length": 455.734375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.6471246363839785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07594480365514755, + "learning_rate": 4.142601831319859e-06, + "loss": 0.0022, + "num_tokens": 230487692.0, + "reward": 6.028704643249512, + "reward_std": 0.8274354934692383, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.7162050008773804, + "rewards/ngram_similarity_reward/std": 0.33539679646492004, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 448.5, + "completions/mean_terminated_length": 448.5, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.6475721637950325, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09063916653394699, + "learning_rate": 4.141293874046359e-06, + "loss": -0.0091, + "num_tokens": 230630204.0, + "reward": 3.5175516605377197, + "reward_std": 1.1414765119552612, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6425517201423645, + "rewards/ngram_similarity_reward/std": 0.29616227746009827, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 484.234375, + "completions/mean_terminated_length": 484.234375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.6480196912060864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0769663080573082, + "learning_rate": 4.139985155117296e-06, + "loss": -0.0222, + "num_tokens": 230793227.0, + "reward": 5.935922622680664, + "reward_std": 0.811854898929596, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.6234228014945984, + "rewards/ngram_similarity_reward/std": 0.2343846708536148, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 391.78125, + "completions/mean_terminated_length": 391.78125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.6484672186171403, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08732519298791885, + "learning_rate": 4.138675675249119e-06, + "loss": 0.0031, + "num_tokens": 230926109.0, + "reward": 4.527279853820801, + "reward_std": 0.4422011971473694, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6210297346115112, + "rewards/ngram_similarity_reward/std": 0.3546171486377716, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 487.765625, + "completions/mean_terminated_length": 487.765625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6489147460281942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10862072557210922, + "learning_rate": 4.1373654351586955e-06, + "loss": -0.0008, + "num_tokens": 231127534.0, + "reward": 3.2383363246917725, + "reward_std": 2.2389791011810303, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.7539613842964172, + "rewards/ngram_similarity_reward/std": 0.3312397003173828, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 450.9375, + "completions/mean_terminated_length": 450.9375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.6493622734392481, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07241150736808777, + "learning_rate": 4.13605443556331e-06, + "loss": 0.0168, + "num_tokens": 231288010.0, + "reward": 5.961592674255371, + "reward_std": 0.5978045463562012, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.6490925550460815, + "rewards/ngram_similarity_reward/std": 0.27072691917419434, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 473.9375, + "completions/mean_terminated_length": 473.9375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.649809800850302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11711332201957703, + "learning_rate": 4.1347426771806595e-06, + "loss": 0.0014, + "num_tokens": 231479702.0, + "reward": 1.2033401727676392, + "reward_std": 0.5603854060173035, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.39084017276763916, + "rewards/ngram_similarity_reward/std": 0.14672933518886566, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 628.734375, + "completions/mean_terminated_length": 628.734375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.650257328261356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0671830028295517, + "learning_rate": 4.133430160728859e-06, + "loss": 0.0051, + "num_tokens": 231666981.0, + "reward": 6.058471202850342, + "reward_std": 0.42626237869262695, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6522212028503418, + "rewards/ngram_similarity_reward/std": 0.17144225537776947, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 471.546875, + "completions/mean_terminated_length": 471.546875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.65070485567241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10306880623102188, + "learning_rate": 4.132116886926438e-06, + "loss": 0.0074, + "num_tokens": 231802824.0, + "reward": 6.152955055236816, + "reward_std": 0.18371598422527313, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6529550552368164, + "rewards/ngram_similarity_reward/std": 0.25471946597099304, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 426.421875, + "completions/mean_terminated_length": 426.421875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6511523830834639, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0816989615559578, + "learning_rate": 4.130802856492341e-06, + "loss": -0.0171, + "num_tokens": 231965587.0, + "reward": 4.789144515991211, + "reward_std": 0.15055951476097107, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7891442775726318, + "rewards/ngram_similarity_reward/std": 0.3067992925643921, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 508.359375, + "completions/mean_terminated_length": 483.920654296875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.6515999104945178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1071004718542099, + "learning_rate": 4.129488070145925e-06, + "loss": -0.0149, + "num_tokens": 232104602.0, + "reward": 4.685388565063477, + "reward_std": 0.09721273183822632, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6853882670402527, + "rewards/ngram_similarity_reward/std": 0.2215292751789093, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 426.84375, + "completions/mean_terminated_length": 426.84375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.6520474379055717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11188356578350067, + "learning_rate": 4.128172528606963e-06, + "loss": 0.0136, + "num_tokens": 232242368.0, + "reward": 5.314410209655762, + "reward_std": 1.6823713779449463, + "rewards/accuracy_reward/mean": 4.34375, + "rewards/accuracy_reward/std": 2.4314002990722656, + "rewards/ngram_similarity_reward/mean": 0.9706601500511169, + "rewards/ngram_similarity_reward/std": 0.21696914732456207, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 434.15625, + "completions/mean_terminated_length": 434.15625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.6524949653166257, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11057297885417938, + "learning_rate": 4.126856232595639e-06, + "loss": -0.0247, + "num_tokens": 232398378.0, + "reward": 5.810824394226074, + "reward_std": 1.0515462160110474, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.6858242750167847, + "rewards/ngram_similarity_reward/std": 0.138453409075737, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 473.484375, + "completions/mean_terminated_length": 473.484375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.6529424927276796, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.058471474796533585, + "learning_rate": 4.125539182832553e-06, + "loss": -0.008, + "num_tokens": 232579209.0, + "reward": 4.679767608642578, + "reward_std": 0.07971321046352386, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6797680854797363, + "rewards/ngram_similarity_reward/std": 0.3421189785003662, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 454.890625, + "completions/mean_terminated_length": 454.890625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6533900201387335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10413812100887299, + "learning_rate": 4.124221380038716e-06, + "loss": -0.0558, + "num_tokens": 232762962.0, + "reward": 3.1186277866363525, + "reward_std": 0.09125609695911407, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6186277866363525, + "rewards/ngram_similarity_reward/std": 0.29530709981918335, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 509.765625, + "completions/mean_terminated_length": 509.765625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.6538375475497874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18827714025974274, + "learning_rate": 4.1229028249355505e-06, + "loss": -0.0112, + "num_tokens": 233006467.0, + "reward": 1.6649301052093506, + "reward_std": 0.5911146402359009, + "rewards/accuracy_reward/mean": 1.0625, + "rewards/accuracy_reward/std": 2.695528507232666, + "rewards/ngram_similarity_reward/mean": 0.6024301648139954, + "rewards/ngram_similarity_reward/std": 0.2865312695503235, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 547.984375, + "completions/mean_terminated_length": 547.984375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.6542850749608413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07518404722213745, + "learning_rate": 4.121583518244891e-06, + "loss": -0.0003, + "num_tokens": 233143810.0, + "reward": 4.681684494018555, + "reward_std": 0.10513784736394882, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6816844940185547, + "rewards/ngram_similarity_reward/std": 0.24526838958263397, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 336.671875, + "completions/mean_terminated_length": 336.671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6547326023718952, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09193456172943115, + "learning_rate": 4.120263460688986e-06, + "loss": 0.0114, + "num_tokens": 233263373.0, + "reward": 0.32907330989837646, + "reward_std": 0.03988669440150261, + "rewards/accuracy_reward/mean": -0.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8290732502937317, + "rewards/ngram_similarity_reward/std": 0.3824120759963989, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 498.90625, + "completions/mean_terminated_length": 498.90625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.6551801297829493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08146632462739944, + "learning_rate": 4.1189426529904905e-06, + "loss": 0.0205, + "num_tokens": 233429063.0, + "reward": 5.3825764656066895, + "reward_std": 0.8494433760643005, + "rewards/accuracy_reward/mean": 4.828125, + "rewards/accuracy_reward/std": 1.9359153509140015, + "rewards/ngram_similarity_reward/mean": 0.5544514656066895, + "rewards/ngram_similarity_reward/std": 0.2992057204246521, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 468.3125, + "completions/mean_terminated_length": 468.3125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.6556276571940032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08296996355056763, + "learning_rate": 4.117621095872476e-06, + "loss": -0.0241, + "num_tokens": 233599163.0, + "reward": 3.1460554599761963, + "reward_std": 0.46030616760253906, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5523055791854858, + "rewards/ngram_similarity_reward/std": 0.18339583277702332, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 451.03125, + "completions/mean_terminated_length": 451.03125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.6560751846050571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12082185596227646, + "learning_rate": 4.11629879005842e-06, + "loss": 0.0199, + "num_tokens": 233770557.0, + "reward": 4.085703372955322, + "reward_std": 0.9208123683929443, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.8357031345367432, + "rewards/ngram_similarity_reward/std": 0.2821987569332123, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 441.65625, + "completions/mean_terminated_length": 441.65625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.656522712016111, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15086643397808075, + "learning_rate": 4.114975736272209e-06, + "loss": 0.015, + "num_tokens": 233971447.0, + "reward": 5.505659580230713, + "reward_std": 0.8936091065406799, + "rewards/accuracy_reward/mean": 4.96875, + "rewards/accuracy_reward/std": 1.8427786827087402, + "rewards/ngram_similarity_reward/mean": 0.5369095802307129, + "rewards/ngram_similarity_reward/std": 0.3775620758533478, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 552.65625, + "completions/mean_terminated_length": 504.4193420410156, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.6569702394271649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10218188911676407, + "learning_rate": 4.113651935238144e-06, + "loss": 0.0136, + "num_tokens": 234132161.0, + "reward": 2.611851215362549, + "reward_std": 0.8556914925575256, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.5806010961532593, + "rewards/ngram_similarity_reward/std": 0.3110412359237671, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 421.53125, + "completions/mean_terminated_length": 369.06451416015625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6574177668382188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16987138986587524, + "learning_rate": 4.11232738768093e-06, + "loss": -0.0114, + "num_tokens": 234330659.0, + "reward": 4.307737827301025, + "reward_std": 1.6615090370178223, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.6827379465103149, + "rewards/ngram_similarity_reward/std": 0.3940820097923279, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 468.40625, + "completions/mean_terminated_length": 468.40625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.6578652942492728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10134927183389664, + "learning_rate": 4.111002094325682e-06, + "loss": -0.0136, + "num_tokens": 234512029.0, + "reward": 2.7333059310913086, + "reward_std": 1.3934979438781738, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5145559310913086, + "rewards/ngram_similarity_reward/std": 0.1624448001384735, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 654.875, + "completions/mean_terminated_length": 562.0, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.6583128216603267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08241728693246841, + "learning_rate": 4.109676055897926e-06, + "loss": -0.0989, + "num_tokens": 234658613.0, + "reward": 6.001367568969727, + "reward_std": 0.606982409954071, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.6888673305511475, + "rewards/ngram_similarity_reward/std": 0.1470806747674942, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 466.828125, + "completions/mean_terminated_length": 466.828125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6587603490713806, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10049612820148468, + "learning_rate": 4.108349273123593e-06, + "loss": -0.0132, + "num_tokens": 234791530.0, + "reward": 5.122507095336914, + "reward_std": 0.6011908650398254, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.9350072145462036, + "rewards/ngram_similarity_reward/std": 0.2942538559436798, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 452.265625, + "completions/mean_terminated_length": 452.265625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6592078764824345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10474356263875961, + "learning_rate": 4.1070217467290215e-06, + "loss": 0.0213, + "num_tokens": 234929131.0, + "reward": 3.1596713066101074, + "reward_std": 0.05670752003788948, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6596712470054626, + "rewards/ngram_similarity_reward/std": 0.20737488567829132, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 652.109375, + "completions/mean_terminated_length": 629.952392578125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.6596554038934884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08270086348056793, + "learning_rate": 4.10569347744096e-06, + "loss": 0.047, + "num_tokens": 235098018.0, + "reward": 4.487462997436523, + "reward_std": 0.09502571821212769, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.48746317625045776, + "rewards/ngram_similarity_reward/std": 0.28049421310424805, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 520.828125, + "completions/mean_terminated_length": 520.828125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6601029313045425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08269716799259186, + "learning_rate": 4.10436446598656e-06, + "loss": -0.0041, + "num_tokens": 235249527.0, + "reward": 4.493458271026611, + "reward_std": 0.44828683137893677, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5872083902359009, + "rewards/ngram_similarity_reward/std": 0.4154118299484253, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 428.765625, + "completions/mean_terminated_length": 428.765625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6605504587155964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12616463005542755, + "learning_rate": 4.1030347130933815e-06, + "loss": 0.0004, + "num_tokens": 235413224.0, + "reward": 3.0580625534057617, + "reward_std": 0.8669981956481934, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.7455626130104065, + "rewards/ngram_similarity_reward/std": 0.3908692002296448, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 454.03125, + "completions/mean_terminated_length": 454.03125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6609979861266503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0840691477060318, + "learning_rate": 4.101704219489389e-06, + "loss": -0.0135, + "num_tokens": 235542730.0, + "reward": 6.504312515258789, + "reward_std": 0.21266743540763855, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 1.0043129920959473, + "rewards/ngram_similarity_reward/std": 0.26560497283935547, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 510.890625, + "completions/mean_terminated_length": 510.890625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6614455135377042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09798114001750946, + "learning_rate": 4.100372985902955e-06, + "loss": -0.014, + "num_tokens": 235705171.0, + "reward": 4.69714879989624, + "reward_std": 0.16241833567619324, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.697148859500885, + "rewards/ngram_similarity_reward/std": 0.38769692182540894, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 504.609375, + "completions/mean_terminated_length": 504.609375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.6618930409487581, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09201952069997787, + "learning_rate": 4.099041013062854e-06, + "loss": 0.0368, + "num_tokens": 235850890.0, + "reward": 5.716346740722656, + "reward_std": 1.29547119140625, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.8725964426994324, + "rewards/ngram_similarity_reward/std": 0.24577252566814423, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 512.921875, + "completions/mean_terminated_length": 512.921875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.662340568359812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09567677229642868, + "learning_rate": 4.097708301698266e-06, + "loss": 0.0454, + "num_tokens": 236003349.0, + "reward": 3.1331405639648438, + "reward_std": 0.16849875450134277, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6331405639648438, + "rewards/ngram_similarity_reward/std": 0.2982363998889923, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 431.640625, + "completions/mean_terminated_length": 431.640625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.662788095770866, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.060426075011491776, + "learning_rate": 4.0963748525387774e-06, + "loss": -0.0121, + "num_tokens": 236150846.0, + "reward": 6.335261344909668, + "reward_std": 0.8731868267059326, + "rewards/accuracy_reward/mean": 5.296875, + "rewards/accuracy_reward/std": 1.1433686017990112, + "rewards/ngram_similarity_reward/mean": 1.038386583328247, + "rewards/ngram_similarity_reward/std": 0.17555567622184753, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 476.546875, + "completions/mean_terminated_length": 476.546875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.6632356231819199, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0681709423661232, + "learning_rate": 4.095040666314377e-06, + "loss": 0.0307, + "num_tokens": 236315233.0, + "reward": 4.7674665451049805, + "reward_std": 0.029874470084905624, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7674665451049805, + "rewards/ngram_similarity_reward/std": 0.20821696519851685, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 472.328125, + "completions/mean_terminated_length": 472.328125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6636831505929738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0849587619304657, + "learning_rate": 4.0937057437554565e-06, + "loss": -0.0165, + "num_tokens": 236471446.0, + "reward": 3.6301369667053223, + "reward_std": 0.9095160961151123, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.661387026309967, + "rewards/ngram_similarity_reward/std": 0.2558072805404663, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 447.421875, + "completions/mean_terminated_length": 447.421875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.6641306780040277, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11404269188642502, + "learning_rate": 4.092370085592812e-06, + "loss": -0.0398, + "num_tokens": 236661297.0, + "reward": 0.8565977215766907, + "reward_std": 0.9256554841995239, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 2.0098345279693604, + "rewards/ngram_similarity_reward/mean": 0.6222226619720459, + "rewards/ngram_similarity_reward/std": 0.29372283816337585, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 460.640625, + "completions/mean_terminated_length": 460.640625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.6645782054150817, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06449608504772186, + "learning_rate": 4.091033692557641e-06, + "loss": -0.0014, + "num_tokens": 236790250.0, + "reward": 6.271068572998047, + "reward_std": 0.07193907350301743, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7710685133934021, + "rewards/ngram_similarity_reward/std": 0.3435302674770355, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 527.671875, + "completions/mean_terminated_length": 527.671875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.6650257328261356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.080342136323452, + "learning_rate": 4.089696565381543e-06, + "loss": -0.0054, + "num_tokens": 236930501.0, + "reward": 1.0850721597671509, + "reward_std": 1.204647183418274, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.6475721597671509, + "rewards/ngram_similarity_reward/std": 0.19706475734710693, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 424.796875, + "completions/mean_terminated_length": 424.796875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.6654732602371896, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08424574881792068, + "learning_rate": 4.088358704796522e-06, + "loss": 0.0063, + "num_tokens": 237080952.0, + "reward": 5.506689071655273, + "reward_std": 0.8653192520141602, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.7566890716552734, + "rewards/ngram_similarity_reward/std": 0.2732362449169159, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 601.34375, + "completions/mean_terminated_length": 504.9000244140625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6659207876482435, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08288619667291641, + "learning_rate": 4.087020111534981e-06, + "loss": 0.0117, + "num_tokens": 237208430.0, + "reward": 6.119905948638916, + "reward_std": 0.5858101844787598, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.8074060082435608, + "rewards/ngram_similarity_reward/std": 0.22066588699817657, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 443.5625, + "completions/mean_terminated_length": 443.5625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.6663683150592974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11840573698282242, + "learning_rate": 4.085680786329725e-06, + "loss": 0.012, + "num_tokens": 237426818.0, + "reward": 3.0602216720581055, + "reward_std": 0.8027669787406921, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.841471791267395, + "rewards/ngram_similarity_reward/std": 0.22687605023384094, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 590.65625, + "completions/mean_terminated_length": 590.65625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.6668158424703513, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05773892626166344, + "learning_rate": 4.084340729913959e-06, + "loss": 0.0107, + "num_tokens": 237584524.0, + "reward": 5.3110551834106445, + "reward_std": 0.8410439491271973, + "rewards/accuracy_reward/mean": 4.609375, + "rewards/accuracy_reward/std": 2.2262303829193115, + "rewards/ngram_similarity_reward/mean": 0.7016801238059998, + "rewards/ngram_similarity_reward/std": 0.3378947675228119, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 502.65625, + "completions/mean_terminated_length": 478.12701416015625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6672633698814052, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0763397067785263, + "learning_rate": 4.082999943021291e-06, + "loss": -0.0176, + "num_tokens": 237762454.0, + "reward": 3.3543882369995117, + "reward_std": 0.6753823757171631, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5731382966041565, + "rewards/ngram_similarity_reward/std": 0.4696844816207886, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 539.0625, + "completions/mean_terminated_length": 515.1111450195312, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.6677108972924591, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10276427865028381, + "learning_rate": 4.081658426385725e-06, + "loss": -0.0187, + "num_tokens": 237909962.0, + "reward": 4.054726600646973, + "reward_std": 1.9290709495544434, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.8984768390655518, + "rewards/ngram_similarity_reward/std": 0.2063111960887909, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 428.03125, + "completions/mean_terminated_length": 428.03125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.668158424703513, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08124004304409027, + "learning_rate": 4.080316180741667e-06, + "loss": -0.001, + "num_tokens": 238071116.0, + "reward": 6.309140205383301, + "reward_std": 0.09563571214675903, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8091399669647217, + "rewards/ngram_similarity_reward/std": 0.2800677418708801, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 588.9375, + "completions/mean_terminated_length": 565.77783203125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.668605952114567, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09261708706617355, + "learning_rate": 4.0789732068239215e-06, + "loss": -0.0255, + "num_tokens": 238219688.0, + "reward": 6.038747787475586, + "reward_std": 1.0299474000930786, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.9137482047080994, + "rewards/ngram_similarity_reward/std": 0.23467347025871277, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 531.8125, + "completions/mean_terminated_length": 531.8125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.669053479525621, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08115254342556, + "learning_rate": 4.07762950536769e-06, + "loss": -0.0014, + "num_tokens": 238362236.0, + "reward": 3.2857232093811035, + "reward_std": 0.10580325871706009, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7857229113578796, + "rewards/ngram_similarity_reward/std": 0.2741377651691437, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 566.6875, + "completions/mean_terminated_length": 566.6875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.6695010069366749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08066169172525406, + "learning_rate": 4.076285077108576e-06, + "loss": 0.0104, + "num_tokens": 238497832.0, + "reward": 4.507000923156738, + "reward_std": 0.5692435503005981, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6945005655288696, + "rewards/ngram_similarity_reward/std": 0.32864856719970703, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 530.578125, + "completions/mean_terminated_length": 530.578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.6699485343477288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07869521528482437, + "learning_rate": 4.0749399227825775e-06, + "loss": -0.0118, + "num_tokens": 238662765.0, + "reward": 4.468442440032959, + "reward_std": 1.3427462577819824, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5621925592422485, + "rewards/ngram_similarity_reward/std": 0.23733443021774292, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 522.421875, + "completions/mean_terminated_length": 498.2063903808594, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.6703960617587827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08195405453443527, + "learning_rate": 4.073594043126093e-06, + "loss": -0.0238, + "num_tokens": 238830936.0, + "reward": 3.77846622467041, + "reward_std": 1.9166414737701416, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6222162246704102, + "rewards/ngram_similarity_reward/std": 0.24115103483200073, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1221.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 496.796875, + "completions/mean_terminated_length": 496.796875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.6708435891698367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10084085911512375, + "learning_rate": 4.0722474388759135e-06, + "loss": -0.0317, + "num_tokens": 238995371.0, + "reward": 4.701803207397461, + "reward_std": 0.5719506144523621, + "rewards/accuracy_reward/mean": 3.875, + "rewards/accuracy_reward/std": 2.7284510135650635, + "rewards/ngram_similarity_reward/mean": 0.8268033266067505, + "rewards/ngram_similarity_reward/std": 0.2679290175437927, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 595.65625, + "completions/mean_terminated_length": 572.6032104492188, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.6712911165808906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10507504642009735, + "learning_rate": 4.0709001107692305e-06, + "loss": 0.0258, + "num_tokens": 239127797.0, + "reward": 4.470786094665527, + "reward_std": 0.19319896399974823, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4707862436771393, + "rewards/ngram_similarity_reward/std": 0.3042636215686798, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 457.03125, + "completions/mean_terminated_length": 457.03125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.6717386439919445, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07408589124679565, + "learning_rate": 4.069552059543632e-06, + "loss": 0.0489, + "num_tokens": 239313671.0, + "reward": 0.45077264308929443, + "reward_std": 0.7991616725921631, + "rewards/accuracy_reward/mean": -0.03125, + "rewards/accuracy_reward/std": 1.3209995031356812, + "rewards/ngram_similarity_reward/mean": 0.48202264308929443, + "rewards/ngram_similarity_reward/std": 0.15846078097820282, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 423.234375, + "completions/mean_terminated_length": 423.234375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.6721861714029984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10048261284828186, + "learning_rate": 4.068203285937101e-06, + "loss": 0.011, + "num_tokens": 239434678.0, + "reward": 4.453929901123047, + "reward_std": 0.5523361563682556, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6414296627044678, + "rewards/ngram_similarity_reward/std": 0.3741045892238617, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 498.53125, + "completions/mean_terminated_length": 498.53125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.6726336988140523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11623698472976685, + "learning_rate": 4.066853790688013e-06, + "loss": -0.0322, + "num_tokens": 239666312.0, + "reward": 2.941746234893799, + "reward_std": 0.8182100057601929, + "rewards/accuracy_reward/mean": 2.25, + "rewards/accuracy_reward/std": 3.295017957687378, + "rewards/ngram_similarity_reward/mean": 0.6917462944984436, + "rewards/ngram_similarity_reward/std": 0.2632776200771332, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 556.203125, + "completions/mean_terminated_length": 556.203125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.6730812262251062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15846550464630127, + "learning_rate": 4.065503574535143e-06, + "loss": 0.0191, + "num_tokens": 239884917.0, + "reward": 2.8816068172454834, + "reward_std": 0.4729578495025635, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.4753568470478058, + "rewards/ngram_similarity_reward/std": 0.20575736463069916, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 636.21875, + "completions/mean_terminated_length": 613.8095703125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.6735287536361602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07791364938020706, + "learning_rate": 4.064152638217657e-06, + "loss": 0.0034, + "num_tokens": 240057539.0, + "reward": 4.2021660804748535, + "reward_std": 0.5542839765548706, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.3896661102771759, + "rewards/ngram_similarity_reward/std": 0.29823753237724304, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 578.484375, + "completions/mean_terminated_length": 555.1587524414062, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.6739762810472142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08407315611839294, + "learning_rate": 4.062800982475121e-06, + "loss": 0.0153, + "num_tokens": 240206690.0, + "reward": 0.8947778940200806, + "reward_std": 0.812238335609436, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.5510278940200806, + "rewards/ngram_similarity_reward/std": 0.34097588062286377, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 544.359375, + "completions/mean_terminated_length": 544.359375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6744238084582681, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10727450996637344, + "learning_rate": 4.061448608047487e-06, + "loss": 0.0123, + "num_tokens": 240400585.0, + "reward": 4.1962361335754395, + "reward_std": 1.6066899299621582, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.7743610739707947, + "rewards/ngram_similarity_reward/std": 0.2609521448612213, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1208.0, + "completions/max_terminated_length": 1208.0, + "completions/mean_length": 658.203125, + "completions/mean_terminated_length": 658.203125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.674871335869322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09349507093429565, + "learning_rate": 4.060095515675107e-06, + "loss": 0.0134, + "num_tokens": 240552502.0, + "reward": 2.9654061794281006, + "reward_std": 1.4760980606079102, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7466559410095215, + "rewards/ngram_similarity_reward/std": 0.2863253355026245, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 432.96875, + "completions/mean_terminated_length": 432.96875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.6753188632803759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09913067519664764, + "learning_rate": 4.058741706098721e-06, + "loss": 0.016, + "num_tokens": 240693380.0, + "reward": 6.0691680908203125, + "reward_std": 0.6833319067955017, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.8504183292388916, + "rewards/ngram_similarity_reward/std": 0.4402073323726654, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 428.921875, + "completions/mean_terminated_length": 428.921875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.6757663906914299, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12266264855861664, + "learning_rate": 4.057387180059465e-06, + "loss": -0.0191, + "num_tokens": 240841247.0, + "reward": 3.98134446144104, + "reward_std": 0.8376370072364807, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.8250945806503296, + "rewards/ngram_similarity_reward/std": 0.29401248693466187, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 595.375, + "completions/mean_terminated_length": 595.375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.6762139181024838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07436852157115936, + "learning_rate": 4.0560319382988665e-06, + "loss": 0.0041, + "num_tokens": 241011655.0, + "reward": 2.9112842082977295, + "reward_std": 0.09545248746871948, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.411283940076828, + "rewards/ngram_similarity_reward/std": 0.19912941753864288, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 518.71875, + "completions/mean_terminated_length": 518.71875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.6766614455135377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1377800852060318, + "learning_rate": 4.054675981558845e-06, + "loss": -0.0225, + "num_tokens": 241212437.0, + "reward": 4.467209815979004, + "reward_std": 0.8089848756790161, + "rewards/accuracy_reward/mean": 3.59375, + "rewards/accuracy_reward/std": 2.854785919189453, + "rewards/ngram_similarity_reward/mean": 0.8734598159790039, + "rewards/ngram_similarity_reward/std": 0.2185596376657486, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 515.453125, + "completions/mean_terminated_length": 515.453125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.6771089729245916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08017116785049438, + "learning_rate": 4.053319310581709e-06, + "loss": -0.0084, + "num_tokens": 241396898.0, + "reward": 1.6002922058105469, + "reward_std": 0.4597763121128082, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5065422654151917, + "rewards/ngram_similarity_reward/std": 0.19871973991394043, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 395.546875, + "completions/mean_terminated_length": 395.546875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.6775565003356455, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11000040918588638, + "learning_rate": 4.051961926110161e-06, + "loss": -0.043, + "num_tokens": 241563813.0, + "reward": 6.277076721191406, + "reward_std": 0.17032390832901, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7770768404006958, + "rewards/ngram_similarity_reward/std": 0.39133378863334656, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 488.8125, + "completions/mean_terminated_length": 488.8125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6780040277466994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.15393635630607605, + "learning_rate": 4.0506038288872955e-06, + "loss": 0.0103, + "num_tokens": 241792089.0, + "reward": 4.695099830627441, + "reward_std": 0.04499488323926926, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6950998306274414, + "rewards/ngram_similarity_reward/std": 0.42729097604751587, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 474.625, + "completions/mean_terminated_length": 474.625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.6784515551577535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17051804065704346, + "learning_rate": 4.049245019656592e-06, + "loss": -0.0037, + "num_tokens": 242007793.0, + "reward": 4.716339111328125, + "reward_std": 0.4745529294013977, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.8100893497467041, + "rewards/ngram_similarity_reward/std": 0.2591150999069214, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 492.71875, + "completions/mean_terminated_length": 492.71875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.6788990825688074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09130734205245972, + "learning_rate": 4.047885499161923e-06, + "loss": 0.022, + "num_tokens": 242175439.0, + "reward": 4.156372547149658, + "reward_std": 0.7526333332061768, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6251224279403687, + "rewards/ngram_similarity_reward/std": 0.2308683544397354, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 519.9375, + "completions/mean_terminated_length": 495.68255615234375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6793466099798613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11831668764352798, + "learning_rate": 4.0465252681475505e-06, + "loss": -0.036, + "num_tokens": 242350523.0, + "reward": 1.7951796054840088, + "reward_std": 0.5443094968795776, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7014296650886536, + "rewards/ngram_similarity_reward/std": 0.31123247742652893, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 438.765625, + "completions/mean_terminated_length": 438.765625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.6797941373909152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10574766993522644, + "learning_rate": 4.0451643273581235e-06, + "loss": 0.0051, + "num_tokens": 242480716.0, + "reward": 4.843680381774902, + "reward_std": 0.9988704919815063, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.749930202960968, + "rewards/ngram_similarity_reward/std": 0.2892991900444031, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1395.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 567.921875, + "completions/mean_terminated_length": 567.921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6802416648019691, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07118367403745651, + "learning_rate": 4.043802677538682e-06, + "loss": 0.03, + "num_tokens": 242637063.0, + "reward": 1.5142168998718262, + "reward_std": 0.10006881505250931, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.5298418998718262, + "rewards/ngram_similarity_reward/std": 0.14635279774665833, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 479.34375, + "completions/mean_terminated_length": 479.34375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.680689192213023, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10519301891326904, + "learning_rate": 4.042440319434653e-06, + "loss": 0.0257, + "num_tokens": 242758237.0, + "reward": 4.176997661590576, + "reward_std": 0.9178718328475952, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6457475423812866, + "rewards/ngram_similarity_reward/std": 0.2892041802406311, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 532.0, + "completions/mean_terminated_length": 532.0, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.681136719624077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08786152303218842, + "learning_rate": 4.041077253791853e-06, + "loss": 0.0127, + "num_tokens": 242908973.0, + "reward": 3.625276565551758, + "reward_std": 0.7645278573036194, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7502766847610474, + "rewards/ngram_similarity_reward/std": 0.33395424485206604, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 564.578125, + "completions/mean_terminated_length": 541.0317993164062, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.6815842470351309, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10636013746261597, + "learning_rate": 4.03971348135648e-06, + "loss": 0.0508, + "num_tokens": 243068882.0, + "reward": 2.9963223934173584, + "reward_std": 1.0687932968139648, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6838223934173584, + "rewards/ngram_similarity_reward/std": 0.32443106174468994, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 499.46875, + "completions/mean_terminated_length": 499.46875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6820317744461848, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08839923143386841, + "learning_rate": 4.038349002875127e-06, + "loss": -0.0321, + "num_tokens": 243221936.0, + "reward": 3.3763747215270996, + "reward_std": 0.6003474593162537, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6888746023178101, + "rewards/ngram_similarity_reward/std": 0.25925931334495544, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 465.359375, + "completions/mean_terminated_length": 465.359375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.6824793018572387, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0927591472864151, + "learning_rate": 4.036983819094769e-06, + "loss": 0.0248, + "num_tokens": 243377463.0, + "reward": 4.6241655349731445, + "reward_std": 0.5007472038269043, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.7335406541824341, + "rewards/ngram_similarity_reward/std": 0.30895963311195374, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 522.5625, + "completions/mean_terminated_length": 522.5625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.6829268292682927, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09703091531991959, + "learning_rate": 4.0356179307627654e-06, + "loss": 0.0057, + "num_tokens": 243560683.0, + "reward": 4.5334930419921875, + "reward_std": 0.6946358680725098, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.8147426247596741, + "rewards/ngram_similarity_reward/std": 0.26535749435424805, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 489.296875, + "completions/mean_terminated_length": 464.5555725097656, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.6833743566793467, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1149318590760231, + "learning_rate": 4.034251338626867e-06, + "loss": -0.0194, + "num_tokens": 243772062.0, + "reward": 1.503861665725708, + "reward_std": 0.12548813223838806, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5038615465164185, + "rewards/ngram_similarity_reward/std": 0.2763817310333252, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 519.0, + "completions/mean_terminated_length": 519.0, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.6838218840904006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08250933140516281, + "learning_rate": 4.032884043435204e-06, + "loss": 0.0276, + "num_tokens": 243936014.0, + "reward": 2.9738001823425293, + "reward_std": 0.1276642084121704, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.4738001227378845, + "rewards/ngram_similarity_reward/std": 0.30533042550086975, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 569.0625, + "completions/mean_terminated_length": 569.0625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.6842694115014545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08203459531068802, + "learning_rate": 4.031516045936295e-06, + "loss": 0.0132, + "num_tokens": 244071554.0, + "reward": 3.4614744186401367, + "reward_std": 1.5858569145202637, + "rewards/accuracy_reward/mean": 2.84375, + "rewards/accuracy_reward/std": 3.0405657291412354, + "rewards/ngram_similarity_reward/mean": 0.6177245378494263, + "rewards/ngram_similarity_reward/std": 0.25231122970581055, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 457.59375, + "completions/mean_terminated_length": 457.59375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6847169389125084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11150728911161423, + "learning_rate": 4.030147346879042e-06, + "loss": -0.0276, + "num_tokens": 244246120.0, + "reward": 3.9611706733703613, + "reward_std": 0.8779897689819336, + "rewards/accuracy_reward/mean": 3.34375, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.6174206733703613, + "rewards/ngram_similarity_reward/std": 0.2506239712238312, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 501.078125, + "completions/mean_terminated_length": 501.078125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6851644663235623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08861125260591507, + "learning_rate": 4.028777947012732e-06, + "loss": 0.0065, + "num_tokens": 244404237.0, + "reward": 3.6665449142456055, + "reward_std": 1.2334115505218506, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.41654497385025024, + "rewards/ngram_similarity_reward/std": 0.2642189562320709, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 480.46875, + "completions/mean_terminated_length": 480.46875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.6856119937346162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08565875142812729, + "learning_rate": 4.027407847087032e-06, + "loss": 0.0087, + "num_tokens": 244568651.0, + "reward": 3.4153053760528564, + "reward_std": 0.43495145440101624, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8215552568435669, + "rewards/ngram_similarity_reward/std": 0.18351885676383972, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 512.484375, + "completions/mean_terminated_length": 512.484375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.6860595211456701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0879283994436264, + "learning_rate": 4.0260370478519986e-06, + "loss": -0.0244, + "num_tokens": 244779178.0, + "reward": 4.797738075256348, + "reward_std": 0.10125033557415009, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7977379560470581, + "rewards/ngram_similarity_reward/std": 0.20789459347724915, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 459.265625, + "completions/mean_terminated_length": 459.265625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.6865070485567241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10340742766857147, + "learning_rate": 4.024665550058065e-06, + "loss": -0.0148, + "num_tokens": 244986763.0, + "reward": 4.677967071533203, + "reward_std": 0.09323635697364807, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6779671311378479, + "rewards/ngram_similarity_reward/std": 0.2466803640127182, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 550.515625, + "completions/mean_terminated_length": 450.683349609375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.686954575967778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10321526229381561, + "learning_rate": 4.023293354456051e-06, + "loss": 0.0764, + "num_tokens": 245177868.0, + "reward": 4.347613334655762, + "reward_std": 1.1923820972442627, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6288636922836304, + "rewards/ngram_similarity_reward/std": 0.3350275158882141, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 465.953125, + "completions/mean_terminated_length": 440.84130859375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.687402103378832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10282690078020096, + "learning_rate": 4.021920461797157e-06, + "loss": 0.0222, + "num_tokens": 245346457.0, + "reward": 4.505459785461426, + "reward_std": 0.4533050060272217, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5992101430892944, + "rewards/ngram_similarity_reward/std": 0.13468407094478607, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 520.5, + "completions/mean_terminated_length": 496.2539978027344, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.6878496307898859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08532318472862244, + "learning_rate": 4.020546872832965e-06, + "loss": -0.024, + "num_tokens": 245493289.0, + "reward": 4.433938026428223, + "reward_std": 0.6108426451683044, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6214381456375122, + "rewards/ngram_similarity_reward/std": 0.1984623521566391, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 430.21875, + "completions/mean_terminated_length": 430.21875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6882971582009398, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10850238800048828, + "learning_rate": 4.019172588315436e-06, + "loss": 0.0154, + "num_tokens": 245625319.0, + "reward": 4.919859886169434, + "reward_std": 0.453775554895401, + "rewards/accuracy_reward/mean": 4.078125, + "rewards/accuracy_reward/std": 2.593059778213501, + "rewards/ngram_similarity_reward/mean": 0.8417348861694336, + "rewards/ngram_similarity_reward/std": 0.335245281457901, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 518.5, + "completions/mean_terminated_length": 518.5, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.6887446856119938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08004138618707657, + "learning_rate": 4.017797608996918e-06, + "loss": 0.0577, + "num_tokens": 245760711.0, + "reward": 6.246609687805176, + "reward_std": 0.09443674236536026, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7466100454330444, + "rewards/ngram_similarity_reward/std": 0.18616412580013275, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 583.765625, + "completions/mean_terminated_length": 583.765625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.6891922130230477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08965511620044708, + "learning_rate": 4.016421935630131e-06, + "loss": 0.0032, + "num_tokens": 245944392.0, + "reward": 4.213710308074951, + "reward_std": 1.1697884798049927, + "rewards/accuracy_reward/mean": 3.59375, + "rewards/accuracy_reward/std": 2.854785919189453, + "rewards/ngram_similarity_reward/mean": 0.6199604272842407, + "rewards/ngram_similarity_reward/std": 0.3590463697910309, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 438.359375, + "completions/mean_terminated_length": 438.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.6896397404341016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11437889188528061, + "learning_rate": 4.0150455689681805e-06, + "loss": -0.0005, + "num_tokens": 246081599.0, + "reward": 3.605909824371338, + "reward_std": 1.5501439571380615, + "rewards/accuracy_reward/mean": 2.8125, + "rewards/accuracy_reward/std": 3.080275297164917, + "rewards/ngram_similarity_reward/mean": 0.7934097647666931, + "rewards/ngram_similarity_reward/std": 0.28618934750556946, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 478.21875, + "completions/mean_terminated_length": 478.21875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.6900872678451555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07439448684453964, + "learning_rate": 4.013668509764552e-06, + "loss": -0.0131, + "num_tokens": 246252285.0, + "reward": 6.477996349334717, + "reward_std": 0.10430392622947693, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9779964089393616, + "rewards/ngram_similarity_reward/std": 0.19902634620666504, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 442.765625, + "completions/mean_terminated_length": 442.765625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6905347952562094, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09308404475450516, + "learning_rate": 4.012290758773106e-06, + "loss": 0.0194, + "num_tokens": 246411934.0, + "reward": 2.091059446334839, + "reward_std": 1.5338801145553589, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.9035593271255493, + "rewards/ngram_similarity_reward/std": 0.14946863055229187, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 430.171875, + "completions/mean_terminated_length": 430.171875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6909823226672633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09399006515741348, + "learning_rate": 4.010912316748085e-06, + "loss": -0.0073, + "num_tokens": 246566153.0, + "reward": 4.816415786743164, + "reward_std": 0.13335630297660828, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8164159655570984, + "rewards/ngram_similarity_reward/std": 0.3032083511352539, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 537.8125, + "completions/mean_terminated_length": 537.8125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.6914298500783173, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0755753219127655, + "learning_rate": 4.009533184444111e-06, + "loss": -0.016, + "num_tokens": 246769197.0, + "reward": 3.2236671447753906, + "reward_std": 0.09550964832305908, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.723667323589325, + "rewards/ngram_similarity_reward/std": 0.3105979263782501, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 501.546875, + "completions/mean_terminated_length": 501.546875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6918773774893712, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07646788656711578, + "learning_rate": 4.008153362616179e-06, + "loss": 0.0118, + "num_tokens": 246949312.0, + "reward": 3.505671977996826, + "reward_std": 1.1457102298736572, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.7400468587875366, + "rewards/ngram_similarity_reward/std": 0.15715715289115906, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 508.1875, + "completions/mean_terminated_length": 508.1875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6923249049004252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11373983323574066, + "learning_rate": 4.006772852019664e-06, + "loss": -0.002, + "num_tokens": 247099228.0, + "reward": 1.571900486946106, + "reward_std": 0.07764595746994019, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5719004273414612, + "rewards/ngram_similarity_reward/std": 0.2418157160282135, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 511.453125, + "completions/mean_terminated_length": 511.453125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6927724323114791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09819337725639343, + "learning_rate": 4.0053916534103205e-06, + "loss": 0.0368, + "num_tokens": 247300873.0, + "reward": 3.8611645698547363, + "reward_std": 0.7926970720291138, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6111645102500916, + "rewards/ngram_similarity_reward/std": 0.3728707730770111, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 515.5, + "completions/mean_terminated_length": 515.5, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.693219959722533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08585703372955322, + "learning_rate": 4.004009767544276e-06, + "loss": -0.0004, + "num_tokens": 247444361.0, + "reward": 3.245168924331665, + "reward_std": 0.12304390221834183, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.745168924331665, + "rewards/ngram_similarity_reward/std": 0.1642819494009018, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 536.3125, + "completions/mean_terminated_length": 512.3175048828125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.693667487133587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08912550657987595, + "learning_rate": 4.002627195178037e-06, + "loss": 0.002, + "num_tokens": 247592973.0, + "reward": 3.1012842655181885, + "reward_std": 0.11610934138298035, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.601284384727478, + "rewards/ngram_similarity_reward/std": 0.3811487853527069, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 352.1875, + "completions/mean_terminated_length": 352.1875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.6941150145446409, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11063137650489807, + "learning_rate": 4.001243937068482e-06, + "loss": -0.0079, + "num_tokens": 247728553.0, + "reward": 6.3173675537109375, + "reward_std": 0.6602230072021484, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 1.0048675537109375, + "rewards/ngram_similarity_reward/std": 0.2490445375442505, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 482.28125, + "completions/mean_terminated_length": 482.28125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.6945625419556948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15815483033657074, + "learning_rate": 3.999859993972871e-06, + "loss": -0.0043, + "num_tokens": 247999067.0, + "reward": 3.154822587966919, + "reward_std": 0.07147978246212006, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6548227071762085, + "rewards/ngram_similarity_reward/std": 0.3912094235420227, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 481.359375, + "completions/mean_terminated_length": 481.359375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6950100693667487, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0771925300359726, + "learning_rate": 3.998475366648832e-06, + "loss": 0.0053, + "num_tokens": 248126306.0, + "reward": 5.524537086486816, + "reward_std": 0.746326744556427, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.49328747391700745, + "rewards/ngram_similarity_reward/std": 0.16534650325775146, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 528.5, + "completions/mean_terminated_length": 528.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.6954575967778026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08068442344665527, + "learning_rate": 3.9970900558543744e-06, + "loss": 0.0076, + "num_tokens": 248270066.0, + "reward": 3.2489542961120605, + "reward_std": 2.6484427452087402, + "rewards/accuracy_reward/mean": 2.453125, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.7958290576934814, + "rewards/ngram_similarity_reward/std": 0.2966296970844269, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 559.140625, + "completions/mean_terminated_length": 559.140625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6959051241888565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07420410960912704, + "learning_rate": 3.995704062347874e-06, + "loss": 0.0251, + "num_tokens": 248448299.0, + "reward": 4.802103519439697, + "reward_std": 0.5186323523521423, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.7083532810211182, + "rewards/ngram_similarity_reward/std": 0.2901707589626312, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 637.265625, + "completions/mean_terminated_length": 637.265625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.6963526515999104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06902759522199631, + "learning_rate": 3.994317386888089e-06, + "loss": 0.0283, + "num_tokens": 248630572.0, + "reward": 2.794793128967285, + "reward_std": 0.7708073258399963, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6697932481765747, + "rewards/ngram_similarity_reward/std": 0.3138458728790283, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 497.765625, + "completions/mean_terminated_length": 497.765625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.6968001790109645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09973353892564774, + "learning_rate": 3.992930030234144e-06, + "loss": -0.0031, + "num_tokens": 248775325.0, + "reward": 0.4560176730155945, + "reward_std": 1.1246377229690552, + "rewards/accuracy_reward/mean": -0.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.4872676730155945, + "rewards/ngram_similarity_reward/std": 0.1638980507850647, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 475.453125, + "completions/mean_terminated_length": 475.453125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.6972477064220184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08650655299425125, + "learning_rate": 3.9915419931455414e-06, + "loss": 0.0133, + "num_tokens": 248951002.0, + "reward": 3.812142848968506, + "reward_std": 0.8941503167152405, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.8433928489685059, + "rewards/ngram_similarity_reward/std": 0.2216542363166809, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 477.71875, + "completions/mean_terminated_length": 477.71875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6976952338330723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0839940756559372, + "learning_rate": 3.990153276382151e-06, + "loss": -0.0006, + "num_tokens": 249099256.0, + "reward": 4.668988227844238, + "reward_std": 0.07187686860561371, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6689878106117249, + "rewards/ngram_similarity_reward/std": 0.24075810611248016, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 400.40625, + "completions/mean_terminated_length": 400.40625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.6981427612441262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10552025586366653, + "learning_rate": 3.988763880704218e-06, + "loss": -0.016, + "num_tokens": 249260130.0, + "reward": 2.995523452758789, + "reward_std": 0.06344757974147797, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.49552327394485474, + "rewards/ngram_similarity_reward/std": 0.26851361989974976, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 472.953125, + "completions/mean_terminated_length": 447.952392578125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.6985902886551801, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10093589872121811, + "learning_rate": 3.987373806872362e-06, + "loss": -0.009, + "num_tokens": 249401023.0, + "reward": 4.924391269683838, + "reward_std": 1.5129709243774414, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.830641508102417, + "rewards/ngram_similarity_reward/std": 0.3078822195529938, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 403.46875, + "completions/mean_terminated_length": 403.46875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.699037816066234, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08264653384685516, + "learning_rate": 3.985983055647567e-06, + "loss": -0.005, + "num_tokens": 249551613.0, + "reward": 4.847428798675537, + "reward_std": 0.09491054713726044, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8474288582801819, + "rewards/ngram_similarity_reward/std": 0.3070674538612366, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 485.9375, + "completions/mean_terminated_length": 485.9375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.699485343477288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08242128044366837, + "learning_rate": 3.984591627791194e-06, + "loss": 0.0139, + "num_tokens": 249702969.0, + "reward": 3.1901519298553467, + "reward_std": 0.16040048003196716, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6901519894599915, + "rewards/ngram_similarity_reward/std": 0.32409507036209106, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 454.109375, + "completions/mean_terminated_length": 454.109375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.6999328708883419, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09580738097429276, + "learning_rate": 3.98319952406497e-06, + "loss": -0.0167, + "num_tokens": 249885008.0, + "reward": 4.166101932525635, + "reward_std": 1.3613916635513306, + "rewards/accuracy_reward/mean": 3.296875, + "rewards/accuracy_reward/std": 2.97171688079834, + "rewards/ngram_similarity_reward/mean": 0.8692273497581482, + "rewards/ngram_similarity_reward/std": 0.3243560492992401, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 524.296875, + "completions/mean_terminated_length": 475.1451416015625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.7003803982993958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09756094217300415, + "learning_rate": 3.981806745230995e-06, + "loss": -0.1086, + "num_tokens": 250025651.0, + "reward": 4.522310256958008, + "reward_std": 1.0277568101882935, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.8035602569580078, + "rewards/ngram_similarity_reward/std": 0.29595068097114563, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 446.734375, + "completions/mean_terminated_length": 446.734375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7008279257104497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08303641527891159, + "learning_rate": 3.980413292051737e-06, + "loss": 0.0055, + "num_tokens": 250157410.0, + "reward": 6.100274085998535, + "reward_std": 0.6671102046966553, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.8815240859985352, + "rewards/ngram_similarity_reward/std": 0.26843953132629395, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 455.3125, + "completions/mean_terminated_length": 455.3125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.7012754531215037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10217029601335526, + "learning_rate": 3.979019165290034e-06, + "loss": 0.0239, + "num_tokens": 250302742.0, + "reward": 5.739469528198242, + "reward_std": 0.4549495577812195, + "rewards/accuracy_reward/mean": 5.390625, + "rewards/accuracy_reward/std": 0.8750000596046448, + "rewards/ngram_similarity_reward/mean": 0.34884434938430786, + "rewards/ngram_similarity_reward/std": 0.268588125705719, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 549.09375, + "completions/mean_terminated_length": 549.09375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.7017229805325577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1244237869977951, + "learning_rate": 3.977624365709093e-06, + "loss": -0.0067, + "num_tokens": 250482428.0, + "reward": 4.415213584899902, + "reward_std": 0.19421711564064026, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.4308384656906128, + "rewards/ngram_similarity_reward/std": 0.22356411814689636, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 438.59375, + "completions/mean_terminated_length": 438.59375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7021705079436116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09684059768915176, + "learning_rate": 3.9762288940724875e-06, + "loss": -0.0028, + "num_tokens": 250659778.0, + "reward": 5.599760055541992, + "reward_std": 0.8724799752235413, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.7560100555419922, + "rewards/ngram_similarity_reward/std": 0.19458892941474915, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 447.90625, + "completions/mean_terminated_length": 447.90625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7026180353546655, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09088005870580673, + "learning_rate": 3.974832751144161e-06, + "loss": 0.0236, + "num_tokens": 250825052.0, + "reward": 4.93842887878418, + "reward_std": 0.06765494495630264, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9384286999702454, + "rewards/ngram_similarity_reward/std": 0.3072734475135803, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 421.640625, + "completions/mean_terminated_length": 421.640625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7030655627657194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12094370275735855, + "learning_rate": 3.973435937688424e-06, + "loss": 0.0044, + "num_tokens": 251016517.0, + "reward": 4.936328887939453, + "reward_std": 0.7063286900520325, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.748828649520874, + "rewards/ngram_similarity_reward/std": 0.2553362250328064, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 463.921875, + "completions/mean_terminated_length": 463.921875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.7035130901767733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12409054487943649, + "learning_rate": 3.972038454469951e-06, + "loss": 0.0217, + "num_tokens": 251184816.0, + "reward": 2.1978726387023926, + "reward_std": 0.8047740459442139, + "rewards/accuracy_reward/mean": 1.46875, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.7291226387023926, + "rewards/ngram_similarity_reward/std": 0.1631511002779007, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 512.40625, + "completions/mean_terminated_length": 512.40625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7039606175878272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10297142714262009, + "learning_rate": 3.97064030225379e-06, + "loss": -0.0207, + "num_tokens": 251333130.0, + "reward": 2.899536609649658, + "reward_std": 0.5663940906524658, + "rewards/accuracy_reward/mean": 2.546875, + "rewards/accuracy_reward/std": 3.077979803085327, + "rewards/ngram_similarity_reward/mean": 0.352661669254303, + "rewards/ngram_similarity_reward/std": 0.2078230232000351, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 568.125, + "completions/mean_terminated_length": 568.125, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.7044081449988812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12223118543624878, + "learning_rate": 3.969241481805349e-06, + "loss": -0.0197, + "num_tokens": 251512290.0, + "reward": 2.919192314147949, + "reward_std": 0.6469785571098328, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6066923141479492, + "rewards/ngram_similarity_reward/std": 0.35552290081977844, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 438.765625, + "completions/mean_terminated_length": 438.765625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.7048556724099351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1319248229265213, + "learning_rate": 3.9678419938904024e-06, + "loss": 0.0399, + "num_tokens": 251614851.0, + "reward": 4.733461380004883, + "reward_std": 2.085000514984131, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7334614992141724, + "rewards/ngram_similarity_reward/std": 0.2388819009065628, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 460.921875, + "completions/mean_terminated_length": 460.921875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.705303199820989, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12093572318553925, + "learning_rate": 3.966441839275096e-06, + "loss": 0.0198, + "num_tokens": 251819342.0, + "reward": 4.596960067749023, + "reward_std": 0.17896360158920288, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5969597101211548, + "rewards/ngram_similarity_reward/std": 0.26587197184562683, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 423.78125, + "completions/mean_terminated_length": 423.78125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.705750727232043, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12576833367347717, + "learning_rate": 3.965041018725931e-06, + "loss": -0.0031, + "num_tokens": 251989600.0, + "reward": 3.3358778953552246, + "reward_std": 1.182431697845459, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6483778953552246, + "rewards/ngram_similarity_reward/std": 0.3610571026802063, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 506.46875, + "completions/mean_terminated_length": 506.46875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7061982546430969, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12079954892396927, + "learning_rate": 3.9636395330097805e-06, + "loss": -0.0386, + "num_tokens": 252148190.0, + "reward": 4.8347649574279785, + "reward_std": 0.6759188175201416, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.553514838218689, + "rewards/ngram_similarity_reward/std": 0.31114792823791504, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 614.390625, + "completions/mean_terminated_length": 518.8167114257812, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7066457820541509, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13940861821174622, + "learning_rate": 3.96223738289388e-06, + "loss": -0.0085, + "num_tokens": 252293943.0, + "reward": 4.402132987976074, + "reward_std": 0.8495877981185913, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5896329879760742, + "rewards/ngram_similarity_reward/std": 0.29393261671066284, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 480.640625, + "completions/mean_terminated_length": 455.7619323730469, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7070933094652048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09094911813735962, + "learning_rate": 3.960834569145829e-06, + "loss": -0.0164, + "num_tokens": 252435440.0, + "reward": 3.8131513595581055, + "reward_std": 1.889657974243164, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.7506515383720398, + "rewards/ngram_similarity_reward/std": 0.3241376578807831, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1069.0, + "completions/max_terminated_length": 1069.0, + "completions/mean_length": 564.640625, + "completions/mean_terminated_length": 564.640625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.7075408368762587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06971049308776855, + "learning_rate": 3.959431092533588e-06, + "loss": -0.0067, + "num_tokens": 252647945.0, + "reward": 5.790827751159668, + "reward_std": 0.6532076001167297, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.5720778107643127, + "rewards/ngram_similarity_reward/std": 0.325927734375, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 440.34375, + "completions/mean_terminated_length": 440.34375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.7079883642873126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10880489647388458, + "learning_rate": 3.958026953825482e-06, + "loss": 0.008, + "num_tokens": 252777839.0, + "reward": 4.542692184448242, + "reward_std": 0.07795602083206177, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5426921844482422, + "rewards/ngram_similarity_reward/std": 0.28923484683036804, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 479.71875, + "completions/mean_terminated_length": 454.8254089355469, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7084358916983665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13043983280658722, + "learning_rate": 3.9566221537901985e-06, + "loss": 0.0307, + "num_tokens": 252915069.0, + "reward": 3.0502660274505615, + "reward_std": 0.10106834024190903, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5502660274505615, + "rewards/ngram_similarity_reward/std": 0.21487337350845337, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 508.28125, + "completions/mean_terminated_length": 508.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.7088834191094204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09665711969137192, + "learning_rate": 3.955216693196787e-06, + "loss": -0.0044, + "num_tokens": 253079119.0, + "reward": 4.397170066833496, + "reward_std": 0.5835081338882446, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5846704244613647, + "rewards/ngram_similarity_reward/std": 0.08703292161226273, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 496.1875, + "completions/mean_terminated_length": 496.1875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7093309465204743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07534337788820267, + "learning_rate": 3.95381057281466e-06, + "loss": -0.0034, + "num_tokens": 253218875.0, + "reward": 4.854658126831055, + "reward_std": 0.0570051483809948, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.854658305644989, + "rewards/ngram_similarity_reward/std": 0.33798474073410034, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 345.46875, + "completions/mean_terminated_length": 345.46875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.7097784739315283, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08280720561742783, + "learning_rate": 3.952403793413587e-06, + "loss": 0.0222, + "num_tokens": 253326457.0, + "reward": 4.747356414794922, + "reward_std": 0.03861922398209572, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7473564147949219, + "rewards/ngram_similarity_reward/std": 0.4246657192707062, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 414.625, + "completions/mean_terminated_length": 414.625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7102260013425822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11492746323347092, + "learning_rate": 3.950996355763704e-06, + "loss": 0.0138, + "num_tokens": 253440257.0, + "reward": 5.836974143981934, + "reward_std": 0.7199723124504089, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.618224024772644, + "rewards/ngram_similarity_reward/std": 0.3036477565765381, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 465.90625, + "completions/mean_terminated_length": 465.90625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.7106735287536362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09943797439336777, + "learning_rate": 3.949588260635502e-06, + "loss": 0.02, + "num_tokens": 253591483.0, + "reward": 4.5792107582092285, + "reward_std": 0.09714803099632263, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5792109370231628, + "rewards/ngram_similarity_reward/std": 0.3758523166179657, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 634.03125, + "completions/mean_terminated_length": 634.03125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.7111210561646901, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07041703164577484, + "learning_rate": 3.948179508799835e-06, + "loss": 0.0055, + "num_tokens": 253754333.0, + "reward": 4.734063148498535, + "reward_std": 0.43159162998199463, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.640312910079956, + "rewards/ngram_similarity_reward/std": 0.3765218257904053, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 477.6875, + "completions/mean_terminated_length": 477.6875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.711568583575744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11647907644510269, + "learning_rate": 3.946770101027917e-06, + "loss": 0.0276, + "num_tokens": 253934393.0, + "reward": 3.272484540939331, + "reward_std": 0.09996727108955383, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7724847793579102, + "rewards/ngram_similarity_reward/std": 0.1062595322728157, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 437.765625, + "completions/mean_terminated_length": 437.765625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.712016110986798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1312415599822998, + "learning_rate": 3.945360038091317e-06, + "loss": 0.0219, + "num_tokens": 254156698.0, + "reward": 3.7038168907165527, + "reward_std": 2.1661648750305176, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.7350670099258423, + "rewards/ngram_similarity_reward/std": 0.2963406443595886, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 491.609375, + "completions/mean_terminated_length": 491.609375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7124636383978519, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12840750813484192, + "learning_rate": 3.9439493207619695e-06, + "loss": -0.0066, + "num_tokens": 254378305.0, + "reward": 5.733854293823242, + "reward_std": 1.2718347311019897, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.7026040554046631, + "rewards/ngram_similarity_reward/std": 0.33394986391067505, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 675.25, + "completions/mean_terminated_length": 558.915283203125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.7129111658089058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10053110122680664, + "learning_rate": 3.942537949812161e-06, + "loss": -0.0773, + "num_tokens": 254525825.0, + "reward": 3.607769250869751, + "reward_std": 0.7840161323547363, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.732769250869751, + "rewards/ngram_similarity_reward/std": 0.4627540111541748, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 467.234375, + "completions/mean_terminated_length": 467.234375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.7133586932199597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07122514396905899, + "learning_rate": 3.941125926014537e-06, + "loss": 0.0148, + "num_tokens": 254681904.0, + "reward": 4.114041805267334, + "reward_std": 1.4079729318618774, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.8640419244766235, + "rewards/ngram_similarity_reward/std": 0.2687089443206787, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 653.484375, + "completions/mean_terminated_length": 608.5, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.7138062206310136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08103739470243454, + "learning_rate": 3.939713250142104e-06, + "loss": -0.0419, + "num_tokens": 254838271.0, + "reward": 5.720540523529053, + "reward_std": 0.8042657375335693, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.6892905235290527, + "rewards/ngram_similarity_reward/std": 0.24583858251571655, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 544.15625, + "completions/mean_terminated_length": 495.6451416015625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.7142537480420675, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0563306026160717, + "learning_rate": 3.938299922968223e-06, + "loss": 0.0129, + "num_tokens": 254982857.0, + "reward": 4.760640621185303, + "reward_std": 0.04776597395539284, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7606406211853027, + "rewards/ngram_similarity_reward/std": 0.267974317073822, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 513.84375, + "completions/mean_terminated_length": 513.84375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.7147012754531215, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0818086639046669, + "learning_rate": 3.936885945266609e-06, + "loss": -0.0053, + "num_tokens": 255121967.0, + "reward": 1.653555989265442, + "reward_std": 0.6274036765098572, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.46605589985847473, + "rewards/ngram_similarity_reward/std": 0.19381797313690186, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 528.53125, + "completions/mean_terminated_length": 528.53125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.7151488028641755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08734636008739471, + "learning_rate": 3.935471317811338e-06, + "loss": 0.0064, + "num_tokens": 255254913.0, + "reward": 5.333474159240723, + "reward_std": 0.8434069156646729, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.6772241592407227, + "rewards/ngram_similarity_reward/std": 0.3647167682647705, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 501.078125, + "completions/mean_terminated_length": 501.078125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.7155963302752294, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05728432536125183, + "learning_rate": 3.9340560413768384e-06, + "loss": 0.0021, + "num_tokens": 255387462.0, + "reward": 1.715783715248108, + "reward_std": 0.13839605450630188, + "rewards/accuracy_reward/mean": 0.96875, + "rewards/accuracy_reward/std": 2.6425621509552, + "rewards/ngram_similarity_reward/mean": 0.7470337152481079, + "rewards/ngram_similarity_reward/std": 0.3163996934890747, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 520.71875, + "completions/mean_terminated_length": 520.71875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7160438576862833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1150129958987236, + "learning_rate": 3.932640116737896e-06, + "loss": -0.0153, + "num_tokens": 255568452.0, + "reward": 1.610967755317688, + "reward_std": 0.1124044731259346, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6109675168991089, + "rewards/ngram_similarity_reward/std": 0.20065845549106598, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 478.046875, + "completions/mean_terminated_length": 478.046875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.7164913850973372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10712695121765137, + "learning_rate": 3.931223544669649e-06, + "loss": -0.0093, + "num_tokens": 255726407.0, + "reward": 1.8906025886535645, + "reward_std": 1.1437827348709106, + "rewards/accuracy_reward/mean": 1.375, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.515602707862854, + "rewards/ngram_similarity_reward/std": 0.3079753518104553, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 425.71875, + "completions/mean_terminated_length": 425.71875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.7169389125083911, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09294099360704422, + "learning_rate": 3.929806325947591e-06, + "loss": -0.0115, + "num_tokens": 255862725.0, + "reward": 2.9013173580169678, + "reward_std": 0.1599607616662979, + "rewards/accuracy_reward/mean": 2.46875, + "rewards/accuracy_reward/std": 3.06007981300354, + "rewards/ngram_similarity_reward/mean": 0.4325675070285797, + "rewards/ngram_similarity_reward/std": 0.17533192038536072, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 450.4375, + "completions/mean_terminated_length": 450.4375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.7173864399194451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10121306031942368, + "learning_rate": 3.9283884613475706e-06, + "loss": -0.0115, + "num_tokens": 256010737.0, + "reward": 3.354915142059326, + "reward_std": 1.0463835000991821, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6674151420593262, + "rewards/ngram_similarity_reward/std": 0.2502375841140747, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 632.484375, + "completions/mean_terminated_length": 610.0159301757812, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.717833967330499, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1088629886507988, + "learning_rate": 3.92696995164579e-06, + "loss": 0.002, + "num_tokens": 256189120.0, + "reward": 4.064433574676514, + "reward_std": 1.296493649482727, + "rewards/accuracy_reward/mean": 3.421875, + "rewards/accuracy_reward/std": 2.896657705307007, + "rewards/ngram_similarity_reward/mean": 0.6425585746765137, + "rewards/ngram_similarity_reward/std": 0.35586273670196533, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 432.828125, + "completions/mean_terminated_length": 432.828125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7182814947415529, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10701368004083633, + "learning_rate": 3.925550797618804e-06, + "loss": 0.0042, + "num_tokens": 256354469.0, + "reward": 4.758089065551758, + "reward_std": 0.5311642289161682, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.8518393635749817, + "rewards/ngram_similarity_reward/std": 0.24525253474712372, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 453.359375, + "completions/mean_terminated_length": 453.359375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.7187290221526068, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1467125415802002, + "learning_rate": 3.92413100004352e-06, + "loss": -0.0263, + "num_tokens": 256571372.0, + "reward": 6.293436050415039, + "reward_std": 0.13439010083675385, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7934361100196838, + "rewards/ngram_similarity_reward/std": 0.3361327350139618, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 433.765625, + "completions/mean_terminated_length": 433.765625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.7191765495636607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10858814418315887, + "learning_rate": 3.922710559697196e-06, + "loss": 0.0318, + "num_tokens": 256800589.0, + "reward": 5.375655651092529, + "reward_std": 1.4599380493164062, + "rewards/accuracy_reward/mean": 4.46875, + "rewards/accuracy_reward/std": 2.2815253734588623, + "rewards/ngram_similarity_reward/mean": 0.9069056510925293, + "rewards/ngram_similarity_reward/std": 0.26185810565948486, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 579.421875, + "completions/mean_terminated_length": 579.421875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.7196240769747148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09472686797380447, + "learning_rate": 3.921289477357445e-06, + "loss": 0.0087, + "num_tokens": 256949304.0, + "reward": 4.724700450897217, + "reward_std": 0.12030484527349472, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7247006297111511, + "rewards/ngram_similarity_reward/std": 0.21106944978237152, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 431.421875, + "completions/mean_terminated_length": 431.421875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.7200716043857687, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11076363176107407, + "learning_rate": 3.919867753802231e-06, + "loss": 0.0411, + "num_tokens": 257139651.0, + "reward": 3.8031749725341797, + "reward_std": 1.1167179346084595, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.8344252109527588, + "rewards/ngram_similarity_reward/std": 0.3208877742290497, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 475.5625, + "completions/mean_terminated_length": 475.5625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.7205191317968226, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08394243568181992, + "learning_rate": 3.918445389809866e-06, + "loss": -0.0311, + "num_tokens": 257289575.0, + "reward": 6.31013822555542, + "reward_std": 0.06235632672905922, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8101381063461304, + "rewards/ngram_similarity_reward/std": 0.304575115442276, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 568.328125, + "completions/mean_terminated_length": 520.5967407226562, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7209666592078765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08711081743240356, + "learning_rate": 3.9170223861590165e-06, + "loss": 0.0208, + "num_tokens": 257431964.0, + "reward": 3.1470353603363037, + "reward_std": 0.9846519231796265, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.7564102411270142, + "rewards/ngram_similarity_reward/std": 0.21799270808696747, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 424.515625, + "completions/mean_terminated_length": 424.515625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7214141866189304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10729598253965378, + "learning_rate": 3.915598743628695e-06, + "loss": 0.0266, + "num_tokens": 257562317.0, + "reward": 3.5499908924102783, + "reward_std": 1.1870309114456177, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.7687408924102783, + "rewards/ngram_similarity_reward/std": 0.3213869333267212, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 490.859375, + "completions/mean_terminated_length": 490.859375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7218617140299843, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06329638510942459, + "learning_rate": 3.9141744629982695e-06, + "loss": -0.0139, + "num_tokens": 257709844.0, + "reward": 3.0198938846588135, + "reward_std": 0.43414995074272156, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6136438846588135, + "rewards/ngram_similarity_reward/std": 0.38070380687713623, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 477.171875, + "completions/mean_terminated_length": 477.171875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7223092414410383, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0825599655508995, + "learning_rate": 3.912749545047452e-06, + "loss": 0.0323, + "num_tokens": 257902575.0, + "reward": 4.817748069763184, + "reward_std": 0.1024700403213501, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8177484273910522, + "rewards/ngram_similarity_reward/std": 0.30864542722702026, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 513.359375, + "completions/mean_terminated_length": 513.359375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.7227567688520922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1094784289598465, + "learning_rate": 3.911323990556305e-06, + "loss": 0.0087, + "num_tokens": 258045206.0, + "reward": 3.0424697399139404, + "reward_std": 0.1478545069694519, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5424696803092957, + "rewards/ngram_similarity_reward/std": 0.2889968752861023, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 596.25, + "completions/mean_terminated_length": 596.25, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7232042962631461, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07353372126817703, + "learning_rate": 3.90989780030524e-06, + "loss": -0.0124, + "num_tokens": 258212918.0, + "reward": 3.8062007427215576, + "reward_std": 1.2309160232543945, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.5562007427215576, + "rewards/ngram_similarity_reward/std": 0.3006802201271057, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 603.078125, + "completions/mean_terminated_length": 556.4677124023438, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.7236518236742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09781184047460556, + "learning_rate": 3.9084709750750185e-06, + "loss": -0.0391, + "num_tokens": 258380235.0, + "reward": 1.91732919216156, + "reward_std": 0.7352028489112854, + "rewards/accuracy_reward/mean": 1.21875, + "rewards/accuracy_reward/std": 2.8141860961914062, + "rewards/ngram_similarity_reward/mean": 0.6985790729522705, + "rewards/ngram_similarity_reward/std": 0.27080607414245605, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 743.140625, + "completions/mean_terminated_length": 529.6181640625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.7240993510852539, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11270378530025482, + "learning_rate": 3.907043515646745e-06, + "loss": -0.0517, + "num_tokens": 258552276.0, + "reward": 3.2019901275634766, + "reward_std": 1.109002709388733, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.6238652467727661, + "rewards/ngram_similarity_reward/std": 0.37827983498573303, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1641.0, + "completions/max_terminated_length": 1641.0, + "completions/mean_length": 488.875, + "completions/mean_terminated_length": 488.875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.724546878496308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12448471784591675, + "learning_rate": 3.905615422801875e-06, + "loss": 0.1146, + "num_tokens": 258716364.0, + "reward": 0.19647154211997986, + "reward_std": 0.22791625559329987, + "rewards/accuracy_reward/mean": -0.515625, + "rewards/accuracy_reward/std": 0.125, + "rewards/ngram_similarity_reward/mean": 0.7120965719223022, + "rewards/ngram_similarity_reward/std": 0.22158174216747284, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 547.75, + "completions/mean_terminated_length": 523.9365234375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.7249944059073619, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08939939737319946, + "learning_rate": 3.904186697322209e-06, + "loss": 0.0042, + "num_tokens": 258862076.0, + "reward": 4.555126190185547, + "reward_std": 0.5544596910476685, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7426260709762573, + "rewards/ngram_similarity_reward/std": 0.316176176071167, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 412.21875, + "completions/mean_terminated_length": 412.21875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7254419333184158, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0931994691491127, + "learning_rate": 3.902757339989893e-06, + "loss": 0.0073, + "num_tokens": 259029306.0, + "reward": 6.047171115875244, + "reward_std": 0.4441218376159668, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6409210562705994, + "rewards/ngram_similarity_reward/std": 0.23308861255645752, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 458.203125, + "completions/mean_terminated_length": 458.203125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.7258894607294697, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10467223078012466, + "learning_rate": 3.9013273515874225e-06, + "loss": -0.0121, + "num_tokens": 259172087.0, + "reward": 4.927596569061279, + "reward_std": 0.18634790182113647, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9275966882705688, + "rewards/ngram_similarity_reward/std": 0.2064925581216812, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 406.703125, + "completions/mean_terminated_length": 406.703125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7263369881405236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11514323949813843, + "learning_rate": 3.899896732897635e-06, + "loss": -0.0087, + "num_tokens": 259337988.0, + "reward": 5.104360580444336, + "reward_std": 0.10654406249523163, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 1.1043603420257568, + "rewards/ngram_similarity_reward/std": 0.13795940577983856, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 491.75, + "completions/mean_terminated_length": 491.75, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.7267845155515775, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10860388725996017, + "learning_rate": 3.898465484703713e-06, + "loss": 0.0192, + "num_tokens": 259482692.0, + "reward": 5.41290283203125, + "reward_std": 1.1937735080718994, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.7566529512405396, + "rewards/ngram_similarity_reward/std": 0.16372349858283997, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 471.40625, + "completions/mean_terminated_length": 446.3809814453125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.7272320429626314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12862062454223633, + "learning_rate": 3.897033607789187e-06, + "loss": -0.0043, + "num_tokens": 259662430.0, + "reward": 3.2956037521362305, + "reward_std": 0.11109915375709534, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7956037521362305, + "rewards/ngram_similarity_reward/std": 0.2544381022453308, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 441.921875, + "completions/mean_terminated_length": 416.4285888671875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7276795703736854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15487508475780487, + "learning_rate": 3.895601102937929e-06, + "loss": -0.0359, + "num_tokens": 259842057.0, + "reward": 3.8578715324401855, + "reward_std": 0.7877907752990723, + "rewards/accuracy_reward/mean": 3.0625, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.7953715920448303, + "rewards/ngram_similarity_reward/std": 0.35424208641052246, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 463.0, + "completions/mean_terminated_length": 463.0, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.7281270977847393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09510334581136703, + "learning_rate": 3.894167970934155e-06, + "loss": -0.0346, + "num_tokens": 259996745.0, + "reward": 5.00093936920166, + "reward_std": 0.6892349123954773, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.7196898460388184, + "rewards/ngram_similarity_reward/std": 0.29325228929519653, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 400.28125, + "completions/mean_terminated_length": 400.28125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.7285746251957932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12284601479768753, + "learning_rate": 3.892734212562423e-06, + "loss": -0.0202, + "num_tokens": 260148267.0, + "reward": 5.090183734893799, + "reward_std": 1.8874471187591553, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.5276838541030884, + "rewards/ngram_similarity_reward/std": 0.2353626936674118, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 571.59375, + "completions/mean_terminated_length": 571.59375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.7290221526068472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09873700886964798, + "learning_rate": 3.891299828607639e-06, + "loss": 0.0696, + "num_tokens": 260276849.0, + "reward": 3.9734854698181152, + "reward_std": 1.4497017860412598, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.8172354698181152, + "rewards/ngram_similarity_reward/std": 0.32224228978157043, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 508.5625, + "completions/mean_terminated_length": 508.5625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.7294696800179011, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07983771711587906, + "learning_rate": 3.889864819855044e-06, + "loss": -0.0056, + "num_tokens": 260453285.0, + "reward": 4.735306739807129, + "reward_std": 0.15424823760986328, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.735306978225708, + "rewards/ngram_similarity_reward/std": 0.24989010393619537, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 518.203125, + "completions/mean_terminated_length": 518.203125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.729917207428955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13449940085411072, + "learning_rate": 3.8884291870902285e-06, + "loss": -0.0069, + "num_tokens": 260610450.0, + "reward": 3.005706310272217, + "reward_std": 0.621200680732727, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.31820613145828247, + "rewards/ngram_similarity_reward/std": 0.18179622292518616, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 431.796875, + "completions/mean_terminated_length": 431.796875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.730364734840009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.112546406686306, + "learning_rate": 3.886992931099118e-06, + "loss": 0.0252, + "num_tokens": 260754421.0, + "reward": 2.45912504196167, + "reward_std": 0.8453289270401001, + "rewards/accuracy_reward/mean": 1.84375, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6153750419616699, + "rewards/ngram_similarity_reward/std": 0.33217865228652954, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 424.546875, + "completions/mean_terminated_length": 424.546875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7308122622510629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09754940122365952, + "learning_rate": 3.885556052667985e-06, + "loss": 0.0249, + "num_tokens": 260894936.0, + "reward": 6.336062908172607, + "reward_std": 0.10131148993968964, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8360626697540283, + "rewards/ngram_similarity_reward/std": 0.15345709025859833, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 428.375, + "completions/mean_terminated_length": 428.375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.7312597896621168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12307172268629074, + "learning_rate": 3.88411855258344e-06, + "loss": 0.0222, + "num_tokens": 261005072.0, + "reward": 6.0622735023498535, + "reward_std": 1.0507888793945312, + "rewards/accuracy_reward/mean": 5.125, + "rewards/accuracy_reward/std": 1.4638501405715942, + "rewards/ngram_similarity_reward/mean": 0.9372736215591431, + "rewards/ngram_similarity_reward/std": 0.2874011695384979, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 493.65625, + "completions/mean_terminated_length": 493.65625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7317073170731707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10293906927108765, + "learning_rate": 3.88268043163243e-06, + "loss": -0.0203, + "num_tokens": 261142730.0, + "reward": 4.750032424926758, + "reward_std": 0.14096632599830627, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7500326633453369, + "rewards/ngram_similarity_reward/std": 0.24176648259162903, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 443.28125, + "completions/mean_terminated_length": 443.28125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7321548444842246, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06822414696216583, + "learning_rate": 3.881241690602251e-06, + "loss": 0.0066, + "num_tokens": 261299244.0, + "reward": 5.113492012023926, + "reward_std": 0.5794992446899414, + "rewards/accuracy_reward/mean": 4.1875, + "rewards/accuracy_reward/std": 2.5, + "rewards/ngram_similarity_reward/mean": 0.9259923100471497, + "rewards/ngram_similarity_reward/std": 0.17460310459136963, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 617.828125, + "completions/mean_terminated_length": 617.828125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.7326023718952785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08448846638202667, + "learning_rate": 3.879802330280531e-06, + "loss": 0.0022, + "num_tokens": 261468753.0, + "reward": 4.327930927276611, + "reward_std": 0.6729658842086792, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6091808676719666, + "rewards/ngram_similarity_reward/std": 0.269133061170578, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 480.234375, + "completions/mean_terminated_length": 480.234375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7330498993063325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13625337183475494, + "learning_rate": 3.878362351455237e-06, + "loss": -0.0071, + "num_tokens": 261594816.0, + "reward": 4.344447135925293, + "reward_std": 0.835814893245697, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.7194472551345825, + "rewards/ngram_similarity_reward/std": 0.2889397144317627, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 511.921875, + "completions/mean_terminated_length": 487.5397033691406, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.7334974267173865, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07077033817768097, + "learning_rate": 3.87692175491468e-06, + "loss": 0.0621, + "num_tokens": 261713163.0, + "reward": 4.173326015472412, + "reward_std": 0.8966077566146851, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6420760154724121, + "rewards/ngram_similarity_reward/std": 0.3697829842567444, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 558.265625, + "completions/mean_terminated_length": 558.265625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7339449541284404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07937997579574585, + "learning_rate": 3.875480541447505e-06, + "loss": -0.0224, + "num_tokens": 261874140.0, + "reward": 3.18398380279541, + "reward_std": 0.10827402770519257, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6839838027954102, + "rewards/ngram_similarity_reward/std": 0.29379263520240784, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 567.0625, + "completions/mean_terminated_length": 543.5556030273438, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.7343924815394943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09610205143690109, + "learning_rate": 3.874038711842696e-06, + "loss": 0.009, + "num_tokens": 262002864.0, + "reward": 4.4142889976501465, + "reward_std": 0.9095728993415833, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6955393552780151, + "rewards/ngram_similarity_reward/std": 0.2655259668827057, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 501.8125, + "completions/mean_terminated_length": 477.2698669433594, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.7348400089505482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12218975275754929, + "learning_rate": 3.872596266889572e-06, + "loss": 0.0507, + "num_tokens": 262143108.0, + "reward": 3.1528711318969727, + "reward_std": 0.2617282271385193, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.1305904388427734, + "rewards/ngram_similarity_reward/mean": 0.7466210126876831, + "rewards/ngram_similarity_reward/std": 0.25369471311569214, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 515.96875, + "completions/mean_terminated_length": 515.96875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.7352875363616022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09477532655000687, + "learning_rate": 3.871153207377795e-06, + "loss": 0.0101, + "num_tokens": 262303218.0, + "reward": 4.540840148925781, + "reward_std": 0.11984336376190186, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5408403277397156, + "rewards/ngram_similarity_reward/std": 0.34968307614326477, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 440.109375, + "completions/mean_terminated_length": 440.109375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7357350637726561, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11149965226650238, + "learning_rate": 3.869709534097355e-06, + "loss": 0.0133, + "num_tokens": 262478761.0, + "reward": 2.4047563076019287, + "reward_std": 1.3677046298980713, + "rewards/accuracy_reward/mean": 1.75, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.6547563672065735, + "rewards/ngram_similarity_reward/std": 0.2777254283428192, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 475.46875, + "completions/mean_terminated_length": 475.46875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.73618259118371, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11225708574056625, + "learning_rate": 3.868265247838586e-06, + "loss": 0.0002, + "num_tokens": 262645991.0, + "reward": 4.428684234619141, + "reward_std": 0.18725836277008057, + "rewards/accuracy_reward/mean": 3.828125, + "rewards/accuracy_reward/std": 2.9279966354370117, + "rewards/ngram_similarity_reward/mean": 0.6005595922470093, + "rewards/ngram_similarity_reward/std": 0.22134476900100708, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 462.203125, + "completions/mean_terminated_length": 462.203125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7366301185947639, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10217977315187454, + "learning_rate": 3.866820349392152e-06, + "loss": -0.0361, + "num_tokens": 262765892.0, + "reward": 2.707876205444336, + "reward_std": 1.2375640869140625, + "rewards/accuracy_reward/mean": 1.9375, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.7703762054443359, + "rewards/ngram_similarity_reward/std": 0.2640071213245392, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 416.34375, + "completions/mean_terminated_length": 416.34375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7370776460058178, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08163312077522278, + "learning_rate": 3.865374839549054e-06, + "loss": 0.0528, + "num_tokens": 262899818.0, + "reward": 3.4912028312683105, + "reward_std": 0.12502653896808624, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.9912027716636658, + "rewards/ngram_similarity_reward/std": 0.18320074677467346, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 445.28125, + "completions/mean_terminated_length": 445.28125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.7375251734168717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13695028424263, + "learning_rate": 3.863928719100628e-06, + "loss": 0.0157, + "num_tokens": 263042124.0, + "reward": 4.577659606933594, + "reward_std": 0.6269991993904114, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.765159547328949, + "rewards/ngram_similarity_reward/std": 0.27326247096061707, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1583.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 460.15625, + "completions/mean_terminated_length": 442.3333740234375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7379727008279258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11578591167926788, + "learning_rate": 3.862481988838544e-06, + "loss": -0.0722, + "num_tokens": 263254199.0, + "reward": 4.636940956115723, + "reward_std": 0.883092999458313, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6369410753250122, + "rewards/ngram_similarity_reward/std": 0.376800537109375, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 462.890625, + "completions/mean_terminated_length": 462.890625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7384202282389797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10988388955593109, + "learning_rate": 3.861034649554807e-06, + "loss": -0.0338, + "num_tokens": 263384112.0, + "reward": 3.533763885498047, + "reward_std": 1.0205178260803223, + "rewards/accuracy_reward/mean": 2.765625, + "rewards/accuracy_reward/std": 3.0302298069000244, + "rewards/ngram_similarity_reward/mean": 0.7681391835212708, + "rewards/ngram_similarity_reward/std": 0.2180566042661667, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 462.21875, + "completions/mean_terminated_length": 437.0476379394531, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.7388677556500336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13337351381778717, + "learning_rate": 3.8595867020417525e-06, + "loss": -0.0145, + "num_tokens": 263544094.0, + "reward": 3.9548983573913574, + "reward_std": 1.4088985919952393, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.7048982381820679, + "rewards/ngram_similarity_reward/std": 0.22468942403793335, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 513.984375, + "completions/mean_terminated_length": 513.984375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.7393152830610875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11442842334508896, + "learning_rate": 3.858138147092051e-06, + "loss": -0.016, + "num_tokens": 263675885.0, + "reward": 5.1964521408081055, + "reward_std": 1.0896323919296265, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.9152022004127502, + "rewards/ngram_similarity_reward/std": 0.3200232684612274, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 426.484375, + "completions/mean_terminated_length": 426.484375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7397628104721414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10637952387332916, + "learning_rate": 3.856688985498707e-06, + "loss": -0.0017, + "num_tokens": 263810892.0, + "reward": 4.461467266082764, + "reward_std": 0.4828716814517975, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5552173256874084, + "rewards/ngram_similarity_reward/std": 0.2920112907886505, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 485.203125, + "completions/mean_terminated_length": 485.203125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.7402103378831953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1339215636253357, + "learning_rate": 3.855239218055055e-06, + "loss": -0.013, + "num_tokens": 263964265.0, + "reward": 4.873979568481445, + "reward_std": 1.5473817586898804, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8739794492721558, + "rewards/ngram_similarity_reward/std": 0.22126427292823792, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 420.0625, + "completions/mean_terminated_length": 420.0625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.7406578652942493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10984153300523758, + "learning_rate": 3.8537888455547595e-06, + "loss": -0.0018, + "num_tokens": 264089837.0, + "reward": 5.766287803649902, + "reward_std": 1.2951836585998535, + "rewards/accuracy_reward/mean": 4.921875, + "rewards/accuracy_reward/std": 1.8153201341629028, + "rewards/ngram_similarity_reward/mean": 0.8444128036499023, + "rewards/ngram_similarity_reward/std": 0.23695027828216553, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 569.640625, + "completions/mean_terminated_length": 569.640625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7411053927053032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09492413699626923, + "learning_rate": 3.85233786879182e-06, + "loss": 0.0257, + "num_tokens": 264220230.0, + "reward": 3.574841022491455, + "reward_std": 1.4794020652770996, + "rewards/accuracy_reward/mean": 2.96875, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.6060912609100342, + "rewards/ngram_similarity_reward/std": 0.24550238251686096, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 415.125, + "completions/mean_terminated_length": 415.125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7415529201163571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14240625500679016, + "learning_rate": 3.850886288560565e-06, + "loss": -0.0264, + "num_tokens": 264360862.0, + "reward": 4.8376312255859375, + "reward_std": 0.09781719744205475, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.837631344795227, + "rewards/ngram_similarity_reward/std": 0.21419653296470642, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 553.96875, + "completions/mean_terminated_length": 427.3559265136719, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.742000447527411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13936901092529297, + "learning_rate": 3.849434105655653e-06, + "loss": -0.0329, + "num_tokens": 264520412.0, + "reward": 3.005133628845215, + "reward_std": 0.6597869396209717, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.6926336884498596, + "rewards/ngram_similarity_reward/std": 0.25201693177223206, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 385.703125, + "completions/mean_terminated_length": 385.703125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7424479749384649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12147440761327744, + "learning_rate": 3.847981320872074e-06, + "loss": 0.0279, + "num_tokens": 264686105.0, + "reward": 2.880711078643799, + "reward_std": 1.4913966655731201, + "rewards/accuracy_reward/mean": 2.21875, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.6619612574577332, + "rewards/ngram_similarity_reward/std": 0.3021509349346161, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 435.453125, + "completions/mean_terminated_length": 435.453125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.742895502349519, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11805487424135208, + "learning_rate": 3.846527935005145e-06, + "loss": 0.0191, + "num_tokens": 264839110.0, + "reward": 3.053068161010742, + "reward_std": 0.47172486782073975, + "rewards/accuracy_reward/mean": 2.359375, + "rewards/accuracy_reward/std": 3.075077533721924, + "rewards/ngram_similarity_reward/mean": 0.6936930418014526, + "rewards/ngram_similarity_reward/std": 0.31582656502723694, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 491.265625, + "completions/mean_terminated_length": 491.265625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.7433430297605729, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0991583988070488, + "learning_rate": 3.845073948850513e-06, + "loss": -0.0153, + "num_tokens": 265004391.0, + "reward": 2.2496678829193115, + "reward_std": 1.1892794370651245, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.687167763710022, + "rewards/ngram_similarity_reward/std": 0.2876705527305603, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 531.0625, + "completions/mean_terminated_length": 531.0625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.7437905571716268, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10559695959091187, + "learning_rate": 3.843619363204157e-06, + "loss": 0.0238, + "num_tokens": 265174059.0, + "reward": 3.0393824577331543, + "reward_std": 0.5305798053741455, + "rewards/accuracy_reward/mean": 2.390625, + "rewards/accuracy_reward/std": 3.0400354862213135, + "rewards/ngram_similarity_reward/mean": 0.6487575769424438, + "rewards/ngram_similarity_reward/std": 0.33010560274124146, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 578.15625, + "completions/mean_terminated_length": 530.741943359375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.7442380845826807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10930699855089188, + "learning_rate": 3.842164178862378e-06, + "loss": -0.0353, + "num_tokens": 265346533.0, + "reward": 3.4777369499206543, + "reward_std": 0.6202689409255981, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.7902370691299438, + "rewards/ngram_similarity_reward/std": 0.23186592757701874, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 531.15625, + "completions/mean_terminated_length": 531.15625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7446856119937346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10400087386369705, + "learning_rate": 3.84070839662181e-06, + "loss": 0.0315, + "num_tokens": 265511775.0, + "reward": 3.239142417907715, + "reward_std": 0.08492843806743622, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7391422986984253, + "rewards/ngram_similarity_reward/std": 0.2556718587875366, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 378.46875, + "completions/mean_terminated_length": 378.46875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.7451331394047885, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12179583311080933, + "learning_rate": 3.839252017279412e-06, + "loss": 0.0367, + "num_tokens": 265716189.0, + "reward": 5.8060760498046875, + "reward_std": 0.6848320364952087, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.5873256921768188, + "rewards/ngram_similarity_reward/std": 0.4143521785736084, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 426.953125, + "completions/mean_terminated_length": 426.953125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.7455806668158425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09366989880800247, + "learning_rate": 3.83779504163247e-06, + "loss": 0.0202, + "num_tokens": 265836186.0, + "reward": 5.846687316894531, + "reward_std": 0.8473169207572937, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.8154371380805969, + "rewards/ngram_similarity_reward/std": 0.3438029885292053, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 435.828125, + "completions/mean_terminated_length": 435.828125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.7460281942268964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12821348011493683, + "learning_rate": 3.836337470478596e-06, + "loss": 0.0414, + "num_tokens": 265992207.0, + "reward": 2.483971118927002, + "reward_std": 1.8114967346191406, + "rewards/accuracy_reward/mean": 2.03125, + "rewards/accuracy_reward/std": 2.986577272415161, + "rewards/ngram_similarity_reward/mean": 0.4527212977409363, + "rewards/ngram_similarity_reward/std": 0.11671894788742065, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 482.046875, + "completions/mean_terminated_length": 482.046875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.7464757216379503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0851159542798996, + "learning_rate": 3.834879304615729e-06, + "loss": -0.0183, + "num_tokens": 266129634.0, + "reward": 2.963693618774414, + "reward_std": 0.6637592315673828, + "rewards/accuracy_reward/mean": 2.28125, + "rewards/accuracy_reward/std": 3.0522892475128174, + "rewards/ngram_similarity_reward/mean": 0.6824436187744141, + "rewards/ngram_similarity_reward/std": 0.16195547580718994, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 525.28125, + "completions/mean_terminated_length": 476.1612854003906, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7469232490490042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11129762977361679, + "learning_rate": 3.833420544842135e-06, + "loss": -0.0332, + "num_tokens": 266249396.0, + "reward": 6.464842796325684, + "reward_std": 0.08006338775157928, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9648431539535522, + "rewards/ngram_similarity_reward/std": 0.17534130811691284, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 406.078125, + "completions/mean_terminated_length": 406.078125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.7473707764600582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13044987618923187, + "learning_rate": 3.831961191956401e-06, + "loss": 0.0307, + "num_tokens": 266456153.0, + "reward": 4.403616905212402, + "reward_std": 1.0030522346496582, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.6848673820495605, + "rewards/ngram_similarity_reward/std": 0.2170443832874298, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 495.09375, + "completions/mean_terminated_length": 495.09375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.7478183038711121, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07495610415935516, + "learning_rate": 3.830501246757442e-06, + "loss": -0.021, + "num_tokens": 266640687.0, + "reward": 1.750803828239441, + "reward_std": 0.42258378863334656, + "rewards/accuracy_reward/mean": 1.09375, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6570536494255066, + "rewards/ngram_similarity_reward/std": 0.10247427970170975, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 418.734375, + "completions/mean_terminated_length": 418.734375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.7482658312821661, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12411162257194519, + "learning_rate": 3.829040710044495e-06, + "loss": 0.0021, + "num_tokens": 266802990.0, + "reward": 5.783257484436035, + "reward_std": 1.2124266624450684, + "rewards/accuracy_reward/mean": 4.984375, + "rewards/accuracy_reward/std": 1.790558934211731, + "rewards/ngram_similarity_reward/mean": 0.7988826036453247, + "rewards/ngram_similarity_reward/std": 0.21857917308807373, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 541.8125, + "completions/mean_terminated_length": 541.8125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.74871335869322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14073511958122253, + "learning_rate": 3.827579582617126e-06, + "loss": -0.002, + "num_tokens": 266970130.0, + "reward": 3.7232022285461426, + "reward_std": 1.6067060232162476, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.9419524669647217, + "rewards/ngram_similarity_reward/std": 0.2151007354259491, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 402.15625, + "completions/mean_terminated_length": 402.15625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.7491608861042739, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09434747695922852, + "learning_rate": 3.826117865275216e-06, + "loss": 0.0164, + "num_tokens": 267157980.0, + "reward": 2.489003896713257, + "reward_std": 0.8085304498672485, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.8327538967132568, + "rewards/ngram_similarity_reward/std": 0.19913837313652039, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 410.8125, + "completions/mean_terminated_length": 410.8125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7496084135153278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12702973186969757, + "learning_rate": 3.824655558818976e-06, + "loss": 0.0109, + "num_tokens": 267287584.0, + "reward": 4.820849895477295, + "reward_std": 0.1471467763185501, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8208498954772949, + "rewards/ngram_similarity_reward/std": 0.235096737742424, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 505.234375, + "completions/mean_terminated_length": 505.234375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7500559409263817, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09892650693655014, + "learning_rate": 3.823192664048936e-06, + "loss": 0.0008, + "num_tokens": 267458399.0, + "reward": 3.194218873977661, + "reward_std": 0.08959155529737473, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6942191123962402, + "rewards/ngram_similarity_reward/std": 0.254846453666687, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 413.5, + "completions/mean_terminated_length": 360.774169921875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7505034683374356, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1347137987613678, + "learning_rate": 3.82172918176595e-06, + "loss": 0.0065, + "num_tokens": 267674559.0, + "reward": 1.778525710105896, + "reward_std": 0.16010552644729614, + "rewards/accuracy_reward/mean": 0.984375, + "rewards/accuracy_reward/std": 2.6306629180908203, + "rewards/ngram_similarity_reward/mean": 0.7941508293151855, + "rewards/ngram_similarity_reward/std": 0.32695913314819336, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 558.28125, + "completions/mean_terminated_length": 534.6349487304688, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.7509509957484896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10573645681142807, + "learning_rate": 3.820265112771192e-06, + "loss": -0.0171, + "num_tokens": 267826433.0, + "reward": 3.6088709831237793, + "reward_std": 0.9199876189231873, + "rewards/accuracy_reward/mean": 3.3125, + "rewards/accuracy_reward/std": 2.948634386062622, + "rewards/ngram_similarity_reward/mean": 0.29637086391448975, + "rewards/ngram_similarity_reward/std": 0.1301681399345398, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 522.46875, + "completions/mean_terminated_length": 522.46875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.7513985231595435, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0744960680603981, + "learning_rate": 3.818800457866158e-06, + "loss": -0.0546, + "num_tokens": 267986175.0, + "reward": 4.9039812088012695, + "reward_std": 1.5562037229537964, + "rewards/accuracy_reward/mean": 4.09375, + "rewards/accuracy_reward/std": 2.5617377758026123, + "rewards/ngram_similarity_reward/mean": 0.8102311491966248, + "rewards/ngram_similarity_reward/std": 0.200358584523201, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 473.15625, + "completions/mean_terminated_length": 473.15625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.7518460505705975, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09306388348340988, + "learning_rate": 3.817335217852664e-06, + "loss": 0.0059, + "num_tokens": 268119897.0, + "reward": 5.6161651611328125, + "reward_std": 0.8234336972236633, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.9599153995513916, + "rewards/ngram_similarity_reward/std": 0.31456896662712097, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 494.0, + "completions/mean_terminated_length": 494.0, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.7522935779816514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1134127527475357, + "learning_rate": 3.8158693935328485e-06, + "loss": -0.0021, + "num_tokens": 268301369.0, + "reward": 4.461511611938477, + "reward_std": 0.17666134238243103, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.4771363437175751, + "rewards/ngram_similarity_reward/std": 0.3310698866844177, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 527.53125, + "completions/mean_terminated_length": 527.53125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.7527411053927053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09310425817966461, + "learning_rate": 3.814402985709167e-06, + "loss": -0.0041, + "num_tokens": 268438523.0, + "reward": 3.3843183517456055, + "reward_std": 0.16243869066238403, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.899943470954895, + "rewards/ngram_similarity_reward/std": 0.33668068051338196, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 461.125, + "completions/mean_terminated_length": 461.125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.7531886328037593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12268368154764175, + "learning_rate": 3.8129359951843963e-06, + "loss": 0.0474, + "num_tokens": 268600499.0, + "reward": 5.946240425109863, + "reward_std": 1.2668113708496094, + "rewards/accuracy_reward/mean": 5.203125, + "rewards/accuracy_reward/std": 1.3531819581985474, + "rewards/ngram_similarity_reward/mean": 0.7431154251098633, + "rewards/ngram_similarity_reward/std": 0.3180932402610779, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 484.71875, + "completions/mean_terminated_length": 459.90478515625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7536361602148132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10037180781364441, + "learning_rate": 3.811468422761631e-06, + "loss": 0.0605, + "num_tokens": 268782273.0, + "reward": 6.026999473571777, + "reward_std": 0.5377567410469055, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.6207495331764221, + "rewards/ngram_similarity_reward/std": 0.32750198245048523, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 443.203125, + "completions/mean_terminated_length": 443.203125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7540836876258671, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1014128252863884, + "learning_rate": 3.8100002692442855e-06, + "loss": 0.0167, + "num_tokens": 268932286.0, + "reward": 2.9368717670440674, + "reward_std": 0.045694585889577866, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.43687185645103455, + "rewards/ngram_similarity_reward/std": 0.22525519132614136, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 479.1875, + "completions/mean_terminated_length": 479.1875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.754531215036921, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08453965187072754, + "learning_rate": 3.8085315354360917e-06, + "loss": -0.0105, + "num_tokens": 269097226.0, + "reward": 6.354506492614746, + "reward_std": 0.05074494332075119, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8545065522193909, + "rewards/ngram_similarity_reward/std": 0.27964460849761963, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 416.03125, + "completions/mean_terminated_length": 416.03125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7549787424479749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0906090959906578, + "learning_rate": 3.807062222141099e-06, + "loss": 0.0053, + "num_tokens": 269278268.0, + "reward": 4.438193321228027, + "reward_std": 0.6636765003204346, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.6256935596466064, + "rewards/ngram_similarity_reward/std": 0.28762826323509216, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 576.0, + "completions/mean_terminated_length": 576.0, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.7554262698590288, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0749097540974617, + "learning_rate": 3.805592330163675e-06, + "loss": -0.0114, + "num_tokens": 269426076.0, + "reward": 4.431105136871338, + "reward_std": 0.4843612611293793, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.5404797792434692, + "rewards/ngram_similarity_reward/std": 0.2098207175731659, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 505.078125, + "completions/mean_terminated_length": 505.078125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.7558737972700827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10263252258300781, + "learning_rate": 3.804121860308502e-06, + "loss": 0.0431, + "num_tokens": 269539921.0, + "reward": 5.977553367614746, + "reward_std": 0.07637765258550644, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.4775530993938446, + "rewards/ngram_similarity_reward/std": 0.18223613500595093, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 701.5625, + "completions/mean_terminated_length": 587.4576416015625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.7563213246811367, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13813172280788422, + "learning_rate": 3.8026508133805806e-06, + "loss": -0.01, + "num_tokens": 269707701.0, + "reward": 2.7967331409454346, + "reward_std": 0.5754700899124146, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.4842332601547241, + "rewards/ngram_similarity_reward/std": 0.23618005216121674, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 453.140625, + "completions/mean_terminated_length": 453.140625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.7567688520921907, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11035829782485962, + "learning_rate": 3.801179190185227e-06, + "loss": -0.0044, + "num_tokens": 269895326.0, + "reward": 0.7430910468101501, + "reward_std": 0.8383191823959351, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.5868409872055054, + "rewards/ngram_similarity_reward/std": 0.20448924601078033, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 445.28125, + "completions/mean_terminated_length": 445.28125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.7572163795032446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16396446526050568, + "learning_rate": 3.799706991528072e-06, + "loss": 0.0013, + "num_tokens": 270028832.0, + "reward": 4.434956073760986, + "reward_std": 0.7647863626480103, + "rewards/accuracy_reward/mean": 3.71875, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.7162062525749207, + "rewards/ngram_similarity_reward/std": 0.3267899751663208, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 448.203125, + "completions/mean_terminated_length": 422.8095397949219, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7576639069142985, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10294193774461746, + "learning_rate": 3.7982342182150627e-06, + "loss": -0.0186, + "num_tokens": 270168109.0, + "reward": 4.480494499206543, + "reward_std": 0.8373005390167236, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 1.0429949760437012, + "rewards/ngram_similarity_reward/std": 0.19750304520130157, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 504.078125, + "completions/mean_terminated_length": 504.078125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.7581114343253524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12504678964614868, + "learning_rate": 3.7967608710524596e-06, + "loss": -0.0071, + "num_tokens": 270353554.0, + "reward": 1.8410844802856445, + "reward_std": 1.1014564037322998, + "rewards/accuracy_reward/mean": 1.28125, + "rewards/accuracy_reward/std": 2.7629566192626953, + "rewards/ngram_similarity_reward/mean": 0.5598344206809998, + "rewards/ngram_similarity_reward/std": 0.330642431974411, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 512.375, + "completions/mean_terminated_length": 462.83868408203125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.7585589617364064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13782398402690887, + "learning_rate": 3.7952869508468375e-06, + "loss": 0.0321, + "num_tokens": 270513194.0, + "reward": 3.069380760192871, + "reward_std": 1.9833970069885254, + "rewards/accuracy_reward/mean": 2.328125, + "rewards/accuracy_reward/std": 3.109405755996704, + "rewards/ngram_similarity_reward/mean": 0.7412558197975159, + "rewards/ngram_similarity_reward/std": 0.23963335156440735, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 584.734375, + "completions/mean_terminated_length": 584.734375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.7590064891474603, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08866584300994873, + "learning_rate": 3.793812458405086e-06, + "loss": 0.0096, + "num_tokens": 270672537.0, + "reward": 3.525939464569092, + "reward_std": 0.8003028631210327, + "rewards/accuracy_reward/mean": 3.25, + "rewards/accuracy_reward/std": 2.9277002811431885, + "rewards/ngram_similarity_reward/mean": 0.27593910694122314, + "rewards/ngram_similarity_reward/std": 0.16110624372959137, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 550.828125, + "completions/mean_terminated_length": 527.0635375976562, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7594540165585142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12685289978981018, + "learning_rate": 3.792337394534407e-06, + "loss": -0.0162, + "num_tokens": 270880942.0, + "reward": 4.023468494415283, + "reward_std": 1.6493728160858154, + "rewards/accuracy_reward/mean": 3.21875, + "rewards/accuracy_reward/std": 2.9732606410980225, + "rewards/ngram_similarity_reward/mean": 0.8047187328338623, + "rewards/ngram_similarity_reward/std": 0.2672255039215088, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 554.03125, + "completions/mean_terminated_length": 530.3175048828125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.7599015439695681, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08748823404312134, + "learning_rate": 3.7908617600423146e-06, + "loss": 0.0346, + "num_tokens": 271034832.0, + "reward": 5.809333801269531, + "reward_std": 1.2132689952850342, + "rewards/accuracy_reward/mean": 4.921875, + "rewards/accuracy_reward/std": 1.8153201341629028, + "rewards/ngram_similarity_reward/mean": 0.8874589204788208, + "rewards/ngram_similarity_reward/std": 0.20694543421268463, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 392.015625, + "completions/mean_terminated_length": 392.015625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.760349071380622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13442236185073853, + "learning_rate": 3.789385555736638e-06, + "loss": 0.0282, + "num_tokens": 271186993.0, + "reward": 6.382952690124512, + "reward_std": 0.2047431319952011, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8829526305198669, + "rewards/ngram_similarity_reward/std": 0.29756835103034973, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 458.203125, + "completions/mean_terminated_length": 458.203125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7607965987916759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10752265900373459, + "learning_rate": 3.7879087824255155e-06, + "loss": 0.0086, + "num_tokens": 271334814.0, + "reward": 6.193717956542969, + "reward_std": 0.10046614706516266, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.6937180757522583, + "rewards/ngram_similarity_reward/std": 0.2641277313232422, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 442.3125, + "completions/mean_terminated_length": 442.3125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.76124412620273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1317412108182907, + "learning_rate": 3.7864314409173977e-06, + "loss": 0.0131, + "num_tokens": 271500034.0, + "reward": 3.051396369934082, + "reward_std": 1.8890447616577148, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.645146369934082, + "rewards/ngram_similarity_reward/std": 0.323547899723053, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 395.84375, + "completions/mean_terminated_length": 395.84375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.7616916536137839, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10261240601539612, + "learning_rate": 3.7849535320210456e-06, + "loss": 0.0094, + "num_tokens": 271673816.0, + "reward": 6.512883186340332, + "reward_std": 0.15527239441871643, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 1.0128833055496216, + "rewards/ngram_similarity_reward/std": 0.1972905546426773, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 530.703125, + "completions/mean_terminated_length": 530.703125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7621391810248378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1261419802904129, + "learning_rate": 3.7834750565455337e-06, + "loss": -0.0152, + "num_tokens": 271904837.0, + "reward": 3.066795587539673, + "reward_std": 0.5125082731246948, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6605455875396729, + "rewards/ngram_similarity_reward/std": 0.3532313108444214, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 494.140625, + "completions/mean_terminated_length": 469.4762268066406, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.7625867084358917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0991603285074234, + "learning_rate": 3.7819960153002423e-06, + "loss": -0.003, + "num_tokens": 272044974.0, + "reward": 3.3295774459838867, + "reward_std": 0.6637980341911316, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.6577024459838867, + "rewards/ngram_similarity_reward/std": 0.24875979125499725, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 461.53125, + "completions/mean_terminated_length": 461.53125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7630342358469456, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1032610833644867, + "learning_rate": 3.7805164090948658e-06, + "loss": -0.0488, + "num_tokens": 272190544.0, + "reward": 3.4105939865112305, + "reward_std": 1.555128812789917, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.5355940461158752, + "rewards/ngram_similarity_reward/std": 0.1661282628774643, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 491.5, + "completions/mean_terminated_length": 466.7936706542969, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7634817632579995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09144928306341171, + "learning_rate": 3.779036238739404e-06, + "loss": -0.0045, + "num_tokens": 272327696.0, + "reward": 5.4266486167907715, + "reward_std": 0.7983545660972595, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.4891488254070282, + "rewards/ngram_similarity_reward/std": 0.2724377512931824, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 554.875, + "completions/mean_terminated_length": 531.1746215820312, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.7639292906690535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11611030995845795, + "learning_rate": 3.7775555050441693e-06, + "loss": 0.0478, + "num_tokens": 272491048.0, + "reward": 3.4665048122406006, + "reward_std": 1.703005313873291, + "rewards/accuracy_reward/mean": 2.71875, + "rewards/accuracy_reward/std": 3.08847713470459, + "rewards/ngram_similarity_reward/mean": 0.7477551102638245, + "rewards/ngram_similarity_reward/std": 0.30098485946655273, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 441.015625, + "completions/mean_terminated_length": 441.015625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7643768180801074, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0940161719918251, + "learning_rate": 3.7760742088197794e-06, + "loss": 0.0098, + "num_tokens": 272644425.0, + "reward": 4.572781562805176, + "reward_std": 0.4228802025318146, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.6665315628051758, + "rewards/ngram_similarity_reward/std": 0.4215388596057892, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 468.671875, + "completions/mean_terminated_length": 468.671875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7648243454911613, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11359170079231262, + "learning_rate": 3.774592350877161e-06, + "loss": -0.0031, + "num_tokens": 272790532.0, + "reward": 0.6768046617507935, + "reward_std": 0.7949711680412292, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 1.7354841232299805, + "rewards/ngram_similarity_reward/mean": 0.36430463194847107, + "rewards/ngram_similarity_reward/std": 0.21512088179588318, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 501.953125, + "completions/mean_terminated_length": 501.953125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7652718729022152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09563387185335159, + "learning_rate": 3.7731099320275484e-06, + "loss": 0.002, + "num_tokens": 272966801.0, + "reward": 3.2861225605010986, + "reward_std": 0.14564277231693268, + "rewards/accuracy_reward/mean": 2.484375, + "rewards/accuracy_reward/std": 3.0419929027557373, + "rewards/ngram_similarity_reward/mean": 0.8017475605010986, + "rewards/ngram_similarity_reward/std": 0.1357528418302536, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 419.34375, + "completions/mean_terminated_length": 419.34375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7657194003132692, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1052631288766861, + "learning_rate": 3.7716269530824835e-06, + "loss": 0.0322, + "num_tokens": 273093031.0, + "reward": 3.015035629272461, + "reward_std": 1.1161115169525146, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.8900353908538818, + "rewards/ngram_similarity_reward/std": 0.22037431597709656, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 465.921875, + "completions/mean_terminated_length": 465.921875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.7661669277243232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11694445461034775, + "learning_rate": 3.770143414853814e-06, + "loss": -0.0549, + "num_tokens": 273235154.0, + "reward": 1.3984278440475464, + "reward_std": 1.64403235912323, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 2.3603873252868652, + "rewards/ngram_similarity_reward/mean": 0.7734278440475464, + "rewards/ngram_similarity_reward/std": 0.29485347867012024, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 470.265625, + "completions/mean_terminated_length": 470.265625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.7666144551353771, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.18651999533176422, + "learning_rate": 3.7686593181536946e-06, + "loss": 0.0215, + "num_tokens": 273364435.0, + "reward": 6.231356620788574, + "reward_std": 0.06699755787849426, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7313565611839294, + "rewards/ngram_similarity_reward/std": 0.3515256941318512, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 667.796875, + "completions/mean_terminated_length": 470.6250305175781, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.767061982546431, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10543458163738251, + "learning_rate": 3.7671746637945845e-06, + "loss": -0.1146, + "num_tokens": 273501654.0, + "reward": 3.568516254425049, + "reward_std": 2.31913161277771, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6935162544250488, + "rewards/ngram_similarity_reward/std": 0.444365531206131, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 453.03125, + "completions/mean_terminated_length": 453.03125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.7675095099574849, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09862305223941803, + "learning_rate": 3.765689452589249e-06, + "loss": 0.0132, + "num_tokens": 273623208.0, + "reward": 4.911007881164551, + "reward_std": 0.06254077702760696, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9110076427459717, + "rewards/ngram_similarity_reward/std": 0.22745934128761292, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 496.78125, + "completions/mean_terminated_length": 472.15875244140625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7679570373685388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11860388517379761, + "learning_rate": 3.764203685350759e-06, + "loss": 0.0158, + "num_tokens": 273849834.0, + "reward": 4.710654258728027, + "reward_std": 1.5064451694488525, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7106548547744751, + "rewards/ngram_similarity_reward/std": 0.35538649559020996, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 549.90625, + "completions/mean_terminated_length": 422.94915771484375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7684045647795927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11047927290201187, + "learning_rate": 3.7627173628924878e-06, + "loss": 0.0317, + "num_tokens": 273995604.0, + "reward": 4.477602958679199, + "reward_std": 0.08160172402858734, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.4776029586791992, + "rewards/ngram_similarity_reward/std": 0.18633443117141724, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 486.5, + "completions/mean_terminated_length": 461.7143249511719, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.7688520921906467, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10446258634328842, + "learning_rate": 3.7612304860281142e-06, + "loss": -0.0119, + "num_tokens": 274136580.0, + "reward": 3.9350204467773438, + "reward_std": 0.8723170757293701, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.7787700891494751, + "rewards/ngram_similarity_reward/std": 0.22662793099880219, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 371.171875, + "completions/mean_terminated_length": 371.171875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.7692996196017006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14761660993099213, + "learning_rate": 3.7597430555716204e-06, + "loss": 0.0037, + "num_tokens": 274312911.0, + "reward": 3.5941073894500732, + "reward_std": 0.7306583523750305, + "rewards/accuracy_reward/mean": 2.875, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.7191075086593628, + "rewards/ngram_similarity_reward/std": 0.36091846227645874, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 512.484375, + "completions/mean_terminated_length": 512.484375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.7697471470127545, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08637955039739609, + "learning_rate": 3.7582550723372912e-06, + "loss": -0.001, + "num_tokens": 274448222.0, + "reward": 6.481459617614746, + "reward_std": 0.08800999820232391, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9814598560333252, + "rewards/ngram_similarity_reward/std": 0.13310606777668, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 589.578125, + "completions/mean_terminated_length": 589.578125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7701946744238085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.126104474067688, + "learning_rate": 3.7567665371397137e-06, + "loss": 0.0233, + "num_tokens": 274696915.0, + "reward": 3.328207492828369, + "reward_std": 0.5135577917098999, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.7500826120376587, + "rewards/ngram_similarity_reward/std": 0.2727869749069214, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 565.15625, + "completions/mean_terminated_length": 541.6190795898438, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.7706422018348624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08273998647928238, + "learning_rate": 3.7552774507937787e-06, + "loss": -0.0139, + "num_tokens": 274832173.0, + "reward": 4.147893905639648, + "reward_std": 1.1679359674453735, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.6166439056396484, + "rewards/ngram_similarity_reward/std": 0.17714762687683105, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 318.828125, + "completions/mean_terminated_length": 318.828125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7710897292459163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1377553939819336, + "learning_rate": 3.753787814114677e-06, + "loss": -0.0052, + "num_tokens": 274980034.0, + "reward": 3.1438984870910645, + "reward_std": 0.4386993646621704, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.5501485466957092, + "rewards/ngram_similarity_reward/std": 0.19643662869930267, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 620.140625, + "completions/mean_terminated_length": 524.9500122070312, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.7715372566569703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10216798633337021, + "learning_rate": 3.7522976279179013e-06, + "loss": 0.0368, + "num_tokens": 275136171.0, + "reward": 5.586757659912109, + "reward_std": 1.5851632356643677, + "rewards/accuracy_reward/mean": 4.84375, + "rewards/accuracy_reward/std": 1.8874586820602417, + "rewards/ngram_similarity_reward/mean": 0.7430075407028198, + "rewards/ngram_similarity_reward/std": 0.22118467092514038, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 468.421875, + "completions/mean_terminated_length": 468.421875, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.7719847840680242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1122996062040329, + "learning_rate": 3.7508068930192455e-06, + "loss": -0.0024, + "num_tokens": 275314214.0, + "reward": 3.8260443210601807, + "reward_std": 1.3485864400863647, + "rewards/accuracy_reward/mean": 3.15625, + "rewards/accuracy_reward/std": 2.950484275817871, + "rewards/ngram_similarity_reward/mean": 0.6697943210601807, + "rewards/ngram_similarity_reward/std": 0.2788363993167877, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 464.921875, + "completions/mean_terminated_length": 464.921875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7724323114790781, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13570821285247803, + "learning_rate": 3.749315610234802e-06, + "loss": -0.013, + "num_tokens": 275434529.0, + "reward": 5.731655120849609, + "reward_std": 0.5602239966392517, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.419155091047287, + "rewards/ngram_similarity_reward/std": 0.2204188108444214, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 407.65625, + "completions/mean_terminated_length": 407.65625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.772879838890132, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09329795837402344, + "learning_rate": 3.7478237803809677e-06, + "loss": -0.0051, + "num_tokens": 275629819.0, + "reward": 3.1534743309020996, + "reward_std": 1.598434567451477, + "rewards/accuracy_reward/mean": 2.3125, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.8409743309020996, + "rewards/ngram_similarity_reward/std": 0.2519665062427521, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 388.265625, + "completions/mean_terminated_length": 388.265625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7733273663011859, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08827347308397293, + "learning_rate": 3.7463314042744336e-06, + "loss": 0.0172, + "num_tokens": 275784204.0, + "reward": 4.787602424621582, + "reward_std": 0.42931514978408813, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.8813523054122925, + "rewards/ngram_similarity_reward/std": 0.28254538774490356, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 482.546875, + "completions/mean_terminated_length": 482.546875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.7737748937122398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16683056950569153, + "learning_rate": 3.7448384827321932e-06, + "loss": 0.0021, + "num_tokens": 275946383.0, + "reward": 4.025887966156006, + "reward_std": 1.2076332569122314, + "rewards/accuracy_reward/mean": 3.4375, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.5883880257606506, + "rewards/ngram_similarity_reward/std": 0.13875126838684082, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 1591.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 491.875, + "completions/mean_terminated_length": 456.4193420410156, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7742224211232938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16087134182453156, + "learning_rate": 3.7433450165715372e-06, + "loss": 0.0374, + "num_tokens": 276168649.0, + "reward": 4.721775054931641, + "reward_std": 0.24048519134521484, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7217749357223511, + "rewards/ngram_similarity_reward/std": 0.3546585142612457, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 494.21875, + "completions/mean_terminated_length": 494.21875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7746699485343477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13882260024547577, + "learning_rate": 3.7418510066100544e-06, + "loss": 0.0169, + "num_tokens": 276310327.0, + "reward": 2.9754281044006348, + "reward_std": 0.050156496465206146, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.47542816400527954, + "rewards/ngram_similarity_reward/std": 0.19159510731697083, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 444.359375, + "completions/mean_terminated_length": 444.359375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7751174759454017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07742560654878616, + "learning_rate": 3.740356453665632e-06, + "loss": 0.0196, + "num_tokens": 276477038.0, + "reward": 6.4582014083862305, + "reward_std": 0.10869987308979034, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9582017660140991, + "rewards/ngram_similarity_reward/std": 0.1663198322057724, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 369.203125, + "completions/mean_terminated_length": 369.203125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7755650033564556, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08513515442609787, + "learning_rate": 3.738861358556455e-06, + "loss": 0.007, + "num_tokens": 276607531.0, + "reward": 6.405516624450684, + "reward_std": 0.11752209067344666, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.9055166244506836, + "rewards/ngram_similarity_reward/std": 0.29450514912605286, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 447.515625, + "completions/mean_terminated_length": 447.515625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7760125307675095, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11043563485145569, + "learning_rate": 3.7373657221010027e-06, + "loss": -0.0045, + "num_tokens": 276743020.0, + "reward": 5.889714241027832, + "reward_std": 0.8159486651420593, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.8584644794464111, + "rewards/ngram_similarity_reward/std": 0.18844884634017944, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 461.734375, + "completions/mean_terminated_length": 410.56451416015625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.7764600581785635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.17382760345935822, + "learning_rate": 3.735869545118053e-06, + "loss": -0.0272, + "num_tokens": 276867899.0, + "reward": 0.08718939870595932, + "reward_std": 0.5110728740692139, + "rewards/accuracy_reward/mean": -0.53125, + "rewards/accuracy_reward/std": 0.8351171612739563, + "rewards/ngram_similarity_reward/mean": 0.6184394359588623, + "rewards/ngram_similarity_reward/std": 0.1825799196958542, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 479.953125, + "completions/mean_terminated_length": 479.953125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7769075855896174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10287876427173615, + "learning_rate": 3.73437282842668e-06, + "loss": -0.0017, + "num_tokens": 277028088.0, + "reward": 4.718317985534668, + "reward_std": 0.22731512784957886, + "rewards/accuracy_reward/mean": 3.984375, + "rewards/accuracy_reward/std": 2.648702621459961, + "rewards/ngram_similarity_reward/mean": 0.733942449092865, + "rewards/ngram_similarity_reward/std": 0.34278374910354614, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 459.65625, + "completions/mean_terminated_length": 459.65625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.7773551130006713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09848344326019287, + "learning_rate": 3.7328755728462513e-06, + "loss": -0.0324, + "num_tokens": 277177938.0, + "reward": 4.4768171310424805, + "reward_std": 0.5262054800987244, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.5705671310424805, + "rewards/ngram_similarity_reward/std": 0.27129846811294556, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 380.203125, + "completions/mean_terminated_length": 380.203125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.7778026404117252, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08698117733001709, + "learning_rate": 3.731377779196431e-06, + "loss": -0.0085, + "num_tokens": 277319519.0, + "reward": 2.3189797401428223, + "reward_std": 0.8595125675201416, + "rewards/accuracy_reward/mean": 1.5625, + "rewards/accuracy_reward/std": 2.872281312942505, + "rewards/ngram_similarity_reward/mean": 0.7564799785614014, + "rewards/ngram_similarity_reward/std": 0.3434167206287384, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 562.765625, + "completions/mean_terminated_length": 436.8983154296875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7782501678227791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09498909115791321, + "learning_rate": 3.7298794482971773e-06, + "loss": -0.1533, + "num_tokens": 277471408.0, + "reward": 5.730620384216309, + "reward_std": 1.2200387716293335, + "rewards/accuracy_reward/mean": 4.9375, + "rewards/accuracy_reward/std": 1.7627090215682983, + "rewards/ngram_similarity_reward/mean": 0.7931205034255981, + "rewards/ngram_similarity_reward/std": 0.3747904598712921, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 603.5, + "completions/mean_terminated_length": 580.5714721679688, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.778697695233833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13159863650798798, + "learning_rate": 3.7283805809687427e-06, + "loss": -0.0029, + "num_tokens": 277598768.0, + "reward": 3.0115487575531006, + "reward_std": 0.5999206304550171, + "rewards/accuracy_reward/mean": 2.359375, + "rewards/accuracy_reward/std": 3.075077533721924, + "rewards/ngram_similarity_reward/mean": 0.652173638343811, + "rewards/ngram_similarity_reward/std": 0.17643523216247559, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 433.03125, + "completions/mean_terminated_length": 433.03125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.779145222644887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10145675390958786, + "learning_rate": 3.7268811780316726e-06, + "loss": 0.0091, + "num_tokens": 277718882.0, + "reward": 3.3468050956726074, + "reward_std": 0.6583366990089417, + "rewards/accuracy_reward/mean": 2.78125, + "rewards/accuracy_reward/std": 3.0103988647460938, + "rewards/ngram_similarity_reward/mean": 0.5655550956726074, + "rewards/ngram_similarity_reward/std": 0.34843239188194275, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 397.96875, + "completions/mean_terminated_length": 397.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.779592750055941, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10779356956481934, + "learning_rate": 3.725381240306807e-06, + "loss": -0.0318, + "num_tokens": 277857664.0, + "reward": 6.0483808517456055, + "reward_std": 0.10932482033967972, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.5483807921409607, + "rewards/ngram_similarity_reward/std": 0.19631454348564148, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1445.0, + "completions/max_terminated_length": 1445.0, + "completions/mean_length": 751.46875, + "completions/mean_terminated_length": 751.46875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.7800402774669949, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06561101973056793, + "learning_rate": 3.7238807686152773e-06, + "loss": -0.0637, + "num_tokens": 278055854.0, + "reward": 3.3921141624450684, + "reward_std": 1.6131346225738525, + "rewards/accuracy_reward/mean": 2.671875, + "rewards/accuracy_reward/std": 3.037097215652466, + "rewards/ngram_similarity_reward/mean": 0.7202394008636475, + "rewards/ngram_similarity_reward/std": 0.19713854789733887, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 411.828125, + "completions/mean_terminated_length": 411.828125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.7804878048780488, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.12284394353628159, + "learning_rate": 3.722379763778508e-06, + "loss": 0.0036, + "num_tokens": 278187603.0, + "reward": 5.143313407897949, + "reward_std": 1.265124797821045, + "rewards/accuracy_reward/mean": 4.28125, + "rewards/accuracy_reward/std": 2.4330317974090576, + "rewards/ngram_similarity_reward/mean": 0.8620636463165283, + "rewards/ngram_similarity_reward/std": 0.26387763023376465, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 501.3125, + "completions/mean_terminated_length": 501.3125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7809353322891027, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09614390879869461, + "learning_rate": 3.7208782266182153e-06, + "loss": -0.0188, + "num_tokens": 278337799.0, + "reward": 2.9313480854034424, + "reward_std": 0.04841892421245575, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.43134796619415283, + "rewards/ngram_similarity_reward/std": 0.1135719045996666, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 505.640625, + "completions/mean_terminated_length": 505.640625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.7813828597001566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08681753277778625, + "learning_rate": 3.7193761579564075e-06, + "loss": -0.0226, + "num_tokens": 278503488.0, + "reward": 3.15793514251709, + "reward_std": 0.09037813544273376, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6579351425170898, + "rewards/ngram_similarity_reward/std": 0.2862730324268341, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 459.90625, + "completions/mean_terminated_length": 459.90625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.7818303871112106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0824127346277237, + "learning_rate": 3.7178735586153817e-06, + "loss": 0.017, + "num_tokens": 278653306.0, + "reward": 4.810904502868652, + "reward_std": 0.09839123487472534, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8109046220779419, + "rewards/ngram_similarity_reward/std": 0.23212027549743652, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 476.0, + "completions/mean_terminated_length": 476.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7822779145222645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1105496883392334, + "learning_rate": 3.716370429417728e-06, + "loss": 0.02, + "num_tokens": 278806858.0, + "reward": 3.7836852073669434, + "reward_std": 0.7806867361068726, + "rewards/accuracy_reward/mean": 3.53125, + "rewards/accuracy_reward/std": 2.839454174041748, + "rewards/ngram_similarity_reward/mean": 0.252435564994812, + "rewards/ngram_similarity_reward/std": 0.16177693009376526, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 494.140625, + "completions/mean_terminated_length": 494.140625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.7827254419333184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14065444469451904, + "learning_rate": 3.7148667711863253e-06, + "loss": -0.0056, + "num_tokens": 279012995.0, + "reward": 1.6374659538269043, + "reward_std": 0.7110859155654907, + "rewards/accuracy_reward/mean": 1.140625, + "rewards/accuracy_reward/std": 2.7566208839416504, + "rewards/ngram_similarity_reward/mean": 0.4968408942222595, + "rewards/ngram_similarity_reward/std": 0.195643812417984, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 525.65625, + "completions/mean_terminated_length": 525.65625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.7831729693443723, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05567352473735809, + "learning_rate": 3.7133625847443426e-06, + "loss": -0.0103, + "num_tokens": 279174237.0, + "reward": 3.3230068683624268, + "reward_std": 0.04326403886079788, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.8230066895484924, + "rewards/ngram_similarity_reward/std": 0.23887582123279572, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 487.984375, + "completions/mean_terminated_length": 487.984375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.7836204967554262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09342952072620392, + "learning_rate": 3.711857870915237e-06, + "loss": 0.0294, + "num_tokens": 279361340.0, + "reward": 4.5902204513549805, + "reward_std": 1.5695034265518188, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7777203321456909, + "rewards/ngram_similarity_reward/std": 0.2926555573940277, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 410.03125, + "completions/mean_terminated_length": 384.0317687988281, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7840680241664802, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.16596359014511108, + "learning_rate": 3.7103526305227565e-06, + "loss": -0.0127, + "num_tokens": 279463742.0, + "reward": 5.742364883422852, + "reward_std": 0.785595178604126, + "rewards/accuracy_reward/mean": 5.03125, + "rewards/accuracy_reward/std": 1.6229382753372192, + "rewards/ngram_similarity_reward/mean": 0.711115300655365, + "rewards/ngram_similarity_reward/std": 0.48928964138031006, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 452.484375, + "completions/mean_terminated_length": 452.484375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.7845155515775342, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11051739007234573, + "learning_rate": 3.7088468643909346e-06, + "loss": 0.0023, + "num_tokens": 279602941.0, + "reward": 5.040217876434326, + "reward_std": 0.06647270172834396, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 1.0402178764343262, + "rewards/ngram_similarity_reward/std": 0.1375802755355835, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 549.875, + "completions/mean_terminated_length": 549.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.7849630789885881, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07461415231227875, + "learning_rate": 3.7073405733440955e-06, + "loss": -0.0009, + "num_tokens": 279748853.0, + "reward": 4.671903610229492, + "reward_std": 0.1308913230895996, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.671903669834137, + "rewards/ngram_similarity_reward/std": 0.26790651679039, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 370.5, + "completions/mean_terminated_length": 370.5, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.785410606399642, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09640100598335266, + "learning_rate": 3.7058337582068476e-06, + "loss": -0.0201, + "num_tokens": 279904661.0, + "reward": 3.4819796085357666, + "reward_std": 0.39395928382873535, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8882298469543457, + "rewards/ngram_similarity_reward/std": 0.42067092657089233, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 409.375, + "completions/mean_terminated_length": 409.375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.7858581338106959, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.14509481191635132, + "learning_rate": 3.7043264198040897e-06, + "loss": 0.0049, + "num_tokens": 280145949.0, + "reward": 2.6730215549468994, + "reward_std": 0.7071306705474854, + "rewards/accuracy_reward/mean": 2.109375, + "rewards/accuracy_reward/std": 3.0164480209350586, + "rewards/ngram_similarity_reward/mean": 0.5636464357376099, + "rewards/ngram_similarity_reward/std": 0.2723451256752014, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 515.6875, + "completions/mean_terminated_length": 515.6875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.7863056612217498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09780410677194595, + "learning_rate": 3.7028185589610035e-06, + "loss": 0.0056, + "num_tokens": 280297049.0, + "reward": 6.146766185760498, + "reward_std": 0.6536001563072205, + "rewards/accuracy_reward/mean": 5.3125, + "rewards/accuracy_reward/std": 1.0522085428237915, + "rewards/ngram_similarity_reward/mean": 0.834265947341919, + "rewards/ngram_similarity_reward/std": 0.2860173285007477, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 439.15625, + "completions/mean_terminated_length": 439.15625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7867531886328037, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10675951093435287, + "learning_rate": 3.7013101765030597e-06, + "loss": 0.0213, + "num_tokens": 280432499.0, + "reward": 4.9260478019714355, + "reward_std": 0.06432859599590302, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.9260480403900146, + "rewards/ngram_similarity_reward/std": 0.18244914710521698, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 458.71875, + "completions/mean_terminated_length": 433.4920959472656, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.7872007160438577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.1172085776925087, + "learning_rate": 3.6998012732560127e-06, + "loss": -0.0153, + "num_tokens": 280644529.0, + "reward": 2.6222519874572754, + "reward_std": 0.7941980957984924, + "rewards/accuracy_reward/mean": 2.046875, + "rewards/accuracy_reward/std": 3.080557107925415, + "rewards/ngram_similarity_reward/mean": 0.5753771066665649, + "rewards/ngram_similarity_reward/std": 0.2146225869655609, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 444.5, + "completions/mean_terminated_length": 444.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7876482434549116, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08716736733913422, + "learning_rate": 3.698291850045902e-06, + "loss": 0.0079, + "num_tokens": 280783713.0, + "reward": 3.271043539047241, + "reward_std": 0.10971924662590027, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.7710434198379517, + "rewards/ngram_similarity_reward/std": 0.2659313380718231, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 515.5, + "completions/mean_terminated_length": 515.5, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7880957708659655, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07007109373807907, + "learning_rate": 3.6967819076990546e-06, + "loss": -0.0025, + "num_tokens": 280967249.0, + "reward": 3.259934425354004, + "reward_std": 0.5627215504646301, + "rewards/accuracy_reward/mean": 2.6875, + "rewards/accuracy_reward/std": 3.0178043842315674, + "rewards/ngram_similarity_reward/mean": 0.5724344253540039, + "rewards/ngram_similarity_reward/std": 0.5026712417602539, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 453.109375, + "completions/mean_terminated_length": 453.109375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.7885432982770195, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0714978277683258, + "learning_rate": 3.695271447042077e-06, + "loss": -0.0219, + "num_tokens": 281114312.0, + "reward": 4.756280422210693, + "reward_std": 0.062186598777770996, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7562806606292725, + "rewards/ngram_similarity_reward/std": 0.20702359080314636, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 509.125, + "completions/mean_terminated_length": 509.125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.7889908256880734, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06607312709093094, + "learning_rate": 3.6937604689018634e-06, + "loss": -0.0346, + "num_tokens": 281232960.0, + "reward": 6.391016960144043, + "reward_std": 0.08699575811624527, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.8910167813301086, + "rewards/ngram_similarity_reward/std": 0.2577274441719055, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 460.875, + "completions/mean_terminated_length": 460.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7894383530991274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06838354468345642, + "learning_rate": 3.692248974105589e-06, + "loss": 0.0448, + "num_tokens": 281375896.0, + "reward": 3.369481086730957, + "reward_std": 0.39635372161865234, + "rewards/accuracy_reward/mean": 2.578125, + "rewards/accuracy_reward/std": 3.0410144329071045, + "rewards/ngram_similarity_reward/mean": 0.791356086730957, + "rewards/ngram_similarity_reward/std": 0.30302315950393677, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 514.453125, + "completions/mean_terminated_length": 514.453125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7898858805101813, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08800314366817474, + "learning_rate": 3.6907369634807132e-06, + "loss": -0.0067, + "num_tokens": 281549157.0, + "reward": 5.869156837463379, + "reward_std": 0.7548459768295288, + "rewards/accuracy_reward/mean": 5.21875, + "rewards/accuracy_reward/std": 1.2782522439956665, + "rewards/ngram_similarity_reward/mean": 0.6504068374633789, + "rewards/ngram_similarity_reward/std": 0.30050766468048096, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 438.5625, + "completions/mean_terminated_length": 438.5625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7903334079212352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.090070940554142, + "learning_rate": 3.6892244378549775e-06, + "loss": -0.0097, + "num_tokens": 281669769.0, + "reward": 4.827174663543701, + "reward_std": 0.10513508319854736, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.8271746635437012, + "rewards/ngram_similarity_reward/std": 0.24546904861927032, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 443.4375, + "completions/mean_terminated_length": 443.4375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7907809353322891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.103789322078228, + "learning_rate": 3.687711398056404e-06, + "loss": 0.0321, + "num_tokens": 281818997.0, + "reward": 3.0784761905670166, + "reward_std": 2.0790152549743652, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.5784761309623718, + "rewards/ngram_similarity_reward/std": 0.26098406314849854, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 445.875, + "completions/mean_terminated_length": 445.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.791228462743343, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10282961279153824, + "learning_rate": 3.6861978449132974e-06, + "loss": 0.0062, + "num_tokens": 281959437.0, + "reward": 1.7224863767623901, + "reward_std": 0.16980081796646118, + "rewards/accuracy_reward/mean": 0.90625, + "rewards/accuracy_reward/std": 2.688710927963257, + "rewards/ngram_similarity_reward/mean": 0.8162364363670349, + "rewards/ngram_similarity_reward/std": 0.3230360150337219, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 445.015625, + "completions/mean_terminated_length": 445.015625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.7916759901543969, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09563589841127396, + "learning_rate": 3.684683779254245e-06, + "loss": -0.0191, + "num_tokens": 282092974.0, + "reward": 6.333760738372803, + "reward_std": 0.41795966029167175, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.9275108575820923, + "rewards/ngram_similarity_reward/std": 0.22054578363895416, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 379.234375, + "completions/mean_terminated_length": 379.234375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7921235175654509, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13181842863559723, + "learning_rate": 3.6831692019081118e-06, + "loss": 0.0045, + "num_tokens": 282212301.0, + "reward": 6.2495317459106445, + "reward_std": 0.08606675267219543, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.749532163143158, + "rewards/ngram_similarity_reward/std": 0.26874497532844543, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 554.671875, + "completions/mean_terminated_length": 554.671875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.7925710449765048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07528512924909592, + "learning_rate": 3.681654113704044e-06, + "loss": -0.023, + "num_tokens": 282384152.0, + "reward": 4.789369583129883, + "reward_std": 0.1011795625090599, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7893695831298828, + "rewards/ngram_similarity_reward/std": 0.25145819783210754, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 420.796875, + "completions/mean_terminated_length": 420.796875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.7930185723875587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10464499145746231, + "learning_rate": 3.6801385154714676e-06, + "loss": -0.0179, + "num_tokens": 282589547.0, + "reward": 1.6843279600143433, + "reward_std": 0.05340495705604553, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.6843281388282776, + "rewards/ngram_similarity_reward/std": 0.13969381153583527, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 462.640625, + "completions/mean_terminated_length": 462.640625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7934660997986127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09221204370260239, + "learning_rate": 3.6786224080400886e-06, + "loss": 0.011, + "num_tokens": 282751956.0, + "reward": 4.661773681640625, + "reward_std": 0.5073695778846741, + "rewards/accuracy_reward/mean": 3.890625, + "rewards/accuracy_reward/std": 2.6998953819274902, + "rewards/ngram_similarity_reward/mean": 0.7711489200592041, + "rewards/ngram_similarity_reward/std": 0.41562578082084656, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 413.140625, + "completions/mean_terminated_length": 413.140625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.7939136272096666, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11431113630533218, + "learning_rate": 3.6771057922398905e-06, + "loss": 0.0065, + "num_tokens": 282903709.0, + "reward": 2.763495922088623, + "reward_std": 1.5588185787200928, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.6384957432746887, + "rewards/ngram_similarity_reward/std": 0.19880542159080505, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 438.359375, + "completions/mean_terminated_length": 438.359375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.7943611546207205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10721493512392044, + "learning_rate": 3.6755886689011355e-06, + "loss": -0.0138, + "num_tokens": 283071956.0, + "reward": 1.9768624305725098, + "reward_std": 1.5460984706878662, + "rewards/accuracy_reward/mean": 1.1875, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.7893625497817993, + "rewards/ngram_similarity_reward/std": 0.27228429913520813, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 460.96875, + "completions/mean_terminated_length": 435.7778015136719, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7948086820317745, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08337391912937164, + "learning_rate": 3.674071038854364e-06, + "loss": -0.0193, + "num_tokens": 283212034.0, + "reward": 4.8831915855407715, + "reward_std": 0.09472521394491196, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.883191704750061, + "rewards/ngram_similarity_reward/std": 0.2588989734649658, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 402.265625, + "completions/mean_terminated_length": 402.265625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.7952562094428284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13653582334518433, + "learning_rate": 3.672552902930394e-06, + "loss": 0.0153, + "num_tokens": 283378675.0, + "reward": 4.7928595542907715, + "reward_std": 0.11162438988685608, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.7928594350814819, + "rewards/ngram_similarity_reward/std": 0.2462923675775528, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 492.921875, + "completions/mean_terminated_length": 468.2381286621094, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.7957037368538823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10597686469554901, + "learning_rate": 3.671034261960319e-06, + "loss": 0.0321, + "num_tokens": 283553006.0, + "reward": 2.369813919067383, + "reward_std": 1.3356808423995972, + "rewards/accuracy_reward/mean": 1.65625, + "rewards/accuracy_reward/std": 2.9016621112823486, + "rewards/ngram_similarity_reward/mean": 0.7135640382766724, + "rewards/ngram_similarity_reward/std": 0.18256360292434692, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 416.734375, + "completions/mean_terminated_length": 390.8412780761719, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7961512642649362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12527470290660858, + "learning_rate": 3.669515116775511e-06, + "loss": 0.0801, + "num_tokens": 283715805.0, + "reward": 2.960556745529175, + "reward_std": 0.07754544168710709, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.46055683493614197, + "rewards/ngram_similarity_reward/std": 0.23064681887626648, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 547.859375, + "completions/mean_terminated_length": 547.859375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7965987916759901, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0594840906560421, + "learning_rate": 3.6679954682076158e-06, + "loss": -0.0067, + "num_tokens": 283880004.0, + "reward": 6.205974578857422, + "reward_std": 0.06984958052635193, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7059743404388428, + "rewards/ngram_similarity_reward/std": 0.29315268993377686, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 378.46875, + "completions/mean_terminated_length": 378.46875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.797046319087044, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.11517168581485748, + "learning_rate": 3.6664753170885574e-06, + "loss": -0.0129, + "num_tokens": 284110754.0, + "reward": 4.98866081237793, + "reward_std": 0.08581934124231339, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.988660991191864, + "rewards/ngram_similarity_reward/std": 0.23940570652484894, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 388.03125, + "completions/mean_terminated_length": 388.03125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.797493846498098, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.10036811977624893, + "learning_rate": 3.6649546642505324e-06, + "loss": 0.0273, + "num_tokens": 284260772.0, + "reward": 4.801386833190918, + "reward_std": 0.18152017891407013, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.801386833190918, + "rewards/ngram_similarity_reward/std": 0.3517768681049347, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 538.703125, + "completions/mean_terminated_length": 538.703125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.797941373909152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08701319992542267, + "learning_rate": 3.663433510526014e-06, + "loss": -0.0013, + "num_tokens": 284418113.0, + "reward": 2.274028778076172, + "reward_std": 0.8565681576728821, + "rewards/accuracy_reward/mean": 1.9375, + "rewards/accuracy_reward/std": 2.9700891971588135, + "rewards/ngram_similarity_reward/mean": 0.33652883768081665, + "rewards/ngram_similarity_reward/std": 0.21484392881393433, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 509.15625, + "completions/mean_terminated_length": 509.15625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7983889013202059, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1106950119137764, + "learning_rate": 3.6619118567477474e-06, + "loss": -0.0118, + "num_tokens": 284573179.0, + "reward": 5.514496326446533, + "reward_std": 0.9562253952026367, + "rewards/accuracy_reward/mean": 4.65625, + "rewards/accuracy_reward/std": 2.102294683456421, + "rewards/ngram_similarity_reward/mean": 0.8582462668418884, + "rewards/ngram_similarity_reward/std": 0.21829448640346527, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 468.109375, + "completions/mean_terminated_length": 468.109375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7988364287312598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0942654237151146, + "learning_rate": 3.660389703748754e-06, + "loss": 0.0083, + "num_tokens": 284724146.0, + "reward": 6.243251800537109, + "reward_std": 0.4922480583190918, + "rewards/accuracy_reward/mean": 5.40625, + "rewards/accuracy_reward/std": 0.7500000596046448, + "rewards/ngram_similarity_reward/mean": 0.8370020389556885, + "rewards/ngram_similarity_reward/std": 0.14942000806331635, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 394.921875, + "completions/mean_terminated_length": 394.921875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.7992839561423137, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09098318219184875, + "learning_rate": 3.658867052362328e-06, + "loss": 0.0327, + "num_tokens": 284875453.0, + "reward": 5.2692790031433105, + "reward_std": 0.8968341946601868, + "rewards/accuracy_reward/mean": 4.75, + "rewards/accuracy_reward/std": 2.0, + "rewards/ngram_similarity_reward/mean": 0.5192788243293762, + "rewards/ngram_similarity_reward/std": 0.296322762966156, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 508.359375, + "completions/mean_terminated_length": 458.69354248046875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.7997314835533677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11340255290269852, + "learning_rate": 3.6573439034220336e-06, + "loss": -0.0219, + "num_tokens": 285025012.0, + "reward": 4.652604579925537, + "reward_std": 0.4156516492366791, + "rewards/accuracy_reward/mean": 3.90625, + "rewards/accuracy_reward/std": 2.6709415912628174, + "rewards/ngram_similarity_reward/mean": 0.7463546991348267, + "rewards/ngram_similarity_reward/std": 0.3110392391681671, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 497.953125, + "completions/mean_terminated_length": 497.953125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.8001790109644216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1081513836979866, + "learning_rate": 3.6558202577617125e-06, + "loss": 0.0199, + "num_tokens": 285132033.0, + "reward": 3.0789520740509033, + "reward_std": 0.4339574873447418, + "rewards/accuracy_reward/mean": 2.40625, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.6727021932601929, + "rewards/ngram_similarity_reward/std": 0.37716013193130493, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 557.921875, + "completions/mean_terminated_length": 534.2698974609375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.8006265383754755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09166616201400757, + "learning_rate": 3.654296116215473e-06, + "loss": -0.0014, + "num_tokens": 285305788.0, + "reward": 4.567384719848633, + "reward_std": 0.7663179636001587, + "rewards/accuracy_reward/mean": 3.625, + "rewards/accuracy_reward/std": 2.8030595779418945, + "rewards/ngram_similarity_reward/mean": 0.9423847198486328, + "rewards/ngram_similarity_reward/std": 0.2744849920272827, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 440.828125, + "completions/mean_terminated_length": 440.828125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.8010740657865294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13537095487117767, + "learning_rate": 3.6527714796176996e-06, + "loss": -0.0083, + "num_tokens": 285402305.0, + "reward": 4.586366653442383, + "reward_std": 0.16081549227237701, + "rewards/accuracy_reward/mean": 4.0, + "rewards/accuracy_reward/std": 2.618614673614502, + "rewards/ngram_similarity_reward/mean": 0.5863662958145142, + "rewards/ngram_similarity_reward/std": 0.267704576253891, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 392.03125, + "completions/mean_terminated_length": 392.03125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8015215931975833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11276783049106598, + "learning_rate": 3.6512463488030443e-06, + "loss": 0.0058, + "num_tokens": 285537891.0, + "reward": 3.4525465965270996, + "reward_std": 0.44818663597106934, + "rewards/accuracy_reward/mean": 2.59375, + "rewards/accuracy_reward/std": 3.0222392082214355, + "rewards/ngram_similarity_reward/mean": 0.8587964177131653, + "rewards/ngram_similarity_reward/std": 0.2226724773645401, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 478.5625, + "completions/mean_terminated_length": 478.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.8019691206086372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.115529865026474, + "learning_rate": 3.6497207246064296e-06, + "loss": 0.0023, + "num_tokens": 285670903.0, + "reward": 2.5690736770629883, + "reward_std": 0.7571008801460266, + "rewards/accuracy_reward/mean": 2.125, + "rewards/accuracy_reward/std": 3.000000238418579, + "rewards/ngram_similarity_reward/mean": 0.44407370686531067, + "rewards/ngram_similarity_reward/std": 0.3232196569442749, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 557.09375, + "completions/mean_terminated_length": 557.09375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.8024166480196913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08564143627882004, + "learning_rate": 3.648194607863052e-06, + "loss": 0.0524, + "num_tokens": 285806157.0, + "reward": 4.387675762176514, + "reward_std": 0.807447075843811, + "rewards/accuracy_reward/mean": 3.8125, + "rewards/accuracy_reward/std": 2.7189810276031494, + "rewards/ngram_similarity_reward/mean": 0.5751754641532898, + "rewards/ngram_similarity_reward/std": 0.21848054230213165, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 453.515625, + "completions/mean_terminated_length": 453.515625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.8028641754307452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14359252154827118, + "learning_rate": 3.646667999408373e-06, + "loss": -0.0296, + "num_tokens": 285955630.0, + "reward": 4.477880477905273, + "reward_std": 1.0613147020339966, + "rewards/accuracy_reward/mean": 3.703125, + "rewards/accuracy_reward/std": 2.789889335632324, + "rewards/ngram_similarity_reward/mean": 0.7747553586959839, + "rewards/ngram_similarity_reward/std": 0.289902001619339, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 519.09375, + "completions/mean_terminated_length": 519.09375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.8033117028417991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11683569103479385, + "learning_rate": 3.6451409000781263e-06, + "loss": 0.0152, + "num_tokens": 286067060.0, + "reward": 5.668905735015869, + "reward_std": 0.8359593152999878, + "rewards/accuracy_reward/mean": 5.171875, + "rewards/accuracy_reward/std": 1.491294264793396, + "rewards/ngram_similarity_reward/mean": 0.4970306158065796, + "rewards/ngram_similarity_reward/std": 0.3316640555858612, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 526.0, + "completions/mean_terminated_length": 451.14752197265625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.803759230252853, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1556176394224167, + "learning_rate": 3.643613310708314e-06, + "loss": 0.0674, + "num_tokens": 286204356.0, + "reward": 6.280373573303223, + "reward_std": 0.08985870331525803, + "rewards/accuracy_reward/mean": 5.5, + "rewards/accuracy_reward/std": 0.0, + "rewards/ngram_similarity_reward/mean": 0.7803735733032227, + "rewards/ngram_similarity_reward/std": 0.33898887038230896, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 492.515625, + "completions/mean_terminated_length": 492.515625, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.8042067576639069, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11379936337471008, + "learning_rate": 3.642085232135204e-06, + "loss": -0.022, + "num_tokens": 286339941.0, + "reward": 5.672510147094727, + "reward_std": 1.0292677879333496, + "rewards/accuracy_reward/mean": 5.1875, + "rewards/accuracy_reward/std": 1.42400062084198, + "rewards/ngram_similarity_reward/mean": 0.48501038551330566, + "rewards/ngram_similarity_reward/std": 0.2070183902978897, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 507.125, + "completions/mean_terminated_length": 507.125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.8046542850749608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1011330708861351, + "learning_rate": 3.640556665195335e-06, + "loss": 0.0084, + "num_tokens": 286502445.0, + "reward": 0.8693095445632935, + "reward_std": 0.9130166172981262, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 1.8970921039581299, + "rewards/ngram_similarity_reward/mean": 0.7286845445632935, + "rewards/ngram_similarity_reward/std": 0.25176477432250977, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 464.796875, + "completions/mean_terminated_length": 439.66668701171875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8051018124860148, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.09837361425161362, + "learning_rate": 3.639027610725512e-06, + "loss": -0.0147, + "num_tokens": 286651008.0, + "reward": 3.1708290576934814, + "reward_std": 0.09569500386714935, + "rewards/accuracy_reward/mean": 2.5, + "rewards/accuracy_reward/std": 3.0237157344818115, + "rewards/ngram_similarity_reward/mean": 0.6708289980888367, + "rewards/ngram_similarity_reward/std": 0.3423401713371277, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 501.6875, + "completions/mean_terminated_length": 501.6875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.8055493398970687, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.07266926765441895, + "learning_rate": 3.6374980695628064e-06, + "loss": 0.0053, + "num_tokens": 286785260.0, + "reward": 5.345115661621094, + "reward_std": 0.8422203063964844, + "rewards/accuracy_reward/mean": 4.5625, + "rewards/accuracy_reward/std": 2.195775270462036, + "rewards/ngram_similarity_reward/mean": 0.7826155424118042, + "rewards/ngram_similarity_reward/std": 0.24295289814472198, + "step": 1800 + } + ], + "logging_steps": 1, + "max_steps": 4470, + "num_input_tokens_seen": 286785260, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}